From 43f8c9abf53533df92072bb604a837111acf9777 Mon Sep 17 00:00:00 2001 From: sonofmun Date: Tue, 20 Feb 2018 16:46:42 +0100 Subject: [PATCH 1/2] Added multiprocessing for extraction of plain text --- HookTest/build.py | 40 ++++++++++++++++++++++++++-------------- HookTest/cmd.py | 5 +++++ 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/HookTest/build.py b/HookTest/build.py index 5656355..0eaa1f2 100644 --- a/HookTest/build.py +++ b/HookTest/build.py @@ -7,6 +7,7 @@ from MyCapytain.resources.texts.local.capitains.cts import CapitainsCtsText from MyCapytain.common.constants import Mimetypes from lxml import etree +from multiprocessing.pool import Pool class Build(object): @@ -24,7 +25,7 @@ class Build(object): :type cites: bool """ - def __init__(self, path, dest, tar=False, txt=False, cites=False): + def __init__(self, path, dest, tar=False, txt=False, cites=False, workers=3): """ :param path: the path to the directory that contains the corpus's data directory @@ -37,6 +38,8 @@ def __init__(self, path, dest, tar=False, txt=False, cites=False): :type txt: bool :param cites: whether to include the citation string for each of the lowest level citation elements :type cites: bool + :param workers: the number of processes to use in building plain text + :type workers: int """ if path.endswith('/'): @@ -50,6 +53,7 @@ def __init__(self, path, dest, tar=False, txt=False, cites=False): self.tar = tar self.txt = txt self.cites = cites + self.workers = workers def repo_file_list(self): """ Build the list of XML files for the source repo represented by self.path @@ -104,18 +108,26 @@ def plain_text(self): passing_texts = [x for x in glob('{}data/*/*/*.xml'.format(self.dest)) if '__cts__' not in x] sys.stdout.write('Extracting Text.\n') sys.stdout.flush() - for text in passing_texts: - sys.stdout.write('.') - sys.stdout.flush() - interactive_text = CapitainsCtsText(resource=etree.parse(text).getroot()) - reffs = 
interactive_text.getReffs(level=len(interactive_text.citation)) - passages = [interactive_text.getTextualNode(passage) for passage in reffs] - plaintext = [r.export(Mimetypes.PLAINTEXT, exclude=["tei:note"]).strip() for r in passages] - if self.cites is True: - for i, t in enumerate(plaintext): - plaintext[i] = '#' + reffs[i] + '#\n' + t - with open('{}text/{}.txt'.format(self.dest, text.split('/')[-1].replace('.xml', '')), mode='w') as f: - f.write('\n\n'.join(plaintext)) + with Pool(processes=self.workers) as executor: + # Dispatch every text to the pool; imap_unordered yields results in completion order, not input order + for _ in executor.imap_unordered(self.build_texts, [text for text in passing_texts]): + sys.stdout.write('.') + sys.stdout.flush() + + # Required for coverage + executor.close() + executor.join() + + def build_texts(self, text): + interactive_text = CapitainsCtsText(resource=etree.parse(text).getroot()) + reffs = interactive_text.getReffs(level=len(interactive_text.citation)) + passages = [interactive_text.getTextualNode(passage) for passage in reffs] + plaintext = [r.export(Mimetypes.PLAINTEXT, exclude=["tei:note"]).strip() for r in passages] + if self.cites is True: + for i, t in enumerate(plaintext): + plaintext[i] = '#' + reffs[i] + '#\n' + t + with open('{}text/{}.txt'.format(self.dest, text.split('/')[-1].replace('.xml', '')), mode='w') as f: + f.write('\n\n'.join(plaintext)) def run(self): """ creates a new corpus directory containing only the passing text files and their metadata files @@ -160,7 +172,7 @@ def cmd(**kwargs): """ if kwargs['travis'] is True: status, message = Travis(path=kwargs['path'], dest=kwargs['dest'], tar=kwargs['tar'], - txt=kwargs['txt'], cites=kwargs['cites']).run() + txt=kwargs['txt'], cites=kwargs['cites'], workers=int(kwargs['workers'])).run() return status, message else: return False, 'You cannot run build on the base class' \ No newline at end of file diff --git a/HookTest/cmd.py b/HookTest/cmd.py index dad5aab..2eda796 100644 --- a/HookTest/cmd.py +++ b/HookTest/cmd.py 
@@ -119,6 +119,11 @@ def parse_args_build(args): parser.add_argument("--txt", help="Extract plain text files from the XML files", action="store_true", default=False) parser.add_argument("--cites", help="Include citation for each passage in the plain text files", action="store_true", default=False) + parser.add_argument( + "--workers", + help="The number of processes to use for extracting plain text.", + default=3 + ) args = parser.parse_args(args) return args From 0e659e257c81a5b3a6ffefd4cbe61eb97772d6a1 Mon Sep 17 00:00:00 2001 From: sonofmun Date: Wed, 21 Feb 2018 09:38:43 +0100 Subject: [PATCH 2/2] Updated clone_farsiLit test The failing files had been fixed and so the test, which assumed failing files, failed --- tests/test_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_run.py b/tests/test_run.py index ca5eebd..41e0c13 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -442,7 +442,7 @@ def test_run_clone_farsiLit(self): "--console", "inline", "--verbose", "--scheme", "epidoc" ]) self.assertIn( - ">>> [failed] 3 out of 5 files did not pass the tests\n", logs, + ">>> [success] 0 out of 5 files did not pass the tests\n", logs, "Test conclusion should be printed" ) self.assertSubset(