From 43f8c9abf53533df92072bb604a837111acf9777 Mon Sep 17 00:00:00 2001
From: sonofmun <mmunson@gcdh.de>
Date: Tue, 20 Feb 2018 16:46:42 +0100
Subject: [PATCH 1/2] Added multiprocessing for extraction of plain text

---
 HookTest/build.py | 40 ++++++++++++++++++++++++++--------------
 HookTest/cmd.py   |  5 +++++
 2 files changed, 31 insertions(+), 14 deletions(-)

diff --git a/HookTest/build.py b/HookTest/build.py
index 5656355..0eaa1f2 100644
--- a/HookTest/build.py
+++ b/HookTest/build.py
@@ -7,6 +7,7 @@
 from MyCapytain.resources.texts.local.capitains.cts import CapitainsCtsText
 from MyCapytain.common.constants import Mimetypes
 from lxml import etree
+from multiprocessing.pool import Pool
 
 
 class Build(object):
@@ -24,7 +25,7 @@ class Build(object):
     :type cites: bool
     """
 
-    def __init__(self, path, dest, tar=False, txt=False, cites=False):
+    def __init__(self, path, dest, tar=False, txt=False, cites=False, workers=3):
         """
 
         :param path: the path to the directory that contains the corpus's data directory
@@ -37,6 +38,8 @@ def __init__(self, path, dest, tar=False, txt=False, cites=False):
         :type txt: bool
         :param cites: whether to include the citation string for each of the lowest level citation elements
         :type cites: bool
+        :param workers: the number of processes to use in building plain text
+        :type workers: int
         """
 
         if path.endswith('/'):
@@ -50,6 +53,7 @@ def __init__(self, path, dest, tar=False, txt=False, cites=False):
         self.tar = tar
         self.txt = txt
         self.cites = cites
+        self.workers = workers
 
     def repo_file_list(self):
         """ Build the list of XML files for the source repo represented by self.path
@@ -104,18 +108,26 @@ def plain_text(self):
         passing_texts = [x for x in glob('{}data/*/*/*.xml'.format(self.dest)) if '__cts__' not in x]
         sys.stdout.write('Extracting Text.\n')
         sys.stdout.flush()
-        for text in passing_texts:
-            sys.stdout.write('.')
-            sys.stdout.flush()
-            interactive_text = CapitainsCtsText(resource=etree.parse(text).getroot())
-            reffs = interactive_text.getReffs(level=len(interactive_text.citation))
-            passages = [interactive_text.getTextualNode(passage) for passage in reffs]
-            plaintext = [r.export(Mimetypes.PLAINTEXT, exclude=["tei:note"]).strip() for r in passages]
-            if self.cites is True:
-                for i, t in enumerate(plaintext):
-                    plaintext[i] = '#' + reffs[i] + '#\n' + t
-            with open('{}text/{}.txt'.format(self.dest, text.split('/')[-1].replace('.xml', '')), mode='w') as f:
-                f.write('\n\n'.join(plaintext))
+        with Pool(processes=self.workers) as executor:
+            # Submit every text to the pool; imap_unordered yields results in completion order
+            for _ in executor.imap_unordered(self.build_texts, [text for text in passing_texts]):
+                sys.stdout.write('.')
+                sys.stdout.flush()
+
+            # Explicit close()/join() is required for coverage (presumably so worker data is collected)
+            executor.close()
+            executor.join()
+
+    def build_texts(self, text):
+        interactive_text = CapitainsCtsText(resource=etree.parse(text).getroot())
+        reffs = interactive_text.getReffs(level=len(interactive_text.citation))
+        passages = [interactive_text.getTextualNode(passage) for passage in reffs]
+        plaintext = [r.export(Mimetypes.PLAINTEXT, exclude=["tei:note"]).strip() for r in passages]
+        if self.cites is True:
+            for i, t in enumerate(plaintext):
+                plaintext[i] = '#' + reffs[i] + '#\n' + t
+        with open('{}text/{}.txt'.format(self.dest, text.split('/')[-1].replace('.xml', '')), mode='w') as f:
+            f.write('\n\n'.join(plaintext))
 
     def run(self):
         """ creates a new corpus directory containing only the passing text files and their metadata files
@@ -160,7 +172,7 @@ def cmd(**kwargs):
     """
     if kwargs['travis'] is True:
         status, message = Travis(path=kwargs['path'], dest=kwargs['dest'], tar=kwargs['tar'],
-                                 txt=kwargs['txt'], cites=kwargs['cites']).run()
+                                 txt=kwargs['txt'], cites=kwargs['cites'], workers=int(kwargs['workers'])).run()
         return status, message
     else:
         return False, 'You cannot run build on the base class'
\ No newline at end of file
diff --git a/HookTest/cmd.py b/HookTest/cmd.py
index dad5aab..2eda796 100644
--- a/HookTest/cmd.py
+++ b/HookTest/cmd.py
@@ -119,6 +119,11 @@ def parse_args_build(args):
     parser.add_argument("--txt", help="Extract plain text files from the XML files", action="store_true", default=False)
     parser.add_argument("--cites", help="Include citation for each passage in the plain text files",
                         action="store_true", default=False)
+    parser.add_argument(
+        "--workers",
+        help="The number of processes to use for extracting plain text.",
+        default=3
+    )
 
     args = parser.parse_args(args)
     return args

From 0e659e257c81a5b3a6ffefd4cbe61eb97772d6a1 Mon Sep 17 00:00:00 2001
From: sonofmun <mmunson@gcdh.de>
Date: Wed, 21 Feb 2018 09:38:43 +0100
Subject: [PATCH 2/2] Updated clone_farsiLit test

The files that used to fail have since been fixed, so the test, which still expected them to fail, was itself failing.
---
 tests/test_run.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_run.py b/tests/test_run.py
index ca5eebd..41e0c13 100644
--- a/tests/test_run.py
+++ b/tests/test_run.py
@@ -442,7 +442,7 @@ def test_run_clone_farsiLit(self):
             "--console", "inline", "--verbose", "--scheme", "epidoc"
         ])
         self.assertIn(
-            ">>> [failed] 3 out of 5 files did not pass the tests\n", logs,
+            ">>> [success] 0 out of 5 files did not pass the tests\n", logs,
             "Test conclusion should be printed"
         )
         self.assertSubset(