Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added multiprocessing for extraction of plain text #115

Merged
merged 2 commits into from
Mar 2, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 26 additions & 14 deletions HookTest/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from MyCapytain.resources.texts.local.capitains.cts import CapitainsCtsText
from MyCapytain.common.constants import Mimetypes
from lxml import etree
from multiprocessing.pool import Pool


class Build(object):
Expand All @@ -24,7 +25,7 @@ class Build(object):
:type cites: bool
"""

def __init__(self, path, dest, tar=False, txt=False, cites=False):
def __init__(self, path, dest, tar=False, txt=False, cites=False, workers=3):
"""

:param path: the path to the directory that contains the corpus's data directory
Expand All @@ -37,6 +38,8 @@ def __init__(self, path, dest, tar=False, txt=False, cites=False):
:type txt: bool
:param cites: whether to include the citation string for each of the lowest level citation elements
:type cites: bool
:param workers: the number of processes to use in building plain text
:type workers: int
"""

if path.endswith('/'):
Expand All @@ -50,6 +53,7 @@ def __init__(self, path, dest, tar=False, txt=False, cites=False):
self.tar = tar
self.txt = txt
self.cites = cites
self.workers = workers

def repo_file_list(self):
""" Build the list of XML files for the source repo represented by self.path
Expand Down Expand Up @@ -104,18 +108,26 @@ def plain_text(self):
passing_texts = [x for x in glob('{}data/*/*/*.xml'.format(self.dest)) if '__cts__' not in x]
sys.stdout.write('Extracting Text.\n')
sys.stdout.flush()
for text in passing_texts:
sys.stdout.write('.')
sys.stdout.flush()
interactive_text = CapitainsCtsText(resource=etree.parse(text).getroot())
reffs = interactive_text.getReffs(level=len(interactive_text.citation))
passages = [interactive_text.getTextualNode(passage) for passage in reffs]
plaintext = [r.export(Mimetypes.PLAINTEXT, exclude=["tei:note"]).strip() for r in passages]
if self.cites is True:
for i, t in enumerate(plaintext):
plaintext[i] = '#' + reffs[i] + '#\n' + t
with open('{}text/{}.txt'.format(self.dest, text.split('/')[-1].replace('.xml', '')), mode='w') as f:
f.write('\n\n'.join(plaintext))
with Pool(processes=self.workers) as executor:
# Send the tasks in order to the pool
for _ in executor.imap_unordered(self.build_texts, [text for text in passing_texts]):
sys.stdout.write('.')
sys.stdout.flush()

# Required for coverage
executor.close()
executor.join()

def build_texts(self, text):
    """ Extract the plain text of a single XML file and write it to the text directory

    The file is parsed into a CapitainsCtsText, every reference returned by
    ``getReffs`` at the level equal to the length of the text's citation
    scheme is exported as plain text (``tei:note`` elements excluded), and the
    passages are written, separated by a blank line, to
    ``<self.dest>text/<file name without extension>.txt``.

    If ``self.cites`` is True, each passage is preceded by its reference
    wrapped in ``#`` characters on its own line.

    :param text: the path to the XML file from which to extract the plain text
    :type text: str
    """
    # Imported locally so this method does not rely on a module-level
    # import of os being present elsewhere in the file.
    import os

    interactive_text = CapitainsCtsText(resource=etree.parse(text).getroot())
    reffs = interactive_text.getReffs(level=len(interactive_text.citation))
    passages = [interactive_text.getTextualNode(passage) for passage in reffs]
    plaintext = [r.export(Mimetypes.PLAINTEXT, exclude=["tei:note"]).strip() for r in passages]
    if self.cites is True:
        plaintext = ['#' + reff + '#\n' + t for reff, t in zip(reffs, plaintext)]

    # Strip only the final extension and use the platform's path separator,
    # instead of splitting on '/' and removing every '.xml' occurrence.
    base_name = os.path.splitext(os.path.basename(text))[0]

    # Explicit UTF-8: the passages routinely contain non-ASCII characters,
    # and the locale's default encoding may not be able to encode them.
    with open('{}text/{}.txt'.format(self.dest, base_name), mode='w', encoding='utf-8') as f:
        f.write('\n\n'.join(plaintext))

def run(self):
""" creates a new corpus directory containing only the passing text files and their metadata files
Expand Down Expand Up @@ -160,7 +172,7 @@ def cmd(**kwargs):
"""
if kwargs['travis'] is True:
status, message = Travis(path=kwargs['path'], dest=kwargs['dest'], tar=kwargs['tar'],
txt=kwargs['txt'], cites=kwargs['cites']).run()
txt=kwargs['txt'], cites=kwargs['cites'], workers=int(kwargs['workers'])).run()
return status, message
else:
return False, 'You cannot run build on the base class'
5 changes: 5 additions & 0 deletions HookTest/cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,11 @@ def parse_args_build(args):
parser.add_argument("--txt", help="Extract plain text files from the XML files", action="store_true", default=False)
parser.add_argument("--cites", help="Include citation for each passage in the plain text files",
action="store_true", default=False)
parser.add_argument(
"--workers",
help="The number of processes to use for extracting plain text.",
default=3
)

args = parser.parse_args(args)
return args
Expand Down
2 changes: 1 addition & 1 deletion tests/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ def test_run_clone_farsiLit(self):
"--console", "inline", "--verbose", "--scheme", "epidoc"
])
self.assertIn(
">>> [failed] 3 out of 5 files did not pass the tests\n", logs,
">>> [success] 0 out of 5 files did not pass the tests\n", logs,
"Test conclusion should be printed"
)
self.assertSubset(
Expand Down