Skip to content

Commit

Permalink
Added multiprocessing for extraction of plain text
Browse files Browse the repository at this point in the history
  • Loading branch information
sonofmun committed Feb 20, 2018
1 parent 810d314 commit 43f8c9a
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 14 deletions.
40 changes: 26 additions & 14 deletions HookTest/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from MyCapytain.resources.texts.local.capitains.cts import CapitainsCtsText
from MyCapytain.common.constants import Mimetypes
from lxml import etree
from multiprocessing.pool import Pool


class Build(object):
Expand All @@ -24,7 +25,7 @@ class Build(object):
:type cites: bool
"""

def __init__(self, path, dest, tar=False, txt=False, cites=False):
def __init__(self, path, dest, tar=False, txt=False, cites=False, workers=3):
"""
:param path: the path to the directory that contains the corpus's data directory
Expand All @@ -37,6 +38,8 @@ def __init__(self, path, dest, tar=False, txt=False, cites=False):
:type txt: bool
:param cites: whether to include the citation string for each of the lowest level citation elements
:type cites: bool
:param workers: the number of processes to use in building plain text
:type workers: int
"""

if path.endswith('/'):
Expand All @@ -50,6 +53,7 @@ def __init__(self, path, dest, tar=False, txt=False, cites=False):
self.tar = tar
self.txt = txt
self.cites = cites
self.workers = workers

def repo_file_list(self):
""" Build the list of XML files for the source repo represented by self.path
Expand Down Expand Up @@ -104,18 +108,26 @@ def plain_text(self):
passing_texts = [x for x in glob('{}data/*/*/*.xml'.format(self.dest)) if '__cts__' not in x]
sys.stdout.write('Extracting Text.\n')
sys.stdout.flush()
for text in passing_texts:
sys.stdout.write('.')
sys.stdout.flush()
interactive_text = CapitainsCtsText(resource=etree.parse(text).getroot())
reffs = interactive_text.getReffs(level=len(interactive_text.citation))
passages = [interactive_text.getTextualNode(passage) for passage in reffs]
plaintext = [r.export(Mimetypes.PLAINTEXT, exclude=["tei:note"]).strip() for r in passages]
if self.cites is True:
for i, t in enumerate(plaintext):
plaintext[i] = '#' + reffs[i] + '#\n' + t
with open('{}text/{}.txt'.format(self.dest, text.split('/')[-1].replace('.xml', '')), mode='w') as f:
f.write('\n\n'.join(plaintext))
with Pool(processes=self.workers) as executor:
# Send the tasks in order to the pool
for _ in executor.imap_unordered(self.build_texts, [text for text in passing_texts]):
sys.stdout.write('.')
sys.stdout.flush()

# Required for coverage
executor.close()
executor.join()

def build_texts(self, text):
    """ Extract the plain text of a single XML file and write it to the text directory

    The text is split into passages at its deepest citation level. Each passage is exported as
    plain text with ``tei:note`` elements excluded and stripped of surrounding whitespace. The
    passages are then written, separated by a blank line, to ``{self.dest}text/{name}.txt`` where
    ``name`` is the name of the XML file without its extension. If ``self.cites`` is True, each
    passage is preceded by a line of the form ``#reference#``.

    :param text: the path to the XML file from which the plain text should be extracted
    :type text: str
    """
    # Imported locally so that this method does not depend on the module-level imports
    import os

    interactive_text = CapitainsCtsText(resource=etree.parse(text).getroot())
    # The length of the citation scheme is used as the level, i.e., the deepest citation level
    reffs = interactive_text.getReffs(level=len(interactive_text.citation))
    passages = [interactive_text.getTextualNode(passage) for passage in reffs]
    plaintext = [r.export(Mimetypes.PLAINTEXT, exclude=["tei:note"]).strip() for r in passages]
    if self.cites is True:
        plaintext = ['#' + ref + '#\n' + passage for ref, passage in zip(reffs, plaintext)]
    # Strip only the directory part and the final extension: the other dots in the file name
    # (e.g., the components of a CTS URN) must be preserved
    name = os.path.splitext(os.path.basename(text))[0]
    # Explicit UTF-8 so that non-ASCII texts are written identically whatever the locale is
    with open('{}text/{}.txt'.format(self.dest, name), mode='w', encoding='utf-8') as f:
        f.write('\n\n'.join(plaintext))

def run(self):
""" creates a new corpus directory containing only the passing text files and their metadata files
Expand Down Expand Up @@ -160,7 +172,7 @@ def cmd(**kwargs):
"""
if kwargs['travis'] is True:
status, message = Travis(path=kwargs['path'], dest=kwargs['dest'], tar=kwargs['tar'],
txt=kwargs['txt'], cites=kwargs['cites']).run()
txt=kwargs['txt'], cites=kwargs['cites'], workers=int(kwargs['workers'])).run()
return status, message
else:
return False, 'You cannot run build on the base class'
5 changes: 5 additions & 0 deletions HookTest/cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,11 @@ def parse_args_build(args):
parser.add_argument("--txt", help="Extract plain text files from the XML files", action="store_true", default=False)
parser.add_argument("--cites", help="Include citation for each passage in the plain text files",
action="store_true", default=False)
parser.add_argument(
"--workers",
help="The number of processes to use for extracting plain text.",
default=3
)

args = parser.parse_args(args)
return args
Expand Down

0 comments on commit 43f8c9a

Please sign in to comment.