Skip to content
This repository has been archived by the owner on Nov 7, 2018. It is now read-only.

Commit

Permalink
Check return code of curl
Browse files Browse the repository at this point in the history
  • Loading branch information
divergentdave committed Apr 7, 2016
1 parent 3cf6d57 commit cd9b4c9
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 9 deletions.
4 changes: 2 additions & 2 deletions tests/test_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,12 @@ def test_doc_to_text(self):
doc_path=os.path.join(LOCAL_PATH, 'fixtures/record_text.pdf'))
doc = extractor.doc_to_text()
self.assertTrue(
'Cupcake ipsum dolor sit' in doc.stdout.read().decode('utf-8'))
'Cupcake ipsum dolor sit' in doc.decode('utf-8'))

extractor = TextExtraction(
doc_path=os.path.join(LOCAL_PATH, 'fixtures/record_no_text.pdf'))
doc = extractor.doc_to_text()
self.assertEqual(doc.stdout.read().decode('utf-8').strip('\n'), '')
self.assertEqual(doc.decode('utf-8').strip('\n'), '')

def test_extract(self):
"""
Expand Down
12 changes: 5 additions & 7 deletions textextraction/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,8 @@ def save(self, document, ext):
def doc_to_text(self):
""" Converts a document to text using the Tika server """

document = subprocess.Popen(
document = subprocess.check_output(
args=[self.text_arg_str.format(self.doc_path, self.tika_port)],
stdout=subprocess.PIPE,
shell=True
)
logging.info("%s converted to text from pdf", self.doc_path)
Expand All @@ -53,14 +52,13 @@ def extract_metadata(self):
Extracts metadata using Tika into a json file
"""

metadata = subprocess.Popen(
metadata = subprocess.check_output(
args=[
self.metadata_arg_str.format(
self.doc_path, self.tika_port)],
stdout=subprocess.PIPE,
shell=True
)
self.save(metadata.stdout.read().decode('utf-8'), ext='_metadata.json')
self.save(metadata.decode('utf-8'), ext='_metadata.json')

def extract(self):
"""
Expand All @@ -69,7 +67,7 @@ def extract(self):
check if extraction produces text.
"""
self.extract_metadata()
self.save(self.doc_to_text().stdout.read().decode('utf-8'), ext='.txt')
self.save(self.doc_to_text().decode('utf-8'), ext='.txt')


class PDFTextExtraction(TextExtraction):
Expand Down Expand Up @@ -156,7 +154,7 @@ def extract(self):
if not self.has_text():
needs_ocr = True
else:
doc_text = self.doc_to_text().stdout.read().decode('utf-8')
doc_text = self.doc_to_text().decode('utf-8')
# Determine if extraction succeeded
if self.meets_len_threshold(doc_text):
self.save(doc_text, ext='.txt')
Expand Down

0 comments on commit cd9b4c9

Please sign in to comment.