Skip to content
This repository was archived by the owner on Nov 7, 2018. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
[![Coverage Status](https://coveralls.io/repos/18F/doc_processing_toolkit/badge.png)](https://coveralls.io/r/18F/doc_processing_toolkit)

##### About
Python library to extract text from PDF, and default to OCR when text extraction fails.
Python library to extract text from any file type compatible with [Tika](http://tika.apache.org/). It defaults to OCR when text extraction of a PDF file fails.

##### Dependencies
- [Apache Tika](http://tika.apache.org/)
Expand All @@ -23,8 +23,9 @@ Start Tika Server
In Python script
```python
import doc_process_toolkit
# To convert all pdfs
doc_process_toolkit.process_documents("<<Document directory>>")
# To convert only pdfs that don't have a text file
doc_process_toolkit.process_documents("<<Document directory>>", skip_finished=True)
# To convert all PDF and XLS files
doc_process_toolkit.process_documents(["glob path/*.pdf", "glob path/*.xls"], skip_converted=False)
# To convert only PDF and XLS files that don't have a corresponding text file
doc_process_toolkit.process_documents(
["glob path/*.pdf", "glob path/*.xls"], skip_converted=True)
```
4 changes: 2 additions & 2 deletions tests/test_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ def test_check_for_text(self):
"""

doc_path = "tests/fixtures/record_text.pdf"
self.assertTrue(dpt.check_for_text(doc_path))
self.assertTrue(dpt.check_for_text(doc_path, '.pdf'))

doc_path = "tests/fixtures/record_no_text.pdf"
self.assertFalse(dpt.check_for_text(doc_path))
self.assertFalse(dpt.check_for_text(doc_path, '.pdf'))

if __name__ == '__main__':
unittest.main()
55 changes: 36 additions & 19 deletions textextraction/doc_process_toolkit.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,23 @@ def get_doc_length(doc_text):
return len(tuple(WORDS.finditer(doc_text)))


def check_for_text(doc_path):
def check_for_text(doc_path, extension):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't a simpler check be to try and extract the text from a document and see if that returns anything?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually run two checks for text. The first is a quick scan to make sure that text even exists in the document. The second occurs after Tika extracts the document text, to make sure that the text extraction worked. I figured the quick scan would save some Tika processing time.

"""
Using `pdffonts` returns True if document has fonts, which in essence
means it has text
means it has text. If the document is not a PDF, automatically returns True.
"""
pdffonts_output = subprocess.Popen(
['pdffonts %s' % doc_path],
shell=True,
stdout=subprocess.PIPE,
)
if pdffonts_output.communicate()[0].decode("utf-8").count("\n") > 2:
return True
has_text = False
if extension == '.pdf':
pdffonts_output = subprocess.Popen(
['pdffonts %s' % doc_path],
shell=True,
stdout=subprocess.PIPE,
)
if pdffonts_output.communicate()[0].decode("utf-8").count("\n") > 2:
has_text = True
else:
has_text = True
return has_text


def save_text(document, export_path=None):
Expand Down Expand Up @@ -74,7 +79,7 @@ def pdf_to_img(doc_path, export_path=None):
return export_path


def pdf_to_text(doc_path, port=9998):
def doc_to_text(doc_path, port=9998):
""" Converts a document to text using the Tika server """

document = subprocess.Popen(
Expand All @@ -86,33 +91,45 @@ def pdf_to_text(doc_path, port=9998):
return document


def process_documents(glob_path, port=9998, skip_finished=False):
def convert_documents(glob_path, skip_converted, port=9998):
"""
Converts documents matching the glob path to text; for PDFs, uses OCR if the initial attempt fails
"""

for doc_path in glob.iglob(glob_path):
if os.path.exists(doc_path.replace('.pdf', '.txt')) and skip_finished:
extension = '.%s' % doc_path.split('.')[-1].lower()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if os.path.exists(doc_path.replace(extension, '.txt')) and \
skip_converted:
logging.info("%s: has already been converted", doc_path)
else:
extraction_succeeded = None
# Check if the document has text
if check_for_text(doc_path):
doc = pdf_to_text(doc_path=doc_path, port=port)
if check_for_text(doc_path, extension):
doc = doc_to_text(doc_path=doc_path, port=port)
doc_text = doc.stdout.read().decode('utf-8')
# Check if text extraction succeeded
if get_doc_length(doc_text) > 10:
extraction_succeeded = True

if extraction_succeeded:
save_text(doc_text, doc_path.replace(".pdf", ".txt"))
else:
# If extraction fails and doc is .pdf, use OCR
if not extraction_succeeded and extension == '.pdf':
img_path = pdf_to_img(doc_path)
img_to_text(img_path)
else:
save_text(doc_text, doc_path.replace(extension, ".txt"))


def process_documents(glob_path_list, skip_converted=True, port=9998):
    """
    Given a list of glob paths, loops through each and converts
    documents to text.

    glob_path_list: iterable of glob patterns. A single pattern string is
        also accepted and is treated as a one-item list.
    skip_converted: passed through to `convert_documents` for every
        pattern.
    port: passed through to `convert_documents` for every pattern.
    """
    # A bare string would otherwise be iterated character by character,
    # turning every single character into its own glob pattern.
    if isinstance(glob_path_list, str):
        glob_path_list = [glob_path_list]

    for glob_path in glob_path_list:
        convert_documents(
            glob_path=glob_path, skip_converted=skip_converted, port=port)

if __name__ == '__main__':
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be in a unit test somewhere? Instead in the main here?


# testing script
logging.basicConfig(level=logging.INFO)
process_documents('test_docs/*/*.pdf', skip_finished=True)
process_documents(
['test_docs/*/*.pdf', 'test_docs/*/*.xls'],
skip_converted=True)