This repository was archived by the owner on Nov 7, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 25
allowing doc process toolkit to parse other forms of documents #3
Merged
Merged
Changes from all commits
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,18 +18,23 @@ def get_doc_length(doc_text): | |
return len(tuple(WORDS.finditer(doc_text))) | ||
|
||
|
||
def check_for_text(doc_path): | ||
def check_for_text(doc_path, extension): | ||
""" | ||
Using `pdffonts` returns True if document has fonts, which in essence | ||
means it has text | ||
means it has text. If a document is not a pdf automatically returns True. | ||
""" | ||
pdffonts_output = subprocess.Popen( | ||
['pdffonts %s' % doc_path], | ||
shell=True, | ||
stdout=subprocess.PIPE, | ||
) | ||
if pdffonts_output.communicate()[0].decode("utf-8").count("\n") > 2: | ||
return True | ||
has_text = False | ||
if extension == '.pdf': | ||
pdffonts_output = subprocess.Popen( | ||
['pdffonts %s' % doc_path], | ||
shell=True, | ||
stdout=subprocess.PIPE, | ||
) | ||
if pdffonts_output.communicate()[0].decode("utf-8").count("\n") > 2: | ||
has_text = True | ||
else: | ||
has_text = True | ||
return has_text | ||
|
||
|
||
def save_text(document, export_path=None): | ||
|
@@ -74,7 +79,7 @@ def pdf_to_img(doc_path, export_path=None): | |
return export_path | ||
|
||
|
||
def pdf_to_text(doc_path, port=9998): | ||
def doc_to_text(doc_path, port=9998): | ||
""" Converts a document to text using the Tika server """ | ||
|
||
document = subprocess.Popen( | ||
|
@@ -86,33 +91,45 @@ def pdf_to_text(doc_path, port=9998): | |
return document | ||
|
||
|
||
def process_documents(glob_path, port=9998, skip_finished=False): | ||
def convert_documents(glob_path, skip_converted, port=9998): | ||
""" | ||
Converts pdfs to text and uses OCR if the initial attempt fails | ||
""" | ||
|
||
for doc_path in glob.iglob(glob_path): | ||
if os.path.exists(doc_path.replace('.pdf', '.txt')) and skip_finished: | ||
extension = '.%s' % doc_path.split('.')[-1].lower() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What you really want here is `os.path.splitext`: https://docs.python.org/3/library/os.path.html#os.path.splitext |
||
if os.path.exists(doc_path.replace(extension, '.txt')) and \ | ||
skip_converted: | ||
logging.info("%s: has already been converted", doc_path) | ||
else: | ||
extraction_succeeded = None | ||
# Check if the document has text | ||
if check_for_text(doc_path): | ||
doc = pdf_to_text(doc_path=doc_path, port=port) | ||
if check_for_text(doc_path, extension): | ||
doc = doc_to_text(doc_path=doc_path, port=port) | ||
doc_text = doc.stdout.read().decode('utf-8') | ||
# Check if text extraction succeeded | ||
if get_doc_length(doc_text) > 10: | ||
extraction_succeeded = True | ||
|
||
if extraction_succeeded: | ||
save_text(doc_text, doc_path.replace(".pdf", ".txt")) | ||
else: | ||
# If extraction fails and doc is .pdf, use OCR | ||
if not extraction_succeeded and extension == '.pdf': | ||
img_path = pdf_to_img(doc_path) | ||
img_to_text(img_path) | ||
else: | ||
save_text(doc_text, doc_path.replace(extension, ".txt")) | ||
|
||
|
||
def process_documents(glob_path_list, skip_converted=True): | ||
""" | ||
Given a list of glob paths, loops through each and converts | ||
documents to text | ||
""" | ||
for glob_path in glob_path_list: | ||
convert_documents(glob_path=glob_path, skip_converted=skip_converted) | ||
|
||
if __name__ == '__main__': | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should this be in a unit test somewhere, instead of in the main block here? |
||
|
||
# testing script | ||
logging.basicConfig(level=logging.INFO) | ||
process_documents('test_docs/*/*.pdf', skip_finished=True) | ||
# Smoke-run the converter over the sample documents. Each glob pattern must
# be its own list item: without the separating comma Python joins adjacent
# string literals into one pattern that matches nothing. Both patterns are
# relative to the working directory, matching the original test invocation.
process_documents(
    ['test_docs/*/*.pdf', 'test_docs/*/*.xls'],
    skip_converted=True)
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Wouldn't a simpler check be to try and extract the text from a document and see if that returns anything?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I actually run two checks for text. The first is a quick scan to make sure that text even exists in the document. The second occurs after Tika extracts the document text, to make sure that the text extraction worked. I figured the first check would save some Tika processing time.