From b1484462c3ccdcff78eb3a8087f662c825d44262 Mon Sep 17 00:00:00 2001 From: ramirezg Date: Tue, 3 Mar 2015 15:49:54 -0500 Subject: [PATCH 1/3] allowing doc process toolkit to parse other forms of documents --- textextraction/doc_process_toolkit.py | 55 ++++++++++++++++++--------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/textextraction/doc_process_toolkit.py b/textextraction/doc_process_toolkit.py index e2454c7..ded87fd 100644 --- a/textextraction/doc_process_toolkit.py +++ b/textextraction/doc_process_toolkit.py @@ -18,18 +18,23 @@ def get_doc_length(doc_text): return len(tuple(WORDS.finditer(doc_text))) -def check_for_text(doc_path): +def check_for_text(doc_path, extension): """ Using `pdffonts` returns True if document has fonts, which in essence - means it has text + means it has text. If a document is not a pdf automatically returns True. """ - pdffonts_output = subprocess.Popen( - ['pdffonts %s' % doc_path], - shell=True, - stdout=subprocess.PIPE, - ) - if pdffonts_output.communicate()[0].decode("utf-8").count("\n") > 2: - return True + has_text = False + if extension == '.pdf': + pdffonts_output = subprocess.Popen( + ['pdffonts %s' % doc_path], + shell=True, + stdout=subprocess.PIPE, + ) + if pdffonts_output.communicate()[0].decode("utf-8").count("\n") > 2: + has_text = True + else: + has_text = True + return has_text def save_text(document, export_path=None): @@ -74,7 +79,7 @@ def pdf_to_img(doc_path, export_path=None): return export_path -def pdf_to_text(doc_path, port=9998): +def doc_to_text(doc_path, port=9998): """ Converts a document to text using the Tika server """ document = subprocess.Popen( @@ -86,33 +91,45 @@ def pdf_to_text(doc_path, port=9998): return document -def process_documents(glob_path, port=9998, skip_finished=False): +def convert_documents(glob_path, skip_converted, port=9998): """ Converts pdfs to text and uses OCR if the initial attempt fails """ for doc_path in glob.iglob(glob_path): - if 
os.path.exists(doc_path.replace('.pdf', '.txt')) and skip_finished: + extension = '.%s' % doc_path.split('.')[-1].lower() + if os.path.exists(doc_path.replace(extension, '.txt')) and \ + skip_converted: logging.info("%s: has already been converted", doc_path) else: extraction_succeeded = None # Check if the document has text - if check_for_text(doc_path): - doc = pdf_to_text(doc_path=doc_path, port=port) + if check_for_text(doc_path, extension): + doc = doc_to_text(doc_path=doc_path, port=port) doc_text = doc.stdout.read().decode('utf-8') # Check if text extraction succeeded if get_doc_length(doc_text) > 10: extraction_succeeded = True - - if extraction_succeeded: - save_text(doc_text, doc_path.replace(".pdf", ".txt")) - else: + # If extraction fails and doc is .pdf, use OCR + if not extraction_succeeded and extension == '.pdf': img_path = pdf_to_img(doc_path) img_to_text(img_path) + else: + save_text(doc_text, doc_path.replace(extension, ".txt")) + +def process_documents(glob_path_list, skip_converted=True): + """ + Given a list of glob paths, loops through each and converts + documents to text + """ + for glob_path in glob_path_list: + convert_documents(glob_path=glob_path, skip_converted=skip_converted) if __name__ == '__main__': # testing script logging.basicConfig(level=logging.INFO) - process_documents('test_docs/*/*.pdf', skip_finished=True) + process_documents( + ['test_docs/*/*.pdf', 'test_docs/*/*.xls'], + skip_converted=True) From 0bc1b2cbdafdadd9715ecc9c86be47efa7ff7615 Mon Sep 17 00:00:00 2001 From: ramirezg Date: Tue, 3 Mar 2015 15:56:50 -0500 Subject: [PATCH 2/3] updating readme --- README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index d998d2a..b8e2b13 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![Coverage Status](https://coveralls.io/repos/18F/doc_processing_toolkit/badge.png)](https://coveralls.io/r/18F/doc_processing_toolkit) ##### About -Python library to extract text from 
PDF, and default to OCR when text extraction fails. +Python library to extract text from any file type compatible with [TIKA](http://tika.apache.org/). It defaults to OCR when text extraction of a PDF file fails. ##### Dependencies - [Apache Tika](http://tika.apache.org/) @@ -23,8 +23,9 @@ Start Tika Server In Python script ```python import doc_process_toolkit -# To convert all pdfs -doc_process_toolkit.process_documents("<>") -# To convert only pdfs that don't have a text file -doc_process_toolkit.process_documents("<>", skip_finished=True) +# To convert all PDF and XLS files +doc_process_toolkit.process_documents(["glob path/*.pdf", "glob path/*.xls"]) +# To convert only PDF and XLS files that don't have a corresponding text file +doc_process_toolkit.process_documents( + ["glob path/*.pdf", "glob path/*.xls"], skip_converted=True) ``` From e938374d020a62f0db4d2128e7650e9087edefcb Mon Sep 17 00:00:00 2001 From: ramirezg Date: Tue, 3 Mar 2015 15:58:59 -0500 Subject: [PATCH 3/3] fixing tests --- tests/test_extraction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_extraction.py b/tests/test_extraction.py index 64c9a61..c91ab26 100644 --- a/tests/test_extraction.py +++ b/tests/test_extraction.py @@ -22,10 +22,10 @@ def test_check_for_text(self): """ doc_path = "tests/fixtures/record_text.pdf" - self.assertTrue(dpt.check_for_text(doc_path)) + self.assertTrue(dpt.check_for_text(doc_path, '.pdf')) doc_path = "tests/fixtures/record_no_text.pdf" - self.assertFalse(dpt.check_for_text(doc_path)) + self.assertFalse(dpt.check_for_text(doc_path, '.pdf')) if __name__ == '__main__': unittest.main()