From b1484462c3ccdcff78eb3a8087f662c825d44262 Mon Sep 17 00:00:00 2001
From: ramirezg <gabriel.ramirez@gsa.gov>
Date: Tue, 3 Mar 2015 15:49:54 -0500
Subject: [PATCH 1/3] allowing doc process toolkit to parse other forms of
 documents

---
 textextraction/doc_process_toolkit.py | 55 ++++++++++++++++++---------
 1 file changed, 36 insertions(+), 19 deletions(-)

diff --git a/textextraction/doc_process_toolkit.py b/textextraction/doc_process_toolkit.py
index e2454c7..ded87fd 100644
--- a/textextraction/doc_process_toolkit.py
+++ b/textextraction/doc_process_toolkit.py
@@ -18,18 +18,23 @@ def get_doc_length(doc_text):
     return len(tuple(WORDS.finditer(doc_text)))
 
 
-def check_for_text(doc_path):
+def check_for_text(doc_path, extension):
     """
     Using `pdffonts` returns True if document has fonts, which in essence
-    means it has text
+    means it has text. If the document is not a PDF, always returns True.
     """
-    pdffonts_output = subprocess.Popen(
-        ['pdffonts %s' % doc_path],
-        shell=True,
-        stdout=subprocess.PIPE,
-    )
-    if pdffonts_output.communicate()[0].decode("utf-8").count("\n") > 2:
-        return True
+    has_text = False
+    if extension == '.pdf':
+        pdffonts_output = subprocess.Popen(
+            # argument list, no shell: odd characters in paths are safe
+            ['pdffonts', doc_path],
+            stdout=subprocess.PIPE,
+        )
+        if pdffonts_output.communicate()[0].decode("utf-8").count("\n") > 2:
+            has_text = True
+    else:
+        has_text = True
+    return has_text
 
 
 def save_text(document, export_path=None):
@@ -74,7 +79,7 @@ def pdf_to_img(doc_path, export_path=None):
     return export_path
 
 
-def pdf_to_text(doc_path, port=9998):
+def doc_to_text(doc_path, port=9998):
     """ Converts a document to text using the Tika server """
 
     document = subprocess.Popen(
@@ -86,33 +91,45 @@ def pdf_to_text(doc_path, port=9998):
     return document
 
 
-def process_documents(glob_path, port=9998, skip_finished=False):
+def convert_documents(glob_path, skip_converted, port=9998):
     """
     Converts pdfs to text and uses OCR if the initial attempt fails
     """
 
     for doc_path in glob.iglob(glob_path):
-        if os.path.exists(doc_path.replace('.pdf', '.txt')) and skip_finished:
+        base_path, extension = os.path.splitext(doc_path)
+        extension = extension.lower()
+        if os.path.exists(base_path + '.txt') and skip_converted:
             logging.info("%s: has already been converted", doc_path)
         else:
             extraction_succeeded = None
             # Check if the document has text
-            if check_for_text(doc_path):
-                doc = pdf_to_text(doc_path=doc_path, port=port)
+            if check_for_text(doc_path, extension):
+                doc = doc_to_text(doc_path=doc_path, port=port)
                 doc_text = doc.stdout.read().decode('utf-8')
                 # Check if text extraction succeeded
                 if get_doc_length(doc_text) > 10:
                     extraction_succeeded = True
-
-            if extraction_succeeded:
-                save_text(doc_text, doc_path.replace(".pdf", ".txt"))
-            else:
+            # If extraction fails and doc is .pdf, use OCR
+            if not extraction_succeeded and extension == '.pdf':
                 img_path = pdf_to_img(doc_path)
                 img_to_text(img_path)
+            else:
+                save_text(doc_text, base_path + ".txt")
+
 
+def process_documents(glob_path_list, skip_converted=True):
+    """
+    Given a list of glob paths, converts the documents matched by each one
+    to text. `skip_converted` is passed through to `convert_documents`.
+    """
+    for glob_path in glob_path_list:
+        convert_documents(glob_path=glob_path, skip_converted=skip_converted)
 
 if __name__ == '__main__':
 
     # testing script
     logging.basicConfig(level=logging.INFO)
-    process_documents('test_docs/*/*.pdf', skip_finished=True)
+    process_documents(
+        ['test_docs/*/*.pdf', 'test_docs/*/*.xls'],
+        skip_converted=True)

From 0bc1b2cbdafdadd9715ecc9c86be47efa7ff7615 Mon Sep 17 00:00:00 2001
From: ramirezg <gabriel.ramirez@gsa.gov>
Date: Tue, 3 Mar 2015 15:56:50 -0500
Subject: [PATCH 2/3] updating readme

---
 README.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index d998d2a..b8e2b13 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 [![Coverage Status](https://coveralls.io/repos/18F/doc_processing_toolkit/badge.png)](https://coveralls.io/r/18F/doc_processing_toolkit)
 
 ##### About
-Python library to extract text from PDF, and default to OCR when text extraction fails.
+Python library to extract text from any file type compatible with [Apache Tika](http://tika.apache.org/). It falls back to OCR when text extraction from a PDF file fails.
 
 ##### Dependencies
 - [Apache Tika](http://tika.apache.org/)
@@ -23,8 +23,9 @@ Start Tika Server
 In Python script
 ```python
 import doc_process_toolkit
-# To convert all pdfs
-doc_process_toolkit.process_documents("<<Document directory>>")
-# To convert only pdfs that don't have a text file
-doc_process_toolkit.process_documents("<<Document directory>>", skip_finished=True)
+# To convert all PDF and XLS files, including already-converted ones
+doc_process_toolkit.process_documents(["glob path/*.pdf", "glob path/*.xls"], skip_converted=False)
+# To convert only PDF and XLS files that don't have a corresponding text file
+doc_process_toolkit.process_documents(
+    ["glob path/*.pdf", "glob path/*.xls"], skip_converted=True)
 ```

From e938374d020a62f0db4d2128e7650e9087edefcb Mon Sep 17 00:00:00 2001
From: ramirezg <gabriel.ramirez@gsa.gov>
Date: Tue, 3 Mar 2015 15:58:59 -0500
Subject: [PATCH 3/3] fixing tests

---
 tests/test_extraction.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_extraction.py b/tests/test_extraction.py
index 64c9a61..c91ab26 100644
--- a/tests/test_extraction.py
+++ b/tests/test_extraction.py
@@ -22,10 +22,10 @@ def test_check_for_text(self):
         """
 
         doc_path = "tests/fixtures/record_text.pdf"
-        self.assertTrue(dpt.check_for_text(doc_path))
+        self.assertTrue(dpt.check_for_text(doc_path, '.pdf'))
 
         doc_path = "tests/fixtures/record_no_text.pdf"
-        self.assertFalse(dpt.check_for_text(doc_path))
+        self.assertFalse(dpt.check_for_text(doc_path, '.pdf'))
 
 if __name__ == '__main__':
     unittest.main()