18F · khandelwal · Feb 23, 2015 · Feb 13, 2015 · Feb 13, 2015 · Feb 13, 2015
diff --git a/.coveragerc b/.coveragerc
@@ -0,0 +1,6 @@
+[report]
+omit =
+    */site-packages/*
+    bin/*
+    develop-eggs/*
+    eggs/*
diff --git a/.gitignore b/.gitignore
@@ -1,7 +1,11 @@
+utils.py
 *.DS_Store
 tika-app-1.7.jar
 research files/
 *.tiff
 *_new.txt
 config.py
 __pycache__
+test_docs/
+*.pyc
+.coverage
diff --git a/.travis.yml b/.travis.yml
@@ -0,0 +1,12 @@
+language: python
+python:
+  - "3.4"
+before_install:
+    sudo apt-get install poppler-utils
+install:
+  - pip install nose
+  - pip install coveralls
+script:
+  - nosetests --with-coverage
+after_success:
+  - coveralls
diff --git a/README.md b/README.md
@@ -1,15 +1,30 @@
 ### Document Processing Toolkit
 
+[![Coverage Status](https://coveralls.io/repos/18F/doc_processing_toolkit/badge.png)](https://coveralls.io/r/18F/doc_processing_toolkit)
+
 ##### About
 Python library to extract text from PDF, and default to OCR when text extraction fails.
 
 ##### Dependencies
 - [Apache Tika](http://tika.apache.org/)
+- [Ghostscript](http://www.ghostscript.com/)
 - [Tesseract](https://code.google.com/p/tesseract-ocr/)
 
-##### Running the script (currently)
-1. Download tika-app-1.7.jar from [Apache Tika](http://tika.apache.org/) in the main dir.
-2. `brew install ghostscripts`
-3. `brew install tesseract`
-4. Run using doc_process_toolkit.py
+##### Installation
+1. Download tika-app-1.7.jar from [Apache Tika](http://tika.apache.org/)
+2. Mac: `brew install ghostscripts` Ubuntu: `sudo apt-get install ghostscript`
+3. `brew install tesseract` Ubuntu: `sudo apt-get install tesseract-ocr`
+
+##### Usage
+These script assume that Apache Tika server is running in text mode.
+Start Tika Server
+`java -jar tika-app-1.7.jar --server --text --port 9998`
 
+In Python script
+```python
+import doc_process_toolkit
+# To convert all pdfs
+doc_process_toolkit.process_documents("<<Document directory>>")
+# To convert only pdfs that don't have a text file
+doc_process_toolkit.process_documents("<<Document directory>>", skip_finished=True)
+```
diff --git a/config.py.example b/config.py.example
diff --git a/doc_process_toolkit.py b/doc_process_toolkit.py
@@ -1,98 +1,62 @@
 import glob
-import time
+import logging
+import os
 import re
 import subprocess
-import tinys3
-import config
 
-WORDS = re.compile('[A-Za-z]{3,}')
+"""
+The functions below are minimal Python wrappers Ghostscript, Tika, and
+Tesseract. They are intended to simplify converting pdf files into usable text.
+"""
 
+WORDS = re.compile('[A-Za-z]{3,}')
 
-def check_server(port=9998):
-    """ Callback type function to let user know server status """
 
-    pid = get_pid(port=port)
-    if pid:
-        print("Server running on port {0} with pid of {1}".format(port, pid))
-    else:
-        print("Server not running on port {}".format(port))
+def get_doc_length(doc_text):
+    """ Return the length of a document and doc text """
 
+    return len(tuple(WORDS.finditer(doc_text)))
 
-def get_pid(port=9998):
-    """ Checks Tika server's PID """
 
-    server_pid = subprocess.Popen(
-        args=['lsof -i :%s -t' % port],
+def check_for_text(doc_path):
+    """
+    Using `pdffonts` returns True if document has fonts, which in essence
+    means it has text
+    """
+    pdffonts_output = subprocess.Popen(
+        ['pdffonts %s' % doc_path],
+        shell=True,
         stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        shell=True
     )
-    return server_pid.communicate()[0]
-
-
-def server_setup_timer(port=9998):
-    """ Checks for server to start with a loop """
-
-    for i in range(10):
-        time.sleep(1)
-        if get_pid(port):
-            print("Server Started !")
-            break
-        else:
-            print("Waited %s second(s)" % i)
+    if pdffonts_output.communicate()[0].decode("utf-8").count("\n") > 2:
+        return True
 
 
-def stat_tika_server(port=9998):
-    """ Starts Tika server """
-
-    pid = get_pid(port)
-    if pid:
-        print("Process already running on port %s" % port)
-        return
-    subprocess.Popen(
-        args=['java -jar {0} --server --text --port {1}'.format(
-            config.TIKA_PATH, port)],
-        shell=True)
-    server_setup_timer(port)
-
-
-def stop_tika_server(port=9998):
-    """ Kills Tika server """
-
-    pid = get_pid(port)
-    if pid:
-        subprocess.Popen(
-            args=['kill -9 %s' % pid],
-            stderr=subprocess.STDOUT,
-            shell=True)
-    else:
-        print("Server not running on port %s" % port)
-
-
-def save_document(document, export_path):
-    """ saves the document, maybe add a pip instead of reading with python """
+def save_text(document, export_path=None):
+    """ Reads document text and saves it to specified export path """
 
     with open(export_path, 'w') as f:
-        f.write(document.stdout.read())
+        f.write(document)
 
 
-def convert_img_to_text(img_path, export_path=None):
-    """ Converts image file to ORCed txt file using tesseract """
+def img_to_text(img_path, export_path=None):
+    """ Uses Tesseract OCR to convert tiff image to text file """
 
     if not export_path:
-        export_path = img_path.replace(".tiff", "_new")
+        export_path = img_path.replace(".tiff", '')
 
     document = subprocess.Popen(
         args=['tesseract', img_path, export_path],
         stdout=subprocess.PIPE,
         stderr=subprocess.STDOUT
     )
     document.communicate()
-    print("Document Converted: %s" % img_path)
+    logging.info("%s converted to text from image", img_path)
+    return export_path
 
 
-def convert_to_img(doc_path, export_path=None):
-    """ Saves pdf file to tiff image before OCR """
+def pdf_to_img(doc_path, export_path=None):
+    """ Converts and saves pdf file to tiff image using Ghostscript"""
 
     if not export_path:
         export_path = doc_path.replace(".pdf", ".tiff")
@@ -106,66 +70,49 @@ def convert_to_img(doc_path, export_path=None):
         stdout=subprocess.PIPE
     )
     process.communicate()
+    logging.info("%s converted to tiff image", doc_path)
     return export_path
 
 
-def convert_to_text(doc_paths, port=9998):
-    """ Converts a document """
-
-    if type(doc_paths) == list:
+def pdf_to_text(doc_path, port=9998):
+    """ Converts a document to text using the Tika server """
 
-        for doc_path in doc_paths:
-            document = subprocess.Popen(
-                args=['nc localhost {0} < {1}'.format(port, doc_path)],
-                stdout=subprocess.PIPE,
-                stderr=subprocess.STDOUT,
-                shell=True)
-            yield document, doc_path
-
-
-def get_doc_length(document):
-    """ Return the length of a document -- note piping to wc might be faster"""
-
-    return len(tuple(WORDS.finditer(document)))
-
-
-def upload_to_s3(doc_path, conn):
-    """ Upload a single document to AWS S3"""
-
-    with open(doc_path, 'rb') as doc:
-        conn.upload(doc_path, doc, config.BUCKET)
-        print("uploaded %s" % doc_path)
+    document = subprocess.Popen(
+        args=['nc localhost {0} < {1}'.format(port, doc_path)],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        shell=True)
+    logging.info("%s converted to text from pdf", doc_path)
+    return document
 
 
-def start_connection():
-    """ Starts tinys3 object to connect to S3 """
+def process_documents(glob_path, port=9998, skip_finished=False):
+    """
+    Converts pdfs to text and uses OCR if the initial attempt fails
+    """
 
-    return tinys3.Connection(config.S3_ACCESS_KEY, config.S3_SECRET_KEY)
+    for doc_path in glob.iglob(glob_path):
+        if os.path.exists(doc_path.replace('.pdf', '.txt')) and skip_finished:
+            logging.info("%s: has already been converted", doc_path)
+        else:
+            extraction_succeeded = None
+            # Check if the document has text
+            if check_for_text(doc_path):
+                doc = pdf_to_text(doc_path=doc_path, port=port)
+                doc_text = doc.stdout.read().decode('utf-8')
+                # Check if text extraction succeeded
+                if get_doc_length(doc_text) > 10:
+                    extraction_succeeded = True
+
+            if extraction_succeeded:
+                save_text(doc_text, doc_path.replace(".pdf", ".txt"))
+            else:
+                img_path = pdf_to_img(doc_path)
+                img_to_text(img_path)
 
 
 if __name__ == '__main__':
 
-    """
-    # Testing the script
-    stat_tika_server()
-    check_server()
-    document_paths = glob.glob("test_docs/*/*.pdf")
-    documents = convert_to_text(document_paths)
-    for document, doc_path in documents:
-        save_document(document, doc_path.replace(".pdf", ".txt"))
-        print("%s converted" % doc_path)
-
-    stop_tika_server()
-    """
-    """
-    # Testing the script for pdf to image to txt
-    document_paths = glob.glob("test_docs/*/*.pdf")[0:1]
-    for doc_path in document_paths:
-        img_path = convert_to_img(doc_path)
-        convert_img_to_text(img_path)
-    """
-
-    conn = start_connection()
-    document_paths = glob.glob("test_docs/*/*.pdf")[3:4]
-    for doc_path in document_paths:
-        upload_to_s3(doc_path, conn=conn)
+    # testing script
+    logging.basicConfig(level=logging.INFO)
+    process_documents('test_docs/*/*.pdf', skip_finished=True)
diff --git a/fixtures/record_no_text.pdf b/fixtures/record_no_text.pdf
diff --git a/fixtures/record_text.pdf b/fixtures/record_text.pdf
diff --git a/requirements.txt b/requirements.txt
diff --git a/test_docs/090004d28024984b/record.json b/test_docs/090004d28024984b/record.json
diff --git a/test_docs/090004d28024984b/record.pdf b/test_docs/090004d28024984b/record.pdf
diff --git a/test_docs/090004d28024984b/record.txt b/test_docs/090004d28024984b/record.txt
diff --git a/test_docs/090004d28024984d/record.json b/test_docs/090004d28024984d/record.json
diff --git a/test_docs/090004d28024984d/record.pdf b/test_docs/090004d28024984d/record.pdf