Skip to content
This repository was archived by the owner on Nov 7, 2018. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .coveragerc
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[report]
omit =
*/site-packages/*
bin/*
develop-eggs/*
eggs/*
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
utils.py
*.DS_Store
tika-app-1.7.jar
research files/
*.tiff
*_new.txt
config.py
__pycache__
test_docs/
*.pyc
.coverage
12 changes: 12 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
language: python
python:
- "3.4"
before_install:
sudo apt-get install poppler-utils
install:
- pip install nose
- pip install coveralls
script:
- nosetests --with-coverage
after_success:
- coveralls
25 changes: 20 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,15 +1,30 @@
### Document Processing Toolkit

[![Coverage Status](https://coveralls.io/repos/18F/doc_processing_toolkit/badge.png)](https://coveralls.io/r/18F/doc_processing_toolkit)

##### About
Python library to extract text from PDF, and default to OCR when text extraction fails.

##### Dependencies
- [Apache Tika](http://tika.apache.org/)
- [Ghostscript](http://www.ghostscript.com/)
- [Tesseract](https://code.google.com/p/tesseract-ocr/)

##### Running the script (currently)
1. Download tika-app-1.7.jar from [Apache Tika](http://tika.apache.org/) in the main dir.
2. `brew install ghostscripts`
3. `brew install tesseract`
4. Run using doc_process_toolkit.py
##### Installation
1. Download tika-app-1.7.jar from [Apache Tika](http://tika.apache.org/)
2. Mac: `brew install ghostscripts` Ubuntu: `sudo apt-get install ghostscript`
3. `brew install tesseract` Ubuntu: `sudo apt-get install tesseract-ocr`

##### Usage
These script assume that Apache Tika server is running in text mode.
Start Tika Server
`java -jar tika-app-1.7.jar --server --text --port 9998`

In Python script
```python
import doc_process_toolkit
# To convert all pdfs
doc_process_toolkit.process_documents("<<Document directory>>")
# To convert only pdfs that don't have a text file
doc_process_toolkit.process_documents("<<Document directory>>", skip_finished=True)
```
4 changes: 0 additions & 4 deletions config.py.example

This file was deleted.

183 changes: 65 additions & 118 deletions doc_process_toolkit.py
Original file line number Diff line number Diff line change
@@ -1,98 +1,62 @@
import glob
import time
import logging
import os
import re
import subprocess
import tinys3
import config

WORDS = re.compile('[A-Za-z]{3,}')
"""
The functions below are minimal Python wrappers Ghostscript, Tika, and
Tesseract. They are intended to simplify converting pdf files into usable text.
"""

WORDS = re.compile('[A-Za-z]{3,}')

def check_server(port=9998):
""" Callback type function to let user know server status """

pid = get_pid(port=port)
if pid:
print("Server running on port {0} with pid of {1}".format(port, pid))
else:
print("Server not running on port {}".format(port))
def get_doc_length(doc_text):
""" Return the length of a document and doc text """

return len(tuple(WORDS.finditer(doc_text)))

def get_pid(port=9998):
""" Checks Tika server's PID """

server_pid = subprocess.Popen(
args=['lsof -i :%s -t' % port],
def check_for_text(doc_path):
"""
Using `pdffonts` returns True if document has fonts, which in essence
means it has text
"""
pdffonts_output = subprocess.Popen(
['pdffonts %s' % doc_path],
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
shell=True
)
return server_pid.communicate()[0]


def server_setup_timer(port=9998):
""" Checks for server to start with a loop """

for i in range(10):
time.sleep(1)
if get_pid(port):
print("Server Started !")
break
else:
print("Waited %s second(s)" % i)
if pdffonts_output.communicate()[0].decode("utf-8").count("\n") > 2:
return True


def stat_tika_server(port=9998):
""" Starts Tika server """

pid = get_pid(port)
if pid:
print("Process already running on port %s" % port)
return
subprocess.Popen(
args=['java -jar {0} --server --text --port {1}'.format(
config.TIKA_PATH, port)],
shell=True)
server_setup_timer(port)


def stop_tika_server(port=9998):
""" Kills Tika server """

pid = get_pid(port)
if pid:
subprocess.Popen(
args=['kill -9 %s' % pid],
stderr=subprocess.STDOUT,
shell=True)
else:
print("Server not running on port %s" % port)


def save_document(document, export_path):
""" saves the document, maybe add a pip instead of reading with python """
def save_text(document, export_path=None):
""" Reads document text and saves it to specified export path """

with open(export_path, 'w') as f:
f.write(document.stdout.read())
f.write(document)


def convert_img_to_text(img_path, export_path=None):
""" Converts image file to ORCed txt file using tesseract """
def img_to_text(img_path, export_path=None):
""" Uses Tesseract OCR to convert tiff image to text file """

if not export_path:
export_path = img_path.replace(".tiff", "_new")
export_path = img_path.replace(".tiff", '')

document = subprocess.Popen(
args=['tesseract', img_path, export_path],
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT
)
document.communicate()
print("Document Converted: %s" % img_path)
logging.info("%s converted to text from image", img_path)
return export_path


def convert_to_img(doc_path, export_path=None):
""" Saves pdf file to tiff image before OCR """
def pdf_to_img(doc_path, export_path=None):
""" Converts and saves pdf file to tiff image using Ghostscript"""

if not export_path:
export_path = doc_path.replace(".pdf", ".tiff")
Expand All @@ -106,66 +70,49 @@ def convert_to_img(doc_path, export_path=None):
stdout=subprocess.PIPE
)
process.communicate()
logging.info("%s converted to tiff image", doc_path)
return export_path


def convert_to_text(doc_paths, port=9998):
""" Converts a document """

if type(doc_paths) == list:
def pdf_to_text(doc_path, port=9998):
""" Converts a document to text using the Tika server """

for doc_path in doc_paths:
document = subprocess.Popen(
args=['nc localhost {0} < {1}'.format(port, doc_path)],
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
shell=True)
yield document, doc_path


def get_doc_length(document):
""" Return the length of a document -- note piping to wc might be faster"""

return len(tuple(WORDS.finditer(document)))


def upload_to_s3(doc_path, conn):
""" Upload a single document to AWS S3"""

with open(doc_path, 'rb') as doc:
conn.upload(doc_path, doc, config.BUCKET)
print("uploaded %s" % doc_path)
document = subprocess.Popen(
args=['nc localhost {0} < {1}'.format(port, doc_path)],
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
shell=True)
logging.info("%s converted to text from pdf", doc_path)
return document


def start_connection():
""" Starts tinys3 object to connect to S3 """
def process_documents(glob_path, port=9998, skip_finished=False):
"""
Converts pdfs to text and uses OCR if the initial attempt fails
"""

return tinys3.Connection(config.S3_ACCESS_KEY, config.S3_SECRET_KEY)
for doc_path in glob.iglob(glob_path):
if os.path.exists(doc_path.replace('.pdf', '.txt')) and skip_finished:
logging.info("%s: has already been converted", doc_path)
else:
extraction_succeeded = None
# Check if the document has text
if check_for_text(doc_path):
doc = pdf_to_text(doc_path=doc_path, port=port)
doc_text = doc.stdout.read().decode('utf-8')
# Check if text extraction succeeded
if get_doc_length(doc_text) > 10:
extraction_succeeded = True

if extraction_succeeded:
save_text(doc_text, doc_path.replace(".pdf", ".txt"))
else:
img_path = pdf_to_img(doc_path)
img_to_text(img_path)


if __name__ == '__main__':

"""
# Testing the script
stat_tika_server()
check_server()
document_paths = glob.glob("test_docs/*/*.pdf")
documents = convert_to_text(document_paths)
for document, doc_path in documents:
save_document(document, doc_path.replace(".pdf", ".txt"))
print("%s converted" % doc_path)

stop_tika_server()
"""
"""
# Testing the script for pdf to image to txt
document_paths = glob.glob("test_docs/*/*.pdf")[0:1]
for doc_path in document_paths:
img_path = convert_to_img(doc_path)
convert_img_to_text(img_path)
"""

conn = start_connection()
document_paths = glob.glob("test_docs/*/*.pdf")[3:4]
for doc_path in document_paths:
upload_to_s3(doc_path, conn=conn)
# testing script
logging.basicConfig(level=logging.INFO)
process_documents('test_docs/*/*.pdf', skip_finished=True)
Binary file added fixtures/record_no_text.pdf
Binary file not shown.
Binary file added fixtures/record_text.pdf
Binary file not shown.
2 changes: 0 additions & 2 deletions requirements.txt

This file was deleted.

18 changes: 0 additions & 18 deletions test_docs/090004d28024984b/record.json

This file was deleted.

Binary file removed test_docs/090004d28024984b/record.pdf
Binary file not shown.
44 changes: 0 additions & 44 deletions test_docs/090004d28024984b/record.txt

This file was deleted.

18 changes: 0 additions & 18 deletions test_docs/090004d28024984d/record.json

This file was deleted.

Binary file removed test_docs/090004d28024984d/record.pdf
Binary file not shown.
Loading