## Extracting Text from PDFs

### Imports

In [1]:
from IPython.core.display import display, HTML, Markdown as md
# Inject notebook-wide CSS: widen the container, cap paragraph/list width,
# left-align rendered tables and centre PNG outputs.
# The stylesheet is assembled from adjacent string literals so no line-wrap
# artefacts can end up inside the CSS text.
display(HTML(
    "<style>"
    ".container { width:80% !important; } "
    "p, ul {max-width: 40em;} "
    ".rendered_html table { margin-left: 0; } "
    ".output_subarea.output_png { display: flex; justify-content: center;}"
    "</style>"
))
import pandas     as pd
import numpy      as np
import re
from   pyocr      import pyocr
from   pyocr      import builders
import io
import codecs
import os
import shutil
from   PIL        import Image as PI
from   wand.image import Image
from   difflib    import SequenceMatcher  
from   bs4        import BeautifulSoup as bs
import tempfile

### Functions


In [2]:
# Remove all temp magick files (best effort)
def removeMagickFiles():
    """
    Delete every file whose name starts with 'magick' from the system temp
    directory (and its sub-directories).

    The original note on this function says ImageMagick may keep its temp
    files locked. A locked file makes ``os.remove`` raise, so each deletion is
    attempted on its own: one undeletable file no longer prevents the
    remaining files from being removed.

    Returns
    -------
    None
    """
    # Get files from temp folder
    tempFolder = tempfile.gettempdir()
    skipped    = 0

    for (path, filename) in getMagickFiles(tempFolder):

        # Get exact file path
        target = os.path.join(path, filename)

        try:
            os.remove(target)
        except OSError:
            # Still locked / already gone / not permitted: leave it and move on
            skipped += 1

    if skipped:
        print('Could not remove', skipped, 'magick temp file(s)')

In [3]:
# Collect the temp magick files found below a folder
def getMagickFiles(tempFolder):
    """
    Walk ``tempFolder`` recursively and list the files whose name starts
    with 'magick'.

    Parameters
    ----------
    tempFolder : str
        Directory to scan (sub-directories included).

    Returns
    -------
    list of (str, str)
        One ``(directory, filename)`` tuple per matching file. The list is
        empty when nothing matches or when the folder cannot be walked.
    """
    return [
        (root, name)
        for (root, _subdirs, names) in os.walk(tempFolder)
        for name in names
        if name.startswith('magick')
    ]

In [4]:
def convertPDFtoTextArray(folder, file):
    """
    Takes a PDF document, converts it to JPEG images and OCRs each page into
    an array of UTF-8 encoded text. The array is also saved next to the PDF
    with numpy (``np.save`` appends the '.npy' extension).

    Parameters
    ----------
    folder : str
        Directory containing the PDF; the array is written here too.
    file : str
        File name of the PDF inside ``folder``.

    Returns
    -------
    list of bytes
        One UTF-8 encoded string per page.

    Raises
    ------
    RuntimeError
        If pyocr reports no available OCR tool.
    """
    tools = pyocr.get_available_tools()
    if not tools:
        raise RuntimeError('No OCR tool found')
    tool       = tools[0]
    pages_text = []

    # Every wand image is opened in a `with` block so it is closed as soon as
    # it is no longer needed instead of waiting for garbage collection.
    with Image(filename=os.path.join(folder, file), resolution=600) as pdf_as_image:
        with pdf_as_image.convert('jpeg') as pdf_as_jpeg:
            for img in pdf_as_jpeg.sequence:
                with Image(image=img) as img_page:
                    req_image = img_page.make_blob('jpeg')

                # `builders` is the name imported at the top of the notebook,
                # the same one extractTextFromPDF uses.
                with PI.open(io.BytesIO(req_image)) as page_image:
                    txt = tool.image_to_string(
                        page_image,
                        builder = builders.TextBuilder()
                    )
                pages_text.append(txt.encode('utf-8', 'ignore'))

    # Strip only a trailing '.pdf'. The previous pattern '.pdf' had an
    # unescaped dot and no anchor, so it also removed e.g. 'xpdf' from the
    # middle of a name.
    filename = re.sub(r'\.pdf$', '', file)
    np.save(os.path.join(folder, filename), pages_text)

    # The clean-up is now actually called (it used to be a bare name, which
    # does nothing). It stays best effort: a temp file that cannot be deleted
    # must not lose the text that was just extracted.
    try:
        removeMagickFiles()
    except OSError:
        pass

    return pages_text

In [5]:
def convertFolderofPDFstotextArrays(folder):
    """
    Run convertPDFtoTextArray on every entry of ``folder`` whose name ends
    with 'pdf', printing a progress line before each conversion.

    Parameters
    ----------
    folder : str
        Directory to scan (not recursive).
    """
    pdf_names = [name for name in os.listdir(folder) if name.endswith('pdf')]
    total     = len(pdf_names)

    for position, pdf_name in enumerate(pdf_names, start=1):
        print('Converting file', position, 'of', total)
        convertPDFtoTextArray(folder, pdf_name)

In [6]:
# Takes a saved array of page texts, strips headers and footers, flattens the
# text (line breaks and repeated spaces removed) and writes it to a .txt file
def cleanAndSaveText(folder, 
                     file, 
                     footkeeplist=[], 
                     footdroplist=[], 
                     headkeeplist=[], 
                     headdroplist=[],
                     footthreshold=3, 
                     headthreshold=3, 
                     hiddenFooters=True, 
                     hiddenHeaders=True):
    """
    Load ``folder/file`` through convertTextArrayToText, join the pages into
    one string, flatten it and write it as UTF-8 text beside the input.

    Parameters
    ----------
    folder, file : str
        Location of the saved page array (normally a '.npy' file).
    footkeeplist, footdroplist, headkeeplist, headdroplist,
    footthreshold, headthreshold, hiddenFooters, hiddenHeaders
        Passed unchanged to convertTextArrayToText.

    Returns
    -------
    None
    """
    data     = convertTextArrayToText(folder, file, footkeeplist, footdroplist, headkeeplist, headdroplist,
                                      footthreshold, headthreshold, hiddenFooters, hiddenHeaders)
    doc_text = ' '.join(data)

    # remove breaks from words that wrap over two lines
    doc_text = re.sub('-\n', '', doc_text)

    # replace single returns with spaces
    doc_text = re.sub('\n', ' ', doc_text)

    # collapse every run of spaces to one space (replacing only pairs left
    # double spaces behind whenever three or more spaces were adjacent)
    doc_text = re.sub(' {2,}', ' ', doc_text)

    # Swap only the trailing extension; the old pattern rewrote every 'npy'
    # occurring anywhere in the file name.
    txtfile = re.sub(r'\.npy$', '.txt', file)
    if txtfile == file:
        # No '.npy' suffix: never overwrite the input file with the output
        txtfile = file + '.txt'

    # `with` closes the handle even if the write fails
    with io.open(folder+'/'+txtfile, encoding='utf-8', mode='w') as f:
        f.write(doc_text)

In [7]:
# Use for full process from PDF doc to pages of text without headers/footers
def convertOnePDFtoText(folder, 
                        file, 
                        footkeeplist=[], 
                        headkeeplist=[], 
                        footthreshold=3, 
                        headthreshold=3, 
                        hiddenFooters=True, 
                        hiddenHeaders=True):
    """
    OCR ``folder/file`` with convertPDFtoTextArray, decode the pages and strip
    footers and headers.

    Parameters
    ----------
    folder, file : str
        Location of the PDF.
    footkeeplist, headkeeplist : list of str
        Candidate lines that must not be removed.
    footthreshold, headthreshold : number
        Score a candidate must exceed to be removed.
    hiddenFooters, hiddenHeaders : bool
        Also remove a detected footer/header when it occurs exactly once
        elsewhere in a page.

    Returns
    -------
    list of str
        The cleaned pages.
    """
    pages_of_text = convertPDFtoTextArray(folder, file)
    pages_of_text = decodeText(pages_of_text)

    # Keyword arguments are required here: removeFooters/removeHeaders take
    # (text, keeplist, droplist, thresh, hidden). The previous positional call
    # skipped `droplist`, so the threshold was used as the drop list and the
    # hidden flag as the threshold.
    pages_of_text = removeFooters(pages_of_text,
                                  keeplist=footkeeplist,
                                  thresh=footthreshold,
                                  hidden=hiddenFooters)
    pages_of_text = removeHeaders(pages_of_text,
                                  keeplist=headkeeplist,
                                  thresh=headthreshold,
                                  hidden=hiddenHeaders)
    return pages_of_text

In [8]:
# Use for turning an already processed PDF (i.e. one converted to an npy file)
# into pages of text without headers or footers
def convertTextArrayToText(folder, 
                           file, 
                           footkeeplist=[], 
                           footdroplist=[], 
                           headkeeplist=[],
                           headdroplist=[], 
                           footthreshold=3, 
                           headthreshold=3, 
                           hiddenFooters=True, 
                           hiddenHeaders=True):
    """
    Load the saved page array ``folder/file``, decode it and strip footers,
    then headers.

    The keep/drop lists, thresholds and hidden flags are handed to
    removeFooters and removeHeaders respectively.

    Returns
    -------
    list of str
        The pages after footer and header removal.
    """
    encoded_pages   = np.load(folder+'/'+file)
    decoded_pages   = decodeText(encoded_pages)
    without_footers = removeFooters(decoded_pages, footkeeplist, footdroplist, footthreshold, hiddenFooters)
    return removeHeaders(without_footers, headkeeplist, headdroplist, headthreshold, hiddenHeaders)

In [9]:
def decodeText(encoded_text):
    """
    Decode every page of ``encoded_text`` from UTF-8.

    Parameters
    ----------
    encoded_text : iterable of bytes-like
        Pages as produced by the OCR step.

    Returns
    -------
    list of str
        The decoded pages, in the same order.
    """
    decoded_text = []
    for page in encoded_text:
        decoded_text.append(page.decode('utf-8'))
    return decoded_text

In [10]:
def removeFooters(text_with_footers, keeplist=[], droplist=[], thresh=2.9, hidden=True):
    """
    Detect repeated footer lines and remove them from a copy of the pages.

    Parameters
    ----------
    text_with_footers : list of str
        One string per page. The input is copied, not modified.
    keeplist : list of str
        Candidate texts that must never be removed. They are compared with
        the candidate text produced by findFooters (digits shown as '@').
    droplist : list of str
        Extra strings to look for when ``hidden`` is set.
    thresh : number
        Score a candidate must exceed to count as a footer.
    hidden : bool
        Also remove footer/droplist strings that occur exactly once
        elsewhere in a page.

    Returns
    -------
    list of str
        The pages with footers removed.
    """
    possible_footers = findFooters(text_with_footers)
    footer_scores    = scoreHeadersOrFooters(possible_footers)
    probableFooters  = sortScores(footer_scores, threshold=thresh, type='footers')

    # Honour keeplist for the positional deletion too. Previously it was only
    # applied to the hidden search, so kept lines were still cut from the
    # bottom of the page.
    footers_to_delete = [entry for entry in probableFooters if entry[0][0] not in keeplist]

    new_text = deleteFooters(text_with_footers.copy(), footers_to_delete)

    if hidden:
        set_of_footers = list({entry[0][0] for entry in footers_to_delete} | set(droplist))
        new_text       = removeHeadersOrFootersHiddeninText(new_text, set_of_footers)

    return new_text

In [11]:
def findFooters(pages):
    """
    Collect five footer candidates per page, scanning upwards from the end.

    For each page the scan looks for the previous newline (searching at most
    200 characters back) and takes the text between that newline and the
    previous boundary as a candidate; runs of consecutive newlines are stepped
    over. When no newline is found the scan position resets, so short pages
    yield repeated candidates.

    Parameters
    ----------
    pages : list of str
        One string per page.

    Returns
    -------
    list of list
        For every page a list of five ``[line, start, end]`` entries where
        ``line`` is the candidate with every digit replaced by '@', ``start``
        is the index of the newline before the line (-1 if none was found)
        and ``end`` is the index where the line stops (-1 meaning "to the end
        of the page").
    """
    footer_candidates = []
    for page in pages:

        lines = []
        start = -1
        end   = -1

        for _ in range(5):

            while end == start:
                start = page.rfind('\n', max(0, end-200), end)
                if start == -1:
                    break
                elif start == end-1:
                    # Empty line (two newlines in a row): move the boundary up
                    end = start

            # end == -1 means the line runs to the end of the page
            if end == -1:
                raw_line = page[start+1:]
            else:
                raw_line = page[start+1:end]

            # Raw string: '\d' in a normal literal is an invalid escape sequence
            line = re.sub(r'\d', '@', raw_line)

            lines.append([line, start, end])
            end = start

        footer_candidates.append(lines)

    return footer_candidates

In [12]:
def scoreHeadersOrFooters(candidates):
    """
    Score every header/footer candidate by how similar it is to the candidate
    in the same position on neighbouring pages.

    For page ``j`` and candidate position ``i`` the score is the sum of the
    difflib ``SequenceMatcher`` ratios against position ``i`` of every other
    page within ``WIN`` pages on either side, multiplied once by
    ``weights[i]``.

    Two defects of the earlier version are fixed here:
    * the weight was applied inside the neighbour loop, so it compounded with
      every page compared instead of scaling the final sum;
    * the window stopped one page short (``range(first, last)``), so the last
      page of the window was never used as a neighbour and two-page documents
      were never compared at all.

    Parameters
    ----------
    candidates : list of list
        Output of findHeaders / findFooters: per page, five
        ``[line, start, end]`` entries.

    Returns
    -------
    list of list
        Per page, five ``[candidate, page_index, position, score]`` entries.
    """
    WIN      = 8  # range of pages back and forth to compare
    weights  = [1, 0.75, 0.5, 0.5, 0.5]
    numpages = len(candidates)
    scores   = []

    for j in range(numpages):

        first = max(0, j-WIN)
        last  = min(j+WIN, numpages-1)  # inclusive index of the last neighbour
        pageScores = []

        for i in range(5):
            similaritySum = 0

            for k in range(first, last+1):
                if k != j:
                    similaritySum += SequenceMatcher(None, candidates[j][i][0], candidates[k][i][0]).ratio()

            pageScores.append([candidates[j][i], j, i, weights[i]*similaritySum])

        scores.append(pageScores)

    return scores

In [13]:
def sortScores(scores, threshold=2.5, type = 'headers/footers'):
    """
    Pick the scored candidates whose score is strictly above ``threshold``.

    Parameters
    ----------
    scores : list of list
        Per page, entries of the form ``[candidate, page, position, score]``.
    threshold : number
        Entries are selected when their score (index 3) exceeds this value.
    type : str
        Label used only in the printed message.

    Returns
    -------
    list
        The selected entries, in page order then position order. The list is
        also printed.
    """
    deleteList = [line for page in scores for line in page if line[3] > threshold]
    print('Will delete these', type, ':', deleteList, '\n')
    return deleteList

In [14]:
def removeHeaders(text_with_headers, keeplist=[], droplist=[],thresh=2.9, hidden=True):
    """
    Detect repeated header lines and remove them from a copy of the pages.

    Parameters
    ----------
    text_with_headers : list of str
        One string per page. The input is copied, not modified.
    keeplist : list of str
        Candidate texts that must never be removed. They are compared with
        the candidate text produced by findHeaders (digits shown as '@').
    droplist : list of str
        Extra strings to look for when ``hidden`` is set.
    thresh : number
        Score a candidate must exceed to count as a header.
    hidden : bool
        Also remove header/droplist strings that occur exactly once
        elsewhere in a page.

    Returns
    -------
    list of str
        The pages with headers removed.
    """
    possible_headers = findHeaders(text_with_headers)
    header_scores    = scoreHeadersOrFooters(possible_headers)
    probable_headers = sortScores(header_scores, threshold=thresh, type='headers')

    # Honour keeplist for the positional deletion too. Previously it was only
    # applied to the hidden search, so kept lines were still cut from the
    # top of the page.
    headers_to_delete = [entry for entry in probable_headers if entry[0][0] not in keeplist]

    new_text = deleteHeaders(text_with_headers.copy(), headers_to_delete)

    if hidden:
        set_of_headers = list({entry[0][0] for entry in headers_to_delete} | set(droplist))
        new_text       = removeHeadersOrFootersHiddeninText(new_text, set_of_headers)

    return new_text

In [15]:
def deleteFooters(text_pages, footers_to_delete, keeplist=[]):
    """
    Cut the given footer lines out of their pages (in place).

    Parameters
    ----------
    text_pages : list of str
        One string per page; entries are replaced in place.
    footers_to_delete : list
        Entries shaped ``[[line, start, end], page_index, ...]`` as produced
        by findFooters + scoreHeadersOrFooters + sortScores. ``start`` is the
        index of the newline before the line (-1 if none) and ``end`` the
        index where the line stops (-1 meaning end of page).
    keeplist : list of str
        Entries whose ``line`` is listed here are skipped.

    Returns
    -------
    list of str
        ``text_pages`` itself.

    Notes
    -----
    Fixes compared with the earlier version:
    * the deletion ran outside the keeplist check, so a kept entry re-used the
      offsets of the previous entry (or failed on the first one);
    * the character before the footer's leading newline(s) was cut as well;
    * the backward newline scan could run past the start of the page;
    * the same span listed twice was deleted twice.
    """
    # Group the distinct spans per page
    spans_by_page = {}
    for entry in footers_to_delete:
        line_text, start, end = entry[0]
        if line_text in keeplist:
            continue
        spans_by_page.setdefault(entry[1], set()).add((start, end))

    for pagenum, spans in spans_by_page.items():
        page = text_pages[pagenum]

        # Work from the bottom of the page upwards so that the offsets of the
        # spans still to be processed stay valid.
        for start, end in sorted(spans, reverse=True):

            # Walk back over the newline(s) in front of the footer; `last_kept`
            # ends on the last character to keep (-1 when nothing is kept).
            last_kept = min(start, len(page) - 1)
            while last_kept >= 0 and page[last_kept] == '\n':
                last_kept -= 1

            head = page[:last_kept + 1]
            page = head if end == -1 else head + page[end:]

        text_pages[pagenum] = page

    return text_pages

In [16]:
def removeHeadersOrFootersHiddeninText(text_pages, possibleHiddenItems):
    """
    Remove header/footer strings that appear inside the body of a page.

    An item is removed from a page only when it occurs exactly once there.
    The match, together with the newlines directly before and after it, is
    replaced by a single space. Each removal is printed.

    Parameters
    ----------
    text_pages : list of str
        One string per page; entries are replaced in place.
    possibleHiddenItems : iterable of str
        Literal strings to search for. Empty strings are ignored.

    Returns
    -------
    list of str
        ``text_pages`` itself.
    """
    print(possibleHiddenItems, '\n')

    for option in possibleHiddenItems:

        # An empty needle cannot identify a header or footer
        if not option:
            continue

        for i, page in enumerate(text_pages):

            position = page.find(option)

            # Skip pages without the item and pages where it occurs more than once
            if position == -1 or position != page.rfind(option):
                continue

            start = position
            end   = position + len(option)

            # Absorb the newlines on both sides of the match
            while start > 0 and page[start-1] == '\n':
                start -= 1
            # `end` is the index of the first character after the match, so
            # that is the one to inspect. The earlier code looked at
            # page[end+1] and could swallow a real character that was
            # followed by a newline.
            while end < len(page) and page[end] == '\n':
                end += 1

            print(option, 'deleted from page', i, 'position', position)
            text_pages[i] = page[:start] + ' ' + page[end:]

    return text_pages

In [17]:
def findHeaders(pages):
    """
    Collect five header candidates per page, scanning downwards from the top.

    For each page the scan looks for the next newline (searching at most 200
    characters ahead) and takes the text up to it as a candidate; leading
    empty lines are stepped over. When no newline is found the candidate is
    empty and the scan restarts from the top, so short pages yield repeated
    candidates.

    Parameters
    ----------
    pages : list of str
        One string per page.

    Returns
    -------
    list of list
        For every page a list of five ``[line, start, end]`` entries where
        ``line`` is the candidate with every digit replaced by '@', ``start``
        is the index of its first character and ``end`` the index of the
        newline that terminates it (-1 when no newline was found, in which
        case ``line`` is '').
    """
    header_candidates = []
    for page in pages:
        lines = []
        start = 0
        end   = 0

        for _ in range(5):

            while end - start < 1:
                end = page.find('\n', start, start+200)
                if end == -1:
                    break
                elif start == end:
                    # Empty line: step over the newline and search again
                    start = end + 1

            if end == -1:
                line = ''
            else:
                # Raw string: '\d' in a normal literal is an invalid escape sequence
                line = re.sub(r'\d', '@', page[start:end])

            lines.append([line, start, end])
            start = end + 1

        header_candidates.append(lines)

    return header_candidates

In [18]:
def deleteHeaders(text_pages, headers_to_delete, keeplist=[]):
    """
    Cut the given header lines out of their pages (in place).

    Parameters
    ----------
    text_pages : list of str
        One string per page; entries are replaced in place.
    headers_to_delete : list
        Entries shaped ``[[line, start, end], page_index, ...]`` as produced
        by findHeaders + scoreHeadersOrFooters + sortScores. ``start`` is the
        index of the line's first character and ``end`` the index of the
        newline that terminates it (-1 when findHeaders found no line).
    keeplist : list of str
        Entries whose ``line`` is listed here are skipped.

    Returns
    -------
    list of str
        ``text_pages`` itself.

    Notes
    -----
    Fixes compared with the earlier version:
    * the deletion ran outside the keeplist check, so a kept entry re-used the
      offsets of the previous entry (or failed on the first one);
    * the forward newline scan raised IndexError at the end of a page;
    * several headers on one page were removed top-down, so every offset
      after the first was stale;
    * entries with ``end == -1`` (no line found) and duplicate spans were
      processed as if they were real lines.
    """
    # Group the distinct, valid spans per page
    spans_by_page = {}
    for entry in headers_to_delete:
        line_text, start, end = entry[0]
        if line_text in keeplist or end == -1:
            continue
        spans_by_page.setdefault(entry[1], set()).add((start, end))

    for pagenum, spans in spans_by_page.items():
        page = text_pages[pagenum]

        # Remove the lowest span first so the offsets of the spans above it
        # are still correct when their turn comes.
        for start, end in sorted(spans, reverse=True):

            # Drop the terminating newline and any blank lines that follow
            resume = end + 1
            while resume < len(page) and page[resume] == '\n':
                resume += 1

            page = page[:start] + page[resume:]

        text_pages[pagenum] = page

    return text_pages

In [19]:
# Extract the text from a PDF
# Must be passed the path to the PDF file and the OCR tool to use
def extractTextFromPDF(file, tool):
    """
    Render a PDF to JPEG pages, OCR every page and return the whole document
    as one flattened UTF-8 byte string.

    Parameters
    ----------
    file : str
        Path of the PDF file.
    tool : pyocr tool
        OCR backend; its ``image_to_string`` method is called once per page.

    Returns
    -------
    bytes
        Page texts joined by a space, with hyphenated line wraps re-joined,
        newlines turned into spaces and runs of spaces collapsed.
    """
    # Note: The imageMagick functions read the pdf pages to memory. If there is not enough memory it writes them to the temp
    #       directory and does not reliably release or delete those files. So when iterating through lots of PDFs with lots
    #       of pages the drive can fill up. Every wand image is therefore opened in a `with` block so it is closed as soon
    #       as possible - including the source image, which was previously left open.
    #       This may still not clean up everything, but it helps.

    # Create the list we will append page text to
    pages_text = []

    # Get the PDF as a JPEG image
    with Image(filename = file, resolution=600) as pdf_as_image:
        with pdf_as_image.convert('jpeg') as pdf_as_jpeg:

            # Iterate through the pdf images
            for img in pdf_as_jpeg.sequence:
                with Image(image = img) as img_page:
                    req_image = img_page.make_blob('jpeg')

                # OCR the page
                with PI.open(io.BytesIO(req_image)) as im:
                    txt = tool.image_to_string(
                        im,
                        builder = builders.TextBuilder()
                    )
                pages_text.append(txt.encode('utf-8', 'ignore'))

    # flatten array of pages into one byte string for the whole document
    doc_text = b' '.join(pages_text)
    del pages_text

    # remove breaks from words that wrap over two lines
    doc_text = re.sub(b'-\n', b'', doc_text)

    # replace returns with spaces
    doc_text = re.sub(b'\n', b' ', doc_text)

    # collapse every run of spaces to one space (replacing only pairs left
    # double spaces behind whenever three or more spaces were adjacent)
    doc_text = re.sub(b' {2,}', b' ', doc_text)

    # The former "double returns -> single returns" step was dropped: no
    # newline is left at this point, so it could never match.

    # Return the text
    return doc_text

In [20]:
# Drops all characters up to the first '-' in the TagTog file name 
def dropGarbage(fileName):
    """
    Strip the prefix that ends at the first '-' of ``fileName``.

    Everything up to and including the first '-' is removed; if the next
    character is '_' it is removed as well. A name without any '-' is
    returned unchanged (it used to raise ValueError), and a name ending in
    '-' yields '' (it used to raise IndexError).

    Parameters
    ----------
    fileName : str
        File name to clean.

    Returns
    -------
    str
        The remainder of the name after the prefix.
    """
    _prefix, dash, remainder = fileName.partition('-')

    # Nothing to drop when the name has no '-'
    if not dash:
        return fileName

    # Skip one extra character when the dash is followed by '_'
    if remainder.startswith('_'):
        return remainder[1:]

    return remainder

### Script

In [21]:
# Set root directories
print('Set root Directories')
root_src_dir = 'PDFs\\'
root_dst_dir = 'ExtractedText\\'

# Get the installed OCR tools.
# Fail right here with a clear message when none is installed: the earlier
# bare `except: pass` hid the problem until `tool` was used further down and
# surfaced as a NameError.
tools = pyocr.get_available_tools()
if len(tools) == 0:
    raise RuntimeError("No OCR tool found")

tool = tools[0]

# Iterate through files in the PDF folder
print('Begin iterating through PDFs')
for src_dir, dirs, files in os.walk(root_src_dir):
    
    # Mirror the source directory under the destination root
    dst_dir = src_dir.replace(root_src_dir, root_dst_dir, 1)
    print('Current dir: ' + dst_dir)
    
    # Create the directory if it doesn't exist
    print('Check and create destination directory')
    if not os.path.exists(dst_dir):
        os.makedirs(dst_dir)
        
    # Iterate through the files    
    print('Iterate through files')
    for file in files:
        print('')
        
        # Create file paths
        src_file = os.path.join(src_dir, file)
        dst_file = os.path.join(dst_dir, dropGarbage(file))
        print('Source file: ' + src_file)
        print('Dest   file: ' + dst_file)
          
        # Delete previous files
        print('Check and delete previous entries for same file')
        if os.path.exists(dst_file):
            
            # Unless they are exactly the same file
            if os.path.samefile(src_file, dst_file):
                continue
                
            os.remove(dst_file)
            
        # Extract the text from the PDF
        print('Extract text from PDF')
        extractedText = extractTextFromPDF(src_file, tool)
        
        # Clean up the text as much as possible
        print('Clean up the extracted text')
        soup          = bs(extractedText)
        raw           = soup.get_text()

        # Swap only the trailing 'pdf'; the unanchored pattern rewrote every
        # 'pdf' occurring anywhere in the path.
        txtfile       = re.sub(r'pdf$', 'txt', dst_file)
        
        # Write out the .txt file (`with` closes the handle even on error)
        print('Write out the text file')
        with io.open(txtfile, 
                     encoding = 'utf-8', 
                     mode = 'w') as f:
            f.write(raw)
        

Set root Directories
Begin iterating through PDFs
Current dir: ExtractedText\
Check and create destination directory
Iterate through files
Current dir: ExtractedText\OldSet
Check and create destination directory
Iterate through files

Source file: PDFs\OldSet\a59fulELeqIRtdP_zyI1F_BVRa.y-s42003_018_0260_y.txt.ann.pdf
Dest   file: ExtractedText\OldSet\s42003_018_0260_y.txt.ann.pdf
Check and delete previous entries for same file
Extract text from PDF
Clean up the extracted text
Write out the text file

Source file: PDFs\OldSet\a8lxxCHk3HXTleqg2.Wf4RxRevaS-summer_activity_patterns_for_mosses_and_lichens_in_maritime_antarctica.txt.ann.pdf
Dest   file: ExtractedText\OldSet\summer_activity_patterns_for_mosses_and_lichens_in_maritime_antarctica.txt.ann.pdf
Check and delete previous entries for same file
Extract text from PDF
Clean up the extracted text
Write out the text file

Source file: PDFs\OldSet\aDXuKg2KPEH_fR5A797NwmzPvSxy-fmicb_10_01018.txt.ann.pdf
Dest   file: ExtractedText\OldSet\fm

End.