In [6]:
'''The purpose of this code is to traverse a directory and convert any pdf files encountered to text. 

Input  =     The input for this code can either be a machine readable pdf or a scanned file (image). 
Output =     The output will be a text file whose file name is the same as that of the pdf that was originally input. 

Required Installations:

pytesseract = https://pypi.python.org/pypi/pytesseract
wand        = https://pypi.python.org/pypi/Wand
Pillow      = https://pypi.python.org/pypi/Pillow/2.2.1
imagemagick = https://www.imagemagick.org/script/index.php


author:  Chris Cirelli 01/31/2018
'''


"The purpose of this code is to traverse a directory and convert any pdf files encountered to text. \n\nInput  =     The input for this code can either be a machine readable pdf or a scanned file (image). \nOutput =     The output will be a text file whos file name is the same as the pdf's that was originally input. \n\nRequired Installations:\n\npytesseract = https://pypi.python.org/pypi/pytesseract\nwand        = https://pypi.python.org/pypi/Wand\nPillow      = https://pypi.python.org/pypi/Pillow/2.2.1\nimagemagick = https://www.imagemagick.org/script/index.php\n\n\nauthor:  Chris Cirelli 01/31/2018\n"

In [None]:
# SET THE TARGET DIRECTORY AND DEFINE DIR_LIST

In [7]:
'''The purpose of this code is to set the target directory where the user's PDF files are located.
   In addition, the object 'Dir_list' is created, which is a list object with the names of all of
   the files in this directory.  This object will be used in most of the remaining code'''

import os

# Keep the path itself in Target_dir.  os.chdir() returns None, so assigning its result would
# leave Target_dir unusable for the functions that later receive it as an argument.
Target_dir = '/Volumes/insight/Legal Analytics Sprint-S18/Team Folders/Team Wang/Chris/Classified Cases/PDF Image Poor Quality'
os.chdir(Target_dir)
Dir_list = os.listdir()

In [None]:
# TRAVERSE TARGET DIRECTORY AND IDENTIFY PDF DOCS

In [8]:
def get_list_pdf_Files(Target_dir, Dir_list):
    '''Change to the target working directory and return only the PDF file names from Dir_list.

    Input  = Target_dir: path of the directory to make the current working directory.  If None,
                         the current working directory is left unchanged.
             Dir_list:   iterable of file names (e.g. the result of os.listdir()).
    Output = List object containing only the names that end in '.pdf' (case-insensitive),
             in the same order as they appear in Dir_list.
    '''
    import os

    # os.chdir(None) raises TypeError, so treat None as "stay where we are".
    if Target_dir is not None:
        os.chdir(Target_dir)

    # Match on the file extension rather than on a substring, so that names such as
    # 'report.pdf.txt' are not mistaken for PDFs and 'REPORT.PDF' is not missed.
    File_list = [x for x in Dir_list if x.lower().endswith('.pdf')]

    return File_list


In [None]:
# CONVERT PDF IMAGES TO PNG FILES

In [10]:
def convert_pdf_to_png(Pdf_File, resolution=400, output_filename='converted1.jpg'):
    '''Render a pdf as an enhanced image file that is easier to OCR.

    Note: despite the function name, the default output is a JPEG ('converted1.jpg'), written to
    the current working directory.  The word 'converted' in the file name is what the text
    extraction step looks for.

    Input  = Pdf_File:        path of a single pdf file, either machine readable or a scanned image.
             resolution:      resolution passed to wand when opening the pdf (default 400, assumed
                              to be a good value for OCR).
             output_filename: name of the image file to write (default 'converted1.jpg').
    Output = None.  The image is written to disk.
             NOTE(review): for multi-page pdfs ImageMagick presumably writes one numbered file
             per page (e.g. 'converted1-0.jpg') - confirm against the files actually produced.
    '''
    # Imported here so the notebook can set MAGICK_HOME before wand is first loaded.
    from wand.image import Image

    with Image(filename=Pdf_File, resolution=resolution) as img:
        # Right-shift the blue channel by one bit; per the original author this turns the
        # background yellow, which makes black text stand out better.
        img.evaluate(operator='rightshift', value=1, channel='blue')
        # Adjust the levels (this appears to help for poorer quality pdf images).
        img.level(0.2, 0.9, gamma=1.1)
        # Save under a name containing 'converted' so later steps can find the file.
        img.save(filename=output_filename)

    return None
        

In [18]:
# Demo: convert a single file picked by position from Dir_list.  os.listdir() returns names in
# arbitrary order, so index 3 is not guaranteed to be a pdf (or to exist) - TODO confirm before re-running.
convert_pdf_to_png(Dir_list[3])   # Take a look in this directory.  The enhanced image is written as
                                  # 'converted1.jpg', so you can compare it to the original pdf image.

In [None]:
# CONVERT PNG IMAGES TO A SINGLE TEXT DOCUMENT

In [26]:
def create_textFile_from_images(Pdf):
    '''OCR every 'converted' image in the current directory and write the text to a single file.

    Input  = Pdf: name of the original pdf file from which the images were created.  It is only
                  used to build the name of the text file ('<name>.pdf' -> '<name>.txt').
    Output = None.  A single UTF-8 text file is written to the current working directory,
             containing the text of each image in page order.
    '''
    # Note: if the OCR output contains unwanted characters, pytesseract can be restricted to a
    # whitelist through its config argument, e.g.
    # config= "-c tessedit_char_whitelist=0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!#$%&*+,-.:;~`_=^<>?@(){}[]|/\\\"\' -psm 6"

    # Import Packages
    import os
    import re

    import pytesseract
    # Import the submodule explicitly; a bare 'import PIL' does not guarantee PIL.Image is loaded.
    from PIL import Image

    image_extensions = ('.jpg', '.jpeg', '.png')

    def _page_order(name):
        # Natural sort key: compare digit runs as numbers so page 10 sorts after page 2.
        return [int(part) if part.isdigit() else part for part in re.split(r'(\d+)', name)]

    # Only pick up image files with 'converted' in the name.  Filtering on the extension keeps a
    # pdf or txt file that happens to contain 'converted' from being handed to PIL.
    # os.listdir() returns names in arbitrary order, so sort explicitly to keep pages in sequence.
    page_images = sorted(
        (name for name in os.listdir()
         if 'converted' in name and name.lower().endswith(image_extensions)),
        key=_page_order,
    )

    # Drop only the trailing '.pdf' extension (any case) to build the text file name.
    base_name, extension = os.path.splitext(Pdf)
    if extension.lower() != '.pdf':
        base_name = Pdf

    # The context managers guarantee the text file and each image are closed, even on error.
    # Writing as UTF-8 avoids encode errors when Tesseract returns non-ASCII characters.
    with open(base_name + '.txt', 'w', encoding='utf-8') as New_File:
        for image_name in page_images:
            with Image.open(image_name) as im:
                # Use Tesseract to convert the image to a string.
                Text = pytesseract.image_to_string(im)
            New_File.write(Text)

    # Return None as this code writes directly to our target directory.
    return None



In [None]:
# CLEAN UP THE DINNER TABLE ONCE YOU'RE FINISHED

In [27]:
def cleanUpDir_remove_png():
    '''Remove the intermediate image files created by the conversion step from the current directory.

    Two kinds of files are deleted:
      * every file whose name ends in '.png' (case-insensitive), and
      * every image file ('.jpg', '.jpeg', '.png') with 'converted' in its name - the conversion
        step writes its output as 'converted1.jpg', so these must be removed as well or they
        would be picked up again when the next pdf is processed.

    Note:  move any png files that you want to keep out of the directory before running this code.

    Input  = None
    Output = None
    '''
    import os

    image_extensions = ('.jpg', '.jpeg', '.png')

    # The listing must be taken here (not reused from earlier) so it includes the newly created files.
    for File in os.listdir():
        lowered = File.lower()
        is_png = lowered.endswith('.png')
        is_converted_image = 'converted' in File and lowered.endswith(image_extensions)
        # Test the extension rather than a substring so e.g. 'notes.png.txt' is left alone.
        if is_png or is_converted_image:
            os.remove(File)

    return None

In [None]:
# PIPELINE ALL OF THE FUNCTIONS TOGETHER

In [28]:
def directory_pdf_file_conversion_2text_pipeline(Target_dir):
    '''Traverse a directory, convert every pdf file found to a text file and remove the
    intermediate image files.

    Input  = Target_dir: directory where the pdf files are saved.  If None, the current working
                         directory is used.
    Output = None.  For each pdf, a text file with the same base name is written to the directory.
    '''

    # Import Libraries
    import os

    # IMPORTANT:  wand needs MAGICK_HOME to point at the ImageMagick install.  A value already
    # present in the environment is respected; otherwise fall back to the path below, which you
    # must adjust to where ImageMagick is saved on your harddrive.
    os.environ.setdefault('MAGICK_HOME', '/usr/local/Cellar/imagemagick@6/6.9.9-31')

    # os.chdir(None) raises TypeError, so resolve a missing target to the current directory.
    if Target_dir is None:
        Target_dir = os.getcwd()

    # List the TARGET directory (not whatever the current directory happens to be).
    Dir_list = os.listdir(Target_dir)

    # Get List of pdf files (this also changes the working directory to Target_dir)
    List_pdf_files = get_list_pdf_Files(Target_dir, Dir_list)

    # Loop over list of pdf files:
    for pdf in List_pdf_files:
        try:
            # Convert the pdf to image file(s)
            convert_pdf_to_png(pdf)

            # Create the text file from the images
            create_textFile_from_images(pdf)
        finally:
            # Always clean up, even if a step failed, so leftover images from this pdf
            # cannot end up in the text of the next one.
            cleanUpDir_remove_png()

    return None

In [19]:
# TEST CODE

In [31]:
# Run the full pdf -> text conversion on the directory configured in the setup cell (Target_dir).
directory_pdf_file_conversion_2text_pipeline(Target_dir)