In [1]:
from PIL import Image 
import pytesseract 
import sys 
from pdf2image import convert_from_path 
import os 

In [2]:
# Tell pytesseract where the Tesseract executable lives.
# The TESSERACT_CMD environment variable, when set, overrides the default
# Windows install location below, so the notebook can run on machines where
# Tesseract is installed somewhere else without editing this cell.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)

In [3]:
# Path of the input PDF; a relative path, so it is resolved against the
# notebook's current working directory.
PDF_file = "bankstatementinput.pdf"

In [4]:
# Convert the PDF with pdf2image's convert_from_path and keep the result in
# `pages`; the next cell iterates over it and saves each item as a JPEG file.
pages = convert_from_path(PDF_file)

In [5]:
# Part 1 - Convert PDF into Images

# Next free page number. After the loop it is one past the number of pages
# saved, which is what the OCR step reads to work out how many images exist.
image_counter = 1

# enumerate() supplies the 1-based page number used in each file name:
# PDF page 1 -> page_1.jpg, ..., PDF page n -> page_n.jpg
for page_number, page in enumerate(pages, start=1):
    filename = "page_" + str(page_number) + ".jpg"

    # Write this page to disk as a JPEG
    page.save(filename, 'JPEG')

    # Keep the counter one ahead of the page that was just written
    image_counter = page_number + 1

In [6]:
  
# Part 2 - Recognizing text from the images using OCR

# Number of page images written by the conversion step
filelimit = image_counter - 1

# Text file that receives the OCR output of every page
outfile = "output_text.txt"

# Open in write mode inside a `with` block:
# - "w" starts from an empty file, so re-running this cell does not stack a
#   second copy of the document onto the previous run's text (append mode
#   did, which duplicated everything read back from this file later on);
# - the context manager closes the file even if OCR fails part-way through.
# All pages still end up in the same file because they share this one handle.
with open(outfile, "w") as text_out:

    # Page images are numbered 1..filelimit: page_1.jpg ... page_n.jpg
    for page_number in range(1, filelimit + 1):
        filename = "page_" + str(page_number) + ".jpg"

        # Recognize the text in the page image with pytesseract; the image
        # file is closed as soon as it has been read.
        with Image.open(filename) as page_image:
            text = pytesseract.image_to_string(page_image)

        # In many PDFs a word that does not fit at the end of a line is
        # hyphenated and continued on the next one, e.g. "GeeksF-\norGeeks".
        # Dropping every '-\n' joins the two halves back together.
        text = text.replace('-\n', '')

        # Add this page's processed text to the output file
        text_out.write(text)

In [7]:
# Read the OCR output back and flatten it onto a single line: newlines become
# spaces, matching the single-space separators used by the extraction regexes.
# The `with` block closes the file handle, which was previously left open.
with open('output_text.txt') as text_file:
    file_content = text_file.read().replace('\n', ' ')

In [8]:
import re
import nltk
from nltk.corpus import stopwords
# English stop words from NLTK's stopwords corpus; preprocess() filters
# whitespace-separated tokens against this list.
stop = stopwords.words('english')

In [9]:
def preprocess(document):
    """Turn raw text into POS-tagged sentences.

    Whitespace-separated tokens found in the module-level ``stop`` list are
    dropped (exact, case-sensitive comparison) and the remaining tokens are
    re-joined with single spaces. The result is split into sentences, and
    each sentence is word-tokenized and POS-tagged with NLTK.

    Returns a list with one entry per sentence, each entry being the output
    of ``nltk.pos_tag`` for that sentence's tokens.
    """
    kept_words = [word for word in document.split() if word not in stop]
    filtered_text = ' '.join(kept_words)

    tagged_sentences = []
    for sentence in nltk.sent_tokenize(filtered_text):
        tokens = nltk.word_tokenize(sentence)
        tagged_sentences.append(nltk.pos_tag(tokens))
    return tagged_sentences

In [10]:
def extract_name(document):
    """Return the person names written in upper case in ``document``.

    Upper-case runs are collected with a regex, joined with spaces and passed
    through ``preprocess``. Each tagged sentence is then chunked with
    ``nltk.ne_chunk``; every chunk that is an ``nltk.tree.Tree`` labelled
    ``'PERSON'`` contributes one name, built by joining the first element of
    each of its children with spaces.

    Returns a list of name strings (empty when nothing is recognised).
    """
    upper_case_runs = re.findall(r'([A-Z]+\s?[A-Z]+[^a-z0-9\W])', document)
    candidate_text = ' '.join(upper_case_runs)

    found_names = []
    for tagged_sentence in preprocess(candidate_text):
        for chunk in nltk.ne_chunk(tagged_sentence):
            # Keep only Tree chunks carrying the PERSON label
            is_person = type(chunk) == nltk.tree.Tree and chunk.label() == 'PERSON'
            if is_person:
                found_names.append(' '.join([leaf[0] for leaf in chunk]))
    return found_names

In [11]:
def account_number(file_content):
    """Return every account-number-like token found in ``file_content``.

    A match is 2-3 digits, a hyphen, 6-11 digits, a hyphen and 1-2 digits,
    for example ``12-145753-2``. Matches are returned as a list of strings
    in order of appearance; the list is empty when there are none.
    """
    return re.findall(r'\d{2,3}-\d{6,11}-\d{1,2}', file_content)

In [12]:
def extract_address(file_content, person_names=None):
    """Return the address text that follows the account holder's name.

    The name is looked up in ``file_content``; the text after its first
    occurrence (up to the next occurrence, if any) is searched for
    ``<anything> <UPPERCASE WORD> <6 digits>`` and the ``<anything>`` part
    of every match is returned.

    Parameters:
        file_content: the flattened document text.
        person_names: list of name strings to anchor on. When omitted, the
            module-level ``names`` variable is used, which keeps existing
            one-argument calls working. The strings are concatenated
            without a separator, exactly as before.
            NOTE(review): with more than one name the concatenation is
            unlikely to occur in the text - confirm the intended behaviour.

    Returns:
        A list of address strings. The list is empty when no name is
        available, when the name does not occur in the text, or when no
        address pattern follows it.
    """
    if person_names is None:
        # Backward-compatible default: the notebook-level result of extract_name()
        person_names = names

    holder = ''.join(person_names)
    if not holder:
        # No name to anchor on, so there is nothing to search after
        return []

    # re.escape: the name is literal text, so characters such as '.', '('
    # or '+' in it must not be interpreted as regex syntax.
    pieces = re.split(re.escape(holder), file_content)
    if len(pieces) < 2:
        # The name does not occur in the text (this used to raise IndexError)
        return []

    after_person_name = pieces[1]
    address = re.compile(r'(.*) [A-Z]+ [0-9]{6}')
    return address.findall(after_person_name)

In [13]:
def extract_statement_date(file_content):
    """Return the first ``D[D] Mon YYYY`` date in ``file_content``.

    The pattern is 1-2 digits, a space, three letters, a space and four
    digits (for example ``31 Aug 2018``).

    Returns:
        A one-element list holding the first match, or an empty list when
        the text contains no such date. Previously the no-match case raised
        IndexError; it now behaves like the other extractors, which return
        an empty list when nothing is found.
    """
    stat_date = re.compile(r'[0-9]{1,2} [a-zA-Z]{3} [0-9]{4}')
    # search() stops at the first hit instead of collecting every date
    first_match = stat_date.search(file_content)
    if first_match is None:
        return []
    return [first_match.group(0)]

In [14]:
def extract_dates(file_content):
    """Return every space-delimited ``D[D] Mon`` token in ``file_content``.

    A match is a space, 1-2 digits, a space, three letters and a trailing
    space; the surrounding spaces are part of each returned string. Matches
    do not overlap, so a date that starts on the space which ended the
    previous match is not reported.
    """
    return re.findall(r' [0-9]{1,2} [a-zA-Z]{3} ', file_content)

In [15]:
def extract_desc(file_content):
    """Return the description text of each transaction in ``file_content``.

    A transaction is recognised as `` D[D] Mon <description> <amount>``,
    where the amount is 2-4 digits, a dot and any number of further digits.
    Only the ``<description>`` part (the shortest text that fits) is
    returned, one string per match, in order of appearance.
    """
    pattern = r' [0-9]{1,2} [a-zA-Z]{3} (.*?) \d{2,4}\.\d*'
    return [hit.group(1) for hit in re.finditer(pattern, file_content)]

In [16]:
def extract_amount(file_content):
    """Return every amount-like token in ``file_content``.

    A match is one whitespace character, 2-4 digits, a dot and any number
    of further digits. The leading whitespace character is kept in each
    returned string. Matches are listed in order of appearance.
    """
    return [hit.group(0) for hit in re.finditer(r'\s\d{2,4}\.\d*', file_content)]

In [17]:
if __name__ == '__main__':
    # Run every extractor over the flattened OCR text. The results are bound
    # as module-level names, which the following cell prints.
    # Order matters: `names` must be assigned before extract_address() is
    # called, because that function reads the module-level `names` to find
    # where the address starts.
    names = extract_name(file_content)
    account = account_number(file_content)
    addresses = extract_address(file_content)
    stat_date = extract_statement_date(file_content)
    date_transact = extract_dates(file_content)
    desc_transact = extract_desc(file_content)
    amount = extract_amount(file_content)

In [18]:
# Show all extracted fields on one line, in the order they were computed.
print(names, account, addresses, stat_date, date_transact, desc_transact, amount)

['JOHN RIEGER'] ['12-145753-2'] ['  KTGIF SINGAPORE PTE. LTD. 26B TEMPLE STREET  #03-00 '] ['31 Aug 2018'] [' 28 Aug ', ' 30 Aug ', ' 30 Aug ', ' 31 Aug ', ' 31 Aug '] ['Quick Cheque Deposit', 'Point-of-Sale Transaction', 'Point-of-Sale Transaction', 'Point-of-Sale Transaction', 'Point-of-Sale Transaction'] [' 1254.12', ' 20.00', ' 465.00', ' 26.50', ' 16.00', ' 527.50']
