In [1]:
import json
import locale
import re
import shutil

import cv2
import numpy as np
import pandas as pd
import pytesseract
from fuzzywuzzy import fuzz
from PIL import Image
from pytesseract import Output



In [2]:
# Use a Tesseract binary found on PATH when there is one; otherwise fall back
# to the Windows install location this notebook was originally written for.
pytesseract.pytesseract.tesseract_cmd = (
    shutil.which('tesseract') or 'C:/Program Files/Tesseract-OCR/tesseract'
)

# locale.atof / locale.currency (used by find_max_currency) need a German
# locale. 'german' is the Windows spelling; the others are POSIX spellings.
for _locale_name in ('german', 'de_DE.UTF-8', 'de_DE.utf8', 'de_DE'):
    try:
        locale.setlocale(locale.LC_ALL, _locale_name)
        break
    except locale.Error:
        pass
else:
    raise locale.Error('no German locale is available on this system')

# Query (do not change) the active locale so the cell still displays it.
locale.setlocale(locale.LC_ALL)

'German_Germany.1252'

In [3]:
class Box:
    """A rectangular text region: position, size, corner pair and its text."""

    def __init__(self, x, y, w, h, text):
        self.x, self.y = x, y
        self.w, self.h = w, h
        # (top-left corner, bottom-right corner)
        top_left = (x, y)
        bottom_right = (x + w, y + h)
        self.coords = top_left, bottom_right
        self.text = text

    def draw_bounding_box(self, cv_img, color, x, y, w, h):
        """Draw a rectangle from (x, y) to (x + w, y + h) onto ``cv_img``.

        The geometry comes from the arguments, not from the instance.
        """
        top_left = (x, y)
        bottom_right = (x + w, y + h)
        cv2.rectangle(cv_img, top_left, bottom_right, color, 2)

    def draw_label(self, cv_img, color, label, x, y, w, h):
        """Write ``label`` onto ``cv_img`` near the point (x, y).

        The text is placed 15 px above ``y`` unless that would put it at
        y <= 15, in which case it goes 15 px below instead. ``w`` and ``h``
        are accepted but not used.
        """
        if y - 15 > 15:
            y_label = y - 15
        else:
            y_label = y + 15
        cv2.putText(
            img=cv_img,
            text=label,
            org=(x, y_label),
            fontFace=cv2.FONT_HERSHEY_DUPLEX,
            fontScale=0.8,
            color=color,
            lineType=2,
        )

In [4]:
class RegionOfInterest:
    """A labelled rectangle given by its row span (y, y2) and column span (x, x2)."""

    def __init__(self, y, y2, x, x2, label):
        # Rows first, then columns: the same order the crop slice uses.
        self.y, self.y2 = y, y2
        self.x, self.x2 = x, x2
        self.label = label

In [5]:
class Level:
    """Named values for the ``level`` column of the OCR data table.

    In the table recorded in this notebook, level 1 is a single row spanning
    the whole image and level 5 rows are the ones that carry word text
    (e.g. 'Thomas', 'GmbH'); get_boxes also reads text from level 5.
    NOTE(review): Tesseract's TSV levels are believed to be 1=page, 2=block,
    3=paragraph, 4=line, 5=word -- confirm against the Tesseract docs.
    """
    PAGE = 1
    BLOCK = 2
    PARAGRAPH = 3
    LINE = 4

    # Legacy names, values unchanged so existing comparisons keep working.
    # They sit one step off the hierarchy above: TEXTLINE == PARAGRAPH,
    # WORD == LINE, and SYMBOL (5) is the level whose rows hold word text.
    TEXTLINE = 3
    WORD = 4
    SYMBOL = 5

In [6]:
# Two views of the same invoice: an OpenCV array for drawing and a PIL image
# for OCR. The grayscale copy is built here but the OCR call below uses `im`.
cv_img = cv2.imread("./data/invoice.jpg")
gray_img = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
image = Image.fromarray(gray_img)
im = Image.open("./data/invoice.jpg")

# Run OCR with the German language model and tabulate the per-element output.
extracted_data = pytesseract.image_to_data(im, lang='deu', output_type=Output.DICT)
df = pd.DataFrame(extracted_data)

In [9]:

amount_boxes = len(df['level'])

# Outline every block-level row in green and echo its geometry.
for i in range(amount_boxes):
    if df['level'][i] != Level.BLOCK:
        continue
    x, y, w, h = (df[column][i] for column in ('left', 'top', 'width', 'height'))
    print(x, y, w, h)
    cv2.rectangle(cv_img, (x, y), (x + w, y + h), (0, 255, 0), 2)

cv2.imwrite('./data/img/boxes.jpg', cv_img)

# Preview the first rows of the OCR table.
df.head(20)

260 378 396 14
260 464 258 94
264 855 347 75
260 991 403 68
1033 217 351 23
1032 273 350 23
134 144 1385 113
1020 451 450 290
253 1101 1171 7
253 1309 1171 7
253 1415 1171 8
248 1163 1168 329
603 1858 305 87
260 1858 206 64
1032 1858 244 88


Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text
0,1,1,0,0,0,0,0,0,1653,2339,-1,
1,2,1,1,0,0,0,260,378,396,14,-1,
2,3,1,1,1,0,0,260,378,396,14,-1,
3,4,1,1,1,1,0,260,378,396,14,-1,
4,5,1,1,1,1,1,260,378,51,12,91,Thomas
5,5,1,1,1,1,2,316,378,39,11,93,GmbH
6,5,1,1,1,1,3,360,378,8,11,91,&
7,5,1,1,1,1,4,372,378,20,11,92,Co.
8,5,1,1,1,1,5,398,378,17,11,94,KG
9,5,1,1,1,1,6,419,384,4,2,93,-


In [386]:


def box_ranges(df):
    """Return one ``(start, end)`` index pair per level-2 row of ``df``.

    ``start`` is the index label of a level-2 row. ``end`` is the label of
    the next level-2 row, or the last label of ``df`` for the final one, so
    consecutive ranges share their boundary row.

    A frame with exactly one level-2 row yields a single range running to
    the end of the frame (this used to yield nothing); a frame with no
    level-2 row yields an empty list.
    """
    starts = df[df['level'] == 2].index.tolist()
    if not starts:
        return []

    ends = starts[1:] + [df.index[-1]]
    return list(zip(starts, ends))


In [388]:


def get_boxes(df, ranges):
    """Build one ``Box`` per level-2 row of ``df``.

    Level-2 rows are paired in order with the ``(start, end)`` entries of
    ``ranges``. Each box takes its geometry from its level-2 row and its
    text from the level-5 rows inside ``df.iloc[start:end + 1]``, joined
    with single spaces.
    """
    block_rows = df[df['level'] == 2]
    separator = ' '
    boxes = []

    for (_, block), (start, end) in zip(block_rows.iterrows(), ranges):
        geometry = tuple(block[key] for key in ('left', 'top', 'width', 'height'))
        # `end` is treated as inclusive, hence the +1 on the positional slice.
        segment = df.iloc[start:end + 1]
        words = [row['text'] for _, row in segment.iterrows() if row['level'] == 5]
        boxes.append(Box(*geometry, text=separator.join(words)))

    return boxes

In [389]:

def match_address(text):
    """Return every ``re.findall`` hit of ``patterns['address']`` in ``text``."""
    return re.compile(patterns['address']).findall(text)

def match_iban(text):
    """Return every ``re.findall`` hit of ``patterns['iban']`` in ``text``."""
    return re.compile(patterns['iban']).findall(text)

def match_bic(text):
    """Return every ``re.findall`` hit of ``patterns['bic']`` in ``text``."""
    return re.compile(patterns['bic']).findall(text)

def match_plz(text):
    """Return every ``re.findall`` hit of ``patterns['plz']`` in ``text``."""
    return re.compile(patterns['plz']).findall(text)

def find_max_currency(text):
    """Return the largest amount found in ``text`` as a currency string.

    Amounts are located with ``patterns['balance_due']``, parsed with
    ``locale.atof`` and the maximum is formatted with ``locale.currency``,
    so the result depends on the active locale.

    Returns ``None`` when ``text`` contains no amount (this used to raise
    ``UnboundLocalError``).
    """
    amounts = [locale.atof(match) for match in re.findall(patterns['balance_due'], text)]
    if not amounts:
        return None

    # NOTE(review): locale.currency is called without grouping, so a value of
    # 1000 or more is presumably formatted without thousands separators and
    # may not be found verbatim in the OCR text -- confirm before relying on it.
    return locale.currency(max(amounts))

# Read the city reference file; the names under its 'data' key are used for
# fuzzy city matching.
with open("./data/german-cities.json", encoding='utf-8') as json_file:
    german_cities = json.load(json_file)

city_list = [entry['name'] for entry in german_cities['data']]

# Regular expressions used by the match_* helpers and find_max_currency.
# All are raw strings so the backslash escapes reach the regex engine as-is.
patterns = {
    # Amount with '.' thousands separators and ',' decimals, e.g. 1.234,56
    'balance_due': r'\d{1,3}(?:[.]\d{3})*(?:[,]\d{2})',
    'iban': r'[A-Z]{2}(?:[ ]?)[0-9]{2}(?:[ ]?[0-9]{4}){4}(?!(?:[ ]?[0-9]){3})(?:[ ]?[0-9]{1,2})?',
    # Was a JavaScript regex literal ('/.../i'), whose slashes Python matches
    # literally. Now an 8- or 11-character upper-case token; the optional
    # group is non-capturing so findall returns the whole code. Deliberately
    # case-sensitive, otherwise any eight-letter word would match.
    'bic': r'\b[A-Z]{6}[A-Z0-9]{2}(?:[A-Z0-9]{3})?\b',
    'plz': r'[0-9]{5}',
    # '{L}' used to be matched literally; [^\W\d_] is the `re` spelling of
    # "any letter". (?m) lets ^ and $ anchor at each line of multi-line text.
    'address': r'(?m)^((?:[^\W\d_]| |\d|\.|-)+?) (\d+(?: ?- ?\d+)? *[a-zA-Z]?) (\d{5}) ((?:[^\W\d_]| |-)+)(?: *\(([^\)]+)\))?$',
}

# Split the OCR table into per-block index ranges, then build one Box each.
ranges = box_ranges(df=df)
boxes = get_boxes(df=df, ranges=ranges)

In [391]:
# Reload the invoice so the annotations are drawn on a clean image.
im = Image.open("./data/invoice.jpg")
cv_img = cv2.imread("./data/invoice.jpg")
gray_img = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
image = Image.fromarray(gray_img)

regions_of_interest = []
extracted_text = pytesseract.image_to_string(im, lang='deu')

# Document-level matches over the full OCR text.
iban = match_iban(extracted_text)
bic = match_bic(extracted_text)
balance_due = find_max_currency(extracted_text)
plz = match_plz(extracted_text)
adr = match_address(extracted_text)


LABELS = ["calculation", "Bank details", "recipient", 'invoice details']
# One random colour per label; no seed is set, so colours differ between runs.
COLORS = np.random.uniform(0, 255, size=(len(LABELS), 3))


def mark_region(canvas, regions, box, label_index):
    """Outline and label ``box`` on ``canvas`` and append its region to ``regions``."""
    color = COLORS[label_index]
    label = LABELS[label_index]
    box.draw_bounding_box(canvas, color, box.x, box.y, box.w, box.h)
    box.draw_label(canvas, color, label, box.x, box.y, box.w, box.h)
    regions.append(RegionOfInterest(box.y, box.y + box.h, box.x, box.x + box.w, label))


# Only the first postal code found in the document is used for the recipient.
first_plz = plz[0] if plz else None

for box in boxes:
    words = box.text.split(' ')

    # Guard against a missing total so the membership test cannot fail.
    if balance_due and balance_due in box.text:
        print('sum detected')
        mark_region(cv_img, regions_of_interest, box, 0)

    # Record the box once even if it contains several of the IBANs.
    if any(ib in box.text for ib in iban):
        print('iban detected')
        mark_region(cv_img, regions_of_interest, box, 1)

    # A box counts as the recipient when one of its words fuzzy-matches a
    # city name and it also contains the first postal code. It is recorded
    # at most once per box, however many city names match.
    recipient_marked = False
    for city in city_list:
        for word in words:
            if fuzz.ratio(city, word) > 80:
                print('city detected')
                if first_plz in words and not recipient_marked:
                    print('recipient detected')
                    mark_region(cv_img, regions_of_interest, box, 2)
                    recipient_marked = True

    if 'Rechnungsnr' in box.text:
        # This message used to say 'bank details', contradicting the label.
        print('invoice details detected')
        mark_region(cv_img, regions_of_interest, box, 3)

cv2.imwrite('./data/img/RegionsOfinterest.jpg', cv_img)


city detected
recipient detected
city detected
city detected
bank details detected
sum detected
iban detected
city detected
city detected
recipient detected


True

In [395]:
# Crop from a fresh copy so the saved regions carry no drawn annotations.
im = Image.open("./data/invoice.jpg")
cv_img = cv2.imread("./data/invoice.jpg")

gray_img = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
image = Image.fromarray(gray_img)

# Several regions can share a label (e.g. two 'recipient' regions). The first
# keeps the plain '<label>.jpg' name; later ones get a numeric suffix so they
# no longer overwrite each other.
label_counts = {}
for roi in regions_of_interest:
    crop_img = cv_img[roi.y:roi.y2, roi.x:roi.x2]
    occurrence = label_counts.get(roi.label, 0) + 1
    label_counts[roi.label] = occurrence
    name = roi.label if occurrence == 1 else '{}_{}'.format(roi.label, occurrence)
    cv2.imwrite('./data/img/RegionsOfInterest/{name}.jpg'.format(name=name), crop_img)


378 392 260 656
recipient
451 741 1020 1470
invoice details
1163 1492 248 1416
calculation
1858 1945 603 908
Bank details
1858 1922 260 466
recipient
