In [5]:
from PIL import Image, ImageOps
import numpy as np
import time
from copy import deepcopy
from difflib import SequenceMatcher
from paddleocr import PaddleOCR
# Module-level OCR engine: built once here and reused by run_ocr_query through
# ocr_model.ocr(...). Configured with lang='en' and show_log=False.
# NOTE(review): use_angle_cls=True presumably loads the text-angle classifier that
# the later `cls=True` call relies on — confirm against the PaddleOCR docs.
ocr_model = PaddleOCR(lang='en',use_angle_cls=True,show_log=False)

In [1750]:
def run_ocr_query(image):
    """Run OCR on an invoice image and return the extracted fields as a dict.

    The OCR entries are split into a "table" part (from the first entry matching
    'Description'/'Particular'/'Item' up to, but excluding, the first later entry
    matching 'Total') and the remaining entries.

    Args:
        image: a PIL image; EXIF orientation is applied before OCR.

    Returns:
        dict: the key/label pairs produced by ``organise_data`` for the non-table
        entries and, when a table header was found, an additional
        ``"table_data"`` entry of the form
        ``{"header": [text, ...], "row_data": [[text or None, ...], ...]}``.
        When no table is found, "ERROR: Cannot find table" is printed and the
        result contains no ``"table_data"`` key (previously this path raised
        NameError because the result dict was never created).
    """
    table_keywords = ['Description','Particular','Item']

    image = ImageOps.exif_transpose(image)
    img = np.asarray(image)
    # NOTE(review): PaddleOCR is documented to return None for a page without any
    # detected text; fall back to an empty list so the helpers can iterate it.
    raw = ocr_model.ocr(img, cls=True)[0] or []

    # Always start from a dict so the "no table" path can still return other data.
    extracted_data = {}

    unorg_header = query_word_horizontal(raw,table_keywords,single=False,get_after=False)
    header = check_merge_vertical(unorg_header)

    startpoint = query_word_single(raw,table_keywords,instance_from_front=True)
    if not startpoint:
        print("ERROR: Cannot find table")
        unorg_data = raw[:]
    else:
        start_idx = raw.index(startpoint)
        # Look for the closing 'Total' only after the header entries.
        endpoint = query_word_single(raw[start_idx+len(header):],['Total'],instance_from_front=True)
        if endpoint:
            end_idx = raw.index(endpoint)
            table_data = raw[start_idx:end_idx]
            unorg_data = raw[:start_idx]+raw[end_idx:]
        else:
            table_data = raw[start_idx:]
            unorg_data = raw[:start_idx]

        # Swap the raw header fragments for the merged header cells, placed first.
        for fragment in unorg_header:
            if fragment in table_data:
                table_data.remove(fragment)
        table_data = list(header) + table_data
        # NOTE(review): table_data is wrapped in a one-element list, so this call
        # sorts a single item and hands table_data back unchanged. Kept as-is
        # because passing table_data directly would change the ordering behaviour.
        table_data = order_by_tbyx([table_data])[0]

        column = query_word_vertical(table_data,table_keywords,single=False,get_after=True)
        org_table_data = get_table_data(table_data,header,column)
        combined_table_data = combine_extra_terms(org_table_data)

        # '-' marks an empty cell; expose it as None and keep only the text otherwise.
        def _cell_text(cell):
            return None if cell == '-' else cell[1][0]

        header_row = combined_table_data[0] if combined_table_data else []
        row_data = [[_cell_text(cell) for cell in row] for row in combined_table_data[1:]]
        extracted_data["table_data"] = {
            "header": [_cell_text(cell) for cell in header_row],
            "row_data": row_data,
        }

    other_data = organise_data(unorg_data)
    extracted_data.update(other_data)
    return extracted_data

In [1730]:
def order_by_tbyx(ocr_info, line_tolerance=20):
    """Order entries top-to-bottom, then left-to-right within the same line.

    Each entry ``r`` is read through ``r[0][0]`` and ``r[0][1]``; the sort key is
    ``(r[0][1], r[0][0])``, i.e. ``r[0][1]`` is treated as the vertical position
    and ``r[0][0]`` as the horizontal one.

    Args:
        ocr_info: sequence of entries indexable as described above.
        line_tolerance: two neighbouring entries whose ``r[0][1]`` values differ by
            less than this are considered to be on the same line (default 20, the
            previously hard-coded value).

    Returns:
        A new list with the entries in reading order. The input is not modified.
    """
    res = sorted(ocr_info, key=lambda r: (r[0][1], r[0][0]))
    for i in range(len(res) - 1):
        # Bubble entry i+1 towards the front while it sits on the same line as its
        # left neighbour but starts further left. The scan must include j == 0,
        # otherwise the first two entries are never compared.
        for j in range(i, -1, -1):
            same_line = abs(res[j + 1][0][1] - res[j][0][1]) < line_tolerance
            if same_line and res[j + 1][0][0] < res[j][0][0]:
                res[j], res[j + 1] = res[j + 1], res[j]
            else:
                break
    return res

In [1731]:
def query_word_single(data,words,instance_from_front):
    """Return a copy of the first OCR entry whose text matches one of `words`.

    Words are tried in the order given; for each word the entries are scanned
    front-to-back, or back-to-front when `instance_from_front` is false. A match
    is a case-insensitive substring hit or a SequenceMatcher ratio above 0.7.

    Returns:
        The matching entry (taken from a deep copy of `data`), or None when
        nothing matches.
    """
    candidates = deepcopy(data)
    if not instance_from_front:
        candidates.reverse()

    for word in words:
        needle = word.casefold()
        for entry in candidates:
            text = entry[1][0].casefold()
            if needle in text:
                return entry
            if SequenceMatcher(None, needle, text).ratio() > 0.7:
                return entry
    return None

In [1732]:
def query_word_vertical(data,words,single,get_after,absolute=False):
    """Collect the entries that share a horizontal span with a keyword entry.

    For every word, the first entry of `data` matching it becomes an anchor
    (exact string equality when `absolute` is true, otherwise case-insensitive
    substring or SequenceMatcher ratio above 0.7). Anchors are processed in order
    and the first anchor that yields any output wins.

    Args:
        data: OCR entries of the form [corner_points, (text, ...)].
        words: keywords to look for, in priority order.
        single: stop scanning after the first entry that overlaps the anchor in x
            (whether or not that entry was kept).
        get_after: keep only overlapping entries whose top edge is at or below the
            anchor's top edge.
        absolute: require an exact text match for the anchor.

    Returns:
        List of entries from `data` (not copies), in `data` order.
    """
    def _is_match(word, text):
        if absolute:
            return word == text
        folded_word = word.casefold()
        folded_text = text.casefold()
        return folded_word in folded_text or SequenceMatcher(None, folded_word, folded_text).ratio() > 0.7

    # One anchor per keyword: the first entry that matches it.
    anchors = []
    for word in words:
        hit = next((entry for entry in data if _is_match(word, entry[1][0])), None)
        if hit is not None:
            anchors.append(hit)

    output = []
    for anchor in anchors:
        anchor_xs = [pt[0] for pt in anchor[0]]
        anchor_left, anchor_right = min(anchor_xs), max(anchor_xs)
        anchor_top = min(pt[1] for pt in anchor[0])

        for entry in data:
            entry_xs = [pt[0] for pt in entry[0]]
            # Skip entries lying entirely to the left or right of the anchor.
            if max(entry_xs) < anchor_left or min(entry_xs) > anchor_right:
                continue
            if not get_after or min(pt[1] for pt in entry[0]) >= anchor_top:
                output.append(entry)
            if single:
                break

        if output:
            break
    return output

In [1733]:
def query_word_horizontal(data,words,single,get_after,absolute=False):
    """Collect the entries that share a vertical span with a keyword entry.

    For every word, the first entry of `data` matching it becomes an anchor
    (exact string equality when `absolute` is true, otherwise case-insensitive
    substring or SequenceMatcher ratio above 0.7). Anchors are processed in order
    and the first anchor that yields any output wins.

    Args:
        data: OCR entries of the form [corner_points, (text, ...)].
        words: keywords to look for, in priority order.
        single: stop scanning after the first entry that overlaps the anchor in y
            (whether or not that entry was kept).
        get_after: keep only overlapping entries whose left edge is at or to the
            right of the anchor's left edge.
        absolute: require an exact text match for the anchor.

    Returns:
        List of entries from `data` (not copies), in `data` order.
    """
    anchors = []
    for word in words:
        for entry in data:
            text = entry[1][0]
            if absolute:
                matched = word == text
            else:
                folded_word, folded_text = word.casefold(), text.casefold()
                matched = folded_word in folded_text or SequenceMatcher(None, folded_word, folded_text).ratio() > 0.7
            if matched:
                # Only the first match per keyword is used as an anchor.
                anchors.append(entry)
                break

    output = []
    for anchor in anchors:
        anchor_ys = [pt[1] for pt in anchor[0]]
        anchor_top, anchor_bottom = min(anchor_ys), max(anchor_ys)
        anchor_left = min(pt[0] for pt in anchor[0])

        for entry in data:
            entry_ys = [pt[1] for pt in entry[0]]
            # Skip entries lying entirely below or entirely above the anchor.
            if anchor_bottom < min(entry_ys) or anchor_top > max(entry_ys):
                continue
            if not get_after or min(pt[0] for pt in entry[0]) >= anchor_left:
                output.append(entry)
            if single:
                break

        if output:
            break
    return output

In [1734]:
def merge_words(first,second):
    """Fuse two OCR entries into one.

    The result's box is the axis-aligned rectangle enclosing both input boxes,
    listed as top-left, top-right, bottom-right, bottom-left. Its text is
    ``first`` then ``second`` joined by a single space, and its score is the mean
    of the two scores.

    Returns:
        [corner_points, (text, score)]
    """
    xs = [pt[0] for pt in first[0]] + [pt[0] for pt in second[0]]
    ys = [pt[1] for pt in first[0]] + [pt[1] for pt in second[0]]
    left, right = min(xs), max(xs)
    top, bottom = min(ys), max(ys)

    corners = [[left, top], [right, top], [right, bottom], [left, bottom]]
    text = first[1][0] + ' ' + second[1][0]
    score = (first[1][1] + second[1][1]) * 0.5
    return [corners, (text, score)]

In [1735]:
def check_merge_vertical(data):
    """Merge entries whose horizontal extents overlap, until none are left to merge.

    Works on a deep copy of `data`. Whenever two non-equal entries overlap in x
    (neither lies entirely to the left or right of the other), both are removed
    and replaced by ``merge_words`` of the pair, with the entry that has the
    smaller average y passed first. The scan repeats until a full pass makes no
    merge.

    Returns:
        The remaining entries sorted by the x coordinate of their first corner.
    """
    new_data = deepcopy(data)
    merge_occured = True
    while merge_occured:
        merge_occured = False
        # NOTE(review): new_data is mutated (remove/append) while both loops below
        # iterate over it, and the `break` further down only leaves the inner loop,
        # so the outer loop keeps walking the modified list. The merge order
        # depends on this traversal.
        for i in new_data:
            i_xmax = max([j[0] for j in i[0]])
            i_xmin = min([j[0] for j in i[0]])
            # Average y of the corner points; the /4.0 assumes four corners — TODO confirm.
            i_yave = sum([j[1] for j in i[0]])/4.0
            for j in new_data:
                j_xmax = max([k[0] for k in j[0]])
                j_xmin = min([k[0] for k in j[0]])
                j_yave = sum([k[1] for k in j[0]])/4.0
                # Distinct entries (compared by value) whose x ranges overlap.
                if i!=j and not (i_xmax<j_xmin and i_xmin<j_xmin) and not (i_xmax>j_xmax and i_xmin>j_xmax):
                    # Iterate over a reversed snapshot so removing is safe here;
                    # remove() drops the first element equal to k.
                    for k in new_data[::-1]:
                        if k == i or k == j:
                            new_data.remove(k)
                    # The entry with the smaller average y comes first in the merged text.
                    if i_yave>j_yave:
                        new_entry = merge_words(j,i)
                    else:
                        new_entry = merge_words(i,j)
                    new_data.append(new_entry)
                    merge_occured = True
                    break
    return sorted(new_data,key= lambda x:x[0][0][0])

In [1736]:
def check_merge_horizontal(data):
    """Merge entries whose average y positions are close to each other.

    The closeness threshold is derived from `data` itself: the absolute
    differences between the average y of consecutive entries are collected, values
    outside the 1%-99% quantile band are dropped repeatedly while the standard
    deviation exceeds 10, and the maximum of what remains is the threshold.
    Entries at index 0 of the working list are never merged.

    Returns:
        The remaining entries sorted by the y coordinate of their first corner.
    """
    new_data = deepcopy(data)
    # Average y of each entry's corner points, in input order (the /4.0 assumes
    # four corners — TODO confirm).
    midpoint_y = []
    for i in data:
        midpoint_y.append(sum([j[1] for j in i[0]])/4.0)
    # Gap between each pair of consecutive entries.
    org_arr_of_differences = []
    for i in range(len(midpoint_y)-1):
        org_arr_of_differences.append(abs(midpoint_y[i+1]-midpoint_y[i]))
    org_arr_of_differences = np.array(org_arr_of_differences)
    arr_of_differences = org_arr_of_differences[:]

    # Trim extreme gaps until the remaining ones are reasonably uniform.
    # NOTE(review): both comparisons are strict, so a short array can be emptied
    # here; `max(arr_of_differences)` below would then presumably fail — confirm
    # how inputs with very few entries should be handled.
    while (arr_of_differences.std()>10):
        arr_of_differences = arr_of_differences[(arr_of_differences<np.quantile(arr_of_differences,0.99)) & (arr_of_differences>np.quantile(arr_of_differences,0.01))]

    merge_occured = True
    while merge_occured:
        merge_occured = False
        # NOTE(review): new_data is mutated while both loops iterate over it, and
        # the `break` below only leaves the inner loop.
        for n,i in enumerate(new_data):
            i_yave = sum([j[1] for j in i[0]])/4.0
            for m,j in enumerate(new_data):
                j_yave = sum([k[1] for k in j[0]])/4.0
                # Distinct entries within the threshold; position 0 is excluded on both sides.
                if i!=j and abs(i_yave-j_yave)<=max(arr_of_differences) and n != 0 and m!=0:
                    # Remove both originals (iterating a reversed snapshot).
                    for k in new_data[::-1]:
                        if k == i or k == j:
                            new_data.remove(k)
                    # The entry with the smaller average y comes first in the merged text.
                    if i_yave>j_yave:
                        new_entry = merge_words(j,i)
                    else:
                        new_entry = merge_words(i,j)
                    new_data.append(new_entry)
                    merge_occured = True
                    break
    return sorted(new_data,key= lambda x:x[0][0][1])

In [1737]:
def get_table_data(data,header,column):
    """Arrange OCR entries into a grid of rows (from `column`) by columns (from `header`).

    An entry lands in cell (row, col) when its box strictly overlaps the x range
    of header entry `col` and the y range of column entry `row`. Each entry is
    consumed by the first cell that claims it; a cell that receives several
    entries holds their ``merge_words`` fusion, upper entry first. Empty cells
    hold '-', and rows consisting only of '-' are dropped.

    Args:
        data: OCR entries to distribute (a deep copy is used; `data` is untouched).
        header: one entry per table column.
        column: one entry per table row.

    Returns:
        List of rows, each a list with ``len(header)`` cells.
    """
    def _span(points, axis):
        values = [pt[axis] for pt in points]
        return min(values), max(values)

    remaining = deepcopy(data)
    grid = [['-' for _ in header] for _ in column]

    for col_idx, head in enumerate(header):
        col_left, col_right = _span(head[0], 0)
        for row_idx, anchor in enumerate(column):
            row_top, row_bottom = _span(anchor[0], 1)

            # Walk a reversed snapshot so entries can be removed from `remaining`.
            for cand in list(reversed(remaining)):
                cand_left, cand_right = _span(cand[0], 0)
                cand_top, cand_bottom = _span(cand[0], 1)

                overlaps_column = cand_left < col_right and cand_right > col_left
                overlaps_row = cand_top < row_bottom and cand_bottom > row_top
                if not (overlaps_column and overlaps_row):
                    continue

                current = grid[row_idx][col_idx]
                if current == '-':
                    grid[row_idx][col_idx] = cand
                else:
                    # Keep the entry that ends higher on the page first in the text.
                    current_bottom = max(pt[1] for pt in current[0])
                    if current_bottom < cand_bottom:
                        grid[row_idx][col_idx] = merge_words(current, cand)
                    else:
                        grid[row_idx][col_idx] = merge_words(cand, current)
                remaining.remove(cand)

    return [row for row in grid if not all(cell == '-' for cell in row)]

In [1738]:
def combine_extra_terms(data):
    """Fold rows that sit unusually close to the previous row back into it.

    Works on a deep copy of `data` (a grid of OCR entries with '-' for empty
    cells). While the standard deviation of the gaps between consecutive row
    centres exceeds 10, the row after the smallest gap is treated as spill-over:
    the closest pair of cells between it and the row above is fused with
    ``merge_words`` (upper cell first), the fusion replaces the first cell of the
    pair, and the second becomes '-'. Rows left with only '-' are dropped. The
    loop also stops once every row has more than one filled cell.

    Returns:
        The adjusted grid.
    """
    def _row_spacing(rows):
        # Mean centre-y of each row's filled cells, then gaps between neighbours.
        centres = []
        for row in rows:
            cell_centres = [sum(pt[1] for pt in cell[0])/4.0 for cell in row if cell != '-']
            centres.append(sum(cell_centres)/len(cell_centres))
        gaps = [centres[idx+1]-centres[idx] for idx in range(len(centres)-1)]
        return gaps, np.array(gaps).std()

    rows = deepcopy(data)
    gaps, spread = _row_spacing(rows)

    while spread > 10:
        lower_idx = gaps.index(min(gaps)) + 1

        # Find the closest pair of cell centres between the two rows.
        best_dist = 1000000
        pair = []
        for lower_cell in rows[lower_idx]:
            if lower_cell == '-':
                continue
            lower_x = sum(pt[0] for pt in lower_cell[0])/4.0
            lower_y = sum(pt[1] for pt in lower_cell[0])/4.0
            for upper_cell in rows[lower_idx-1]:
                if upper_cell == '-':
                    continue
                upper_x = sum(pt[0] for pt in upper_cell[0])/4.0
                upper_y = sum(pt[1] for pt in upper_cell[0])/4.0

                dist = ((lower_x-upper_x)**2 + (lower_y-upper_y)**2)**0.5
                if dist < best_dist:
                    best_dist = dist
                    # Order the pair so the cell higher on the page comes first.
                    pair = [lower_cell, upper_cell] if lower_y < upper_y else [upper_cell, lower_cell]

        merged = merge_words(pair[0], pair[1])
        for row in rows:
            for col_idx, cell in enumerate(row):
                if cell == pair[0]:
                    row[col_idx] = merged
                elif cell == pair[1]:
                    row[col_idx] = '-'

        rows = [row for row in rows if not all(cell == '-' for cell in row)]

        gaps, spread = _row_spacing(rows)

        if all(sum(1 for cell in row if cell != '-') > 1 for row in rows):
            break

    return rows

In [1739]:
def organise_data(data):
    """Pair every digit-bearing OCR entry with the label printed to its left.

    An entry whose text contains an ASCII digit is treated as a value. Its label
    is the text of the closest entry on the same line: among the other entries
    that overlap it vertically and start at or to the left of its left edge, the
    one whose first corner has the largest x. Values without such a neighbour are
    collected, space-separated, under the key 'uncategorizable Data'.

    Only this same-line association contributes to the result. A vertical
    (label-above) association used to be computed as well but never reached the
    returned dict, so it is no longer computed.

    Args:
        data: OCR entries of the form [corner_points, (text, ...)].

    Returns:
        dict mapping label text -> value text. If several values resolve to the
        same label, the last one wins. If the same value text occurs more than
        once, its labels are concatenated, most recent first.
    """
    key_data = [entry for entry in data if any(ch in '1234567890' for ch in entry[1][0])]
    # value text -> label text ('' until a label is found)
    labels = {entry[1][0]: '' for entry in key_data}

    for entry in key_data:
        entry_ys = [pt[1] for pt in entry[0]]
        top, bottom = min(entry_ys), max(entry_ys)
        left = min(pt[0] for pt in entry[0])

        candidates = []
        for other in data:
            if other == entry:
                continue
            other_ys = [pt[1] for pt in other[0]]
            # Must share some vertical extent with the value...
            if max(other_ys) < top or min(other_ys) > bottom:
                continue
            # ...and start at or before the value's left edge.
            if min(pt[0] for pt in other[0]) <= left:
                candidates.append(other)

        if candidates:
            closest = sorted(candidates, key=lambda item: item[0][0][0])[-1]
            text = entry[1][0]
            # Prepend only when this value already has a label. The old check
            # looked at the size of the whole dict, which is never empty here, so
            # every label ended up with a trailing space.
            if labels[text]:
                labels[text] = closest[1][0] + " " + labels[text]
            else:
                labels[text] = closest[1][0]

    output = {}
    for value, label in labels.items():
        if label == '':
            if 'uncategorizable Data' in output:
                output['uncategorizable Data'] = output['uncategorizable Data'] + ' ' + value
            else:
                output['uncategorizable Data'] = value
        else:
            output[label] = value
    return output

In [1751]:
# Driver cell: pick one sample invoice and run the full extraction on it.
# The commented-out paths are other samples; the trailing notes are the author's
# observations about how each one behaved.
# img_path = '../data/R-4-1.jpg' # Row data merged together
img_path = '../data/R-4-17.jpg' 
# img_path = '../data/R-4-24.jpg' # Slanted
# img_path = '../data/R-4-34.jpg' # Header shifted
# img_path = '../data/R-4-35.jpg' 
# img_path = '../data/R-4-37.jpg' 
# img_path = '../data/R-4-39.jpg' 
# img_path = '../data/R-4-43.jpg' 
# img_path = '../data/R-4-44.jpg' 
# img_path = '../data/R-4-46.jpg' # All data became header for some reason; data combiner too aggressive
# img_path = '../data/R-4-47.jpg' # Number got cancelled out on invoice
# img_path = '../data/R-4-48.jpg'
# img_path = '../data/R-4-50.jpg' 
# img_path = '../data/R-4-56.jpg' # A lot of data became header; data combiner too aggressive

# Load the image as RGB, then time only the run_ocr_query call (image loading is
# outside the measured interval).
image = Image.open(img_path).convert("RGB")
start_time = time.time()
output = run_ocr_query(image)
print("Time Taken: %s seconds" % (time.time() - start_time))

Time Taken: 19.903754234313965 seconds


In [1752]:
# Display the dict returned by run_ocr_query as this cell's output.
output

{'table_data': {'header': ['NO',
   'DESCRIPTION/PACKING',
   'QUANTITY',
   'UNIT PRICE',
   'DISC',
   'NETT'],
  'row_data': [[None,
    'DELMONTE FIESTA TROPICALFRUIT MIX(SS2SI',
    '1CTN',
    '33.60',
    None,
    '33.60'],
   [None,
    'DEL MONTE FIESTA TROPICAL FRUIT MIX(5S251-FOC',
    '1PCS',
    None,
    None,
    '0.00'],
   [None, "LIBBY'S CORNED BEEF REGULAR", '12PCS', None, None, '45.36'],
   [None,
    'DEL MONTE PRUNE JUICE32OZ(HVI728001)',
    '1CTN',
    '61.32',
    None,
    '61.32'],
   [None,
    'DELMONTEPRUNEJUICE32OZ(HVI728001-FOC 12X320Z',
    '1PCS',
    None,
    None,
    '0.00']]},
 'uncategorizable Data': 'X(6567597303 &0E',
 'COMPANY REG NO:199102401H ': 'GST REG NOM2-0099779-4',
 'GST REG NOM2-0099779-4 ': 'COMPANY REG NO:199102401H',
 'INVOICE NO ': 'M210.030',
 '#01-79YISHUN RING ROAD ': 'BLK103',
 'IMONTH ': '07Auaust 2021',
 'SINGAPORE 760103 SINGAPORE 760103 ': '#01-79YISHUN RING ROAD',
 '#01-79YISHUN RING ROAD #01-79YISHUN RING ROAD ': 'SINGA