In [4]:
%run -i "ocr_query_merger.ipynb"

In [5]:
import import_ipynb
from difflib import SequenceMatcher
from faker import Faker
import random
import faker_commerce
from PIL import Image, ImageFont, ImageDraw, ImageFilter
import import_ipynb
import time
fake = Faker()
fake.add_provider(faker_commerce.Provider)

In [6]:
def table_data_generator(headers,max_entries,max_code_len,units,code_type):
    table = []
    number_of_rows = random.randint(1,max_entries)
    for i in range(number_of_rows):
        row_entry = []
        for j in headers:
            if j == 'index':
                row_entry.append(i+1)
            elif j == 'code':
                if code_type == 'None':
                    row_entry.append(fake.password(length=max_code_len,special_chars=False,upper_case=True))
                elif code_type == 'int':
                    row_entry.append(fake.random_int(min=10**max_code_len, max=10**(max_code_len+1)-1))
                elif code_type == 'special56':
                    row_entry.append(fake.pystr_format('?-??-###-???','ABCDEFGHIJKLMNOPQRSTUVWXYZ'))
            elif j == 'desc':
                row_entry.append(fake.ecommerce_name()) 
            elif j == 'unit':
                if units:
                    row_entry.append(str(fake.random_int(min=1, max=100))+" PCS")
                else:
                    row_entry.append(fake.random_int(min=1, max=100))
            elif j == 'float':
                row_entry.append(fake.pyfloat(right_digits = 2,positive=True, min_value=0.1, max_value=100))
        table.append(row_entry)
    return table

In [10]:
def table_image_generator(data):
    table = table_data_generator(headers=data['headers'],max_entries=data['max_entries'],max_code_len=data['max_code_len'],units=data['units'],code_type=data['code_type'])
    T_table = [[j[i] for j in table] for i in range(len(table[0]))]
    font = ImageFont.truetype(font = 'arial',size = data['textsize'])
    image = Image.open(data['template'])
    img = ImageDraw.Draw(image)
    line_spacing = 10
    
    for i in range(len(T_table)):
        position = data['position'][i]
        for j in T_table[i]:
            string = str(j)
            if len(string)>data['max_text_len']:
                string = string[:20:]
            img.text(position,string,font=font,fill=(0, 0, 0))
            position = (position[0],position[1]+data['spacing'])
    
#     rotation_angle = random.uniform(-2,2)
    rotation_angle = 0
    rotation_center = [random.uniform(image.size[0],image.size[1]) for _ in range(2)]
#     edge_enhance_level = random.randint(0,2)
    edge_enhance_level = 0
    for i in range(edge_enhance_level):
        image = image.filter(ImageFilter.EDGE_ENHANCE)
    image = image.rotate(rotation_angle,center = rotation_center,expand=True)
    image.show()
    return {'image':image,'rot_angle':rotation_angle,'enhance_level':edge_enhance_level,'test_data':table}

In [24]:
def run_test(iterations):
    R_4_17_headers = ['index','desc','unit','float','float']
    R_4_17_position = [(15,460),(120,460),(754,460),(930,460),(1222,460)]
    R_4_17_template = '../templates/R-4-17.jpg'
    R_4_17 = {'headers':R_4_17_headers,'position':R_4_17_position,'template':R_4_17_template,'textsize':25,'spacing':63,'max_text_len':13,'code_type':'None','max_code_len':8,'max_entries':12,'units':True}

    R_4_43_headers = ['code','desc','unit','float','float','float']
    R_4_43_position = [(137,915),(268,915),(1074,920),(1225,915),(1432,915),(1565,915)]
    R_4_43_template = '../templates/R-4-43.jpg'
    R_4_43 = {'headers':R_4_43_headers,'position':R_4_43_position,'template':R_4_43_template,'textsize':30,'spacing':35,'max_code_len':5,'code_type':'None','max_text_len':13,'max_entries':15,'units':True}

    R_4_46_headers = ['index','code','desc','unit','float','float']
    R_4_46_position = [(120,690),(190,690),(417,690),(1082,690),(1175,690),(1340,690)]
    R_4_46_template = '../templates/R-4-46.jpg'
    R_4_46 = {'headers':R_4_46_headers,'position':R_4_46_position,'template':R_4_46_template,'textsize':24,'spacing':40,'max_code_len':13,'code_type':'int','max_text_len':13,'max_entries':15,'units':False}

    R_4_47_headers = ['index','desc','unit','float','float']
    R_4_47_position = [(198,840),(269,840),(1050,840),(1192,840),(1500,840)]
    R_4_47_template = '../templates/R-4-47.jpg'
    R_4_47 = {'headers':R_4_47_headers,'position':R_4_47_position,'template':R_4_47_template,'textsize':24,'spacing':61,'max_code_len':13,'code_type':'int','max_text_len':13,'max_entries':12,'units':True}

    R_4_56_headers = ['index','code','desc','unit','float','float']
    R_4_56_position = [(53,486),(97,486),(348,486),(1073,486),(1293,486),(1550,486)]
    R_4_56_template = '../templates/R-4-56.jpg'
    R_4_56 = {'headers':R_4_56_headers,'position':R_4_56_position,'template':R_4_56_template,'textsize':26,'spacing':40,'max_code_len':13,'code_type':'special56','max_text_len':13,'max_entries':20,'units':True}
    
    template_options = [R_4_17,R_4_43,R_4_46,R_4_47,R_4_56]
    result_str = []
    
    for i in range(iterations):
        start_time = time.time()
        test = table_image_generator(template_options[random.randint(0,len(template_options)-1)])
        results = run_ocr_merger(test['image'],type_of_document='invoice')
        print("Test ",i+1," Parameters")
        print("Rotation Angle: ",test['rot_angle'],"\tEnhance Level: ", test['enhance_level'])
        print("Results")
        if results:
            table_found = 1
            count = 0
            similarity = 0
            history=[]
            for j in np.array(test['test_data']).flatten().tolist():
                for k in np.array(results['table_data']['row_data']).flatten().tolist()[:len(np.array(test['test_data']).flatten().tolist()):]:
                    if k!=None and k not in history and (str(j).casefold() in str(k).casefold() or str(k).casefold() in str(j).casefold() or SequenceMatcher(None, str(j).casefold(), str(k).casefold()).ratio()>0.6):
                        similarity+=1
                        history.append(k)
                        break
            similarity = 100*similarity/len(np.array(test['test_data']).flatten().tolist())
            
            print("% Similarity: ",similarity,"%",end='\t')
            count = 0
            accuracy = 0
            for row in range(len(test['test_data'])):
                for col in range(len(test['test_data'][0])):
                    if len(results['table_data']['row_data'])-1>row and len(results['table_data']['row_data'][row])-1>col: 
                        count+=1
                        if test['test_data'][row][col] == results['table_data']['row_data'][row][col]:
                            accuracy+=1
            accuracy = 100*accuracy/count
            print("% Accuracy: ",accuracy,"%")
        else:
            table_found = 0
            similarity = 0
            accuracy = 0
            print("Error. Table not found.")
        print("Time Taken: %s seconds" % (time.time() - start_time))
        print("-----------------------------------------------------")
        print([print(k) for k in results['table_data']['row_data']])
        print()
        result_str.append(f"{i},{test['rot_angle']},{test['enhance_level']},{table_found},{similarity},{accuracy},{time.time() - start_time}")
        if len(result_str)>2 or i == iterations-1:
            try:
                open("test_log.csv", "r")
            except:
                f = open("test_log.csv", "a")
                f.write("Test Number,Rotation Angle,Enhance Level, Table Found,Accuracy Percentage,Time Taken\n")
            else:
                f = open("test_log.csv", "a")
            for j in result_str:
                f.write(j)
                f.write('\n')
            f.close()
            result_str = []
        

In [27]:
run_test(1)

Test  1  Parameters
Rotation Angle:  0 	Enhance Level:  0
Results
% Similarity:  64.0 %	% Accuracy:  0.0 %
Time Taken: 64.18804955482483 seconds
-----------------------------------------------------
['Generic Car', '2 PCS', '91.58', None, '81.29']
[None, '67 PCS', '78.3', None, '3.78']
[None, '69 PCS', '98.82', None, '90.75']
[None, '49 PCS', '29.91', None, '23.28']
[None, '65 PCS', '64.6', None, '72.55']
[None, None, 'Goods Totat', None, '140.28']
['Details for payment vin hank transer', None, 'Less Discount Gocs Amount', '%', '140.28']
['Bank Name United Overseas Bank LimitedA/C Name :Naspac Marketing Pe Ltd', None, '7%GST', None, None]
[None, None, 'atalincl GST', None, '150181']
['neweccn', None, None, None, 'Enof Dxament']
['RECEIVEYGOODS INGORDERAND CONDITION All cheques should be', None, None, 'ISSUED/APPROVED', None]
['crossed and made payable to', 'Kim Eng Mini Supormarket', None, None, None]
['Naspac Marketing Pte Ltd 6758 1955Cashler G753 6888Oica', '6778108', None, None, No