In [1]:
import json
import pandas as pd
import numpy as np
import os
os.chdir("../")
from utils import *
import re

import glob
import itertools
from PIL import Image
from fuzzywuzzy import fuzz



In [2]:
def fuzzy_score(text, value):
    """
    Score how closely an OCR string matches an entity label.

    Thin wrapper around ``fuzz.ratio`` so the matching strategy can be
    swapped in a single place.

    Args:
        text (string): ocr data text
        value (string): entity label

    Returns:
        int : similarity score as returned by ``fuzz.ratio`` (identical
            strings score 100 in the results shown in this notebook)
    """
    similarity = fuzz.ratio(text, value)
    return similarity

In [3]:
# Root folder holding the ground-truth labels, model outputs and images.
source_folder = '../../test_data/'

# glob returns matches in arbitrary, filesystem-dependent order. The three
# lists are paired by position below, so each one is sorted to make that
# pairing reproducible between runs and machines.
# NOTE(review): positional pairing still assumes the three folders use file
# names that sort consistently with each other -- confirm the naming scheme.
gt_labels = sorted(glob.glob(f'{source_folder}test_labels/*labels.json'))
predicted_labels = sorted(glob.glob(f'{source_folder}test_outputs/*.json'))
image_patterns = [f'{source_folder}test_images/*.jpeg', f'{source_folder}test_images/*.PNG', f'{source_folder}test_images/*.png']
# Flatten the per-pattern matches. The set removes duplicates that appear on
# case-insensitive filesystems, where '*.PNG' and '*.png' match the same files.
images_test = sorted(set(itertools.chain.from_iterable(glob.glob(pattern) for pattern in image_patterns)))

# Validate the inputs BEFORE indexing into them, so a missing or incomplete
# test set fails with a clear message instead of an IndexError or a
# silently mismatched label/output/image triple.
assert gt_labels, f"no ground-truth label files found under {source_folder!r}"
assert len(gt_labels) == len(predicted_labels) == len(images_test), (
    f"file count mismatch: {len(gt_labels)} label files, "
    f"{len(predicted_labels)} predicted files, {len(images_test)} images"
)


# Index of the document to evaluate.
idx = 0
filename_gt = gt_labels[idx]
filename_label = predicted_labels[idx]
test_img = images_test[idx]


# The glob results are opened as returned; prefixing them with "./" would
# turn an absolute source_folder into a broken relative path.
with open(filename_gt, 'r', encoding="utf8") as f:
    data_label = json.load(f)

with open(filename_label, 'r', encoding="utf8") as f:
    data_predicted = json.load(f)


# The image size is used later as the pixel conversion factors for the
# ground-truth bounding boxes.
img = Image.open(test_img)
width, height = img.size


In [26]:
# CREATE CONSISTENCY FOR THE CATEGORIES DURING LABELLING:
# values of the "level" column found as keys here are rewritten to the
# corresponding category name.
category_mapping_output = {"From": "Exporter Name", "To": "Recipient Name"}

# Parse the predicted entities, harmonise their categories, then replace
# newline characters with spaces to clean up some messy strings.
df = (
    parse_text_ocr_entities(data_predicted)
    .replace({"level": category_mapping_output})
    .replace('\n', ' ', regex=True)
)

In [6]:
# Ground-truth annotations of the selected document.
base_data = data_label["labels"]

# Parse original output into four parallel lists.
page_list2, text_list2, bb_list2, level_cat = parse_text_labels(base_data)

# Format text into target format per page: stack the lists as rows, then
# transpose so each list becomes a column.
label_columns = ["page", "text", "bbox", "level"]
df2 = pd.DataFrame([page_list2, text_list2, bb_list2, level_cat]).T
df2.columns = label_columns
df_label = df2.copy()

# Convert bounding boxes, using the image width/height as the x/y pixel
# conversion factors.
df_label["bbox_formatted"] = [
    convert_inches_pixel_normalized_vector(list(raw_bbox), pixel_conv_x=width, pixel_conv_y=height)
    for raw_bbox in df_label["bbox"]
]


# Iterate by page: build one list of formatted entries per page.
text_by_page_formatted_label = []
for page_number in df_label["page"].unique():
    df_label_page = df_label[df_label["page"] == page_number]
    page_entries = []
    for row_position in range(len(df_label_page)):
        page_entries.append(format_json_sublevel_label(df_label_page, row_position))
    text_by_page_formatted_label.append(page_entries)

In [8]:
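# TODO: create consistency for the ground-truth categories during labelling.
# Placeholder, disabled until the mapping is known: fill in
# {label_category: canonical_category} pairs before enabling.
# category_mapping = {}

# df_label = df_label.replace({"level": category_mapping})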

In [9]:
# Outer-join predictions and ground truth on the entity category so that
# categories present on only one side are kept. Overlapping column names
# receive the "_predicted" / "_ground_truth" suffixes.
merged_on_level = df.merge(
    df_label,
    how="outer",
    left_on="level",
    right_on="level",
    suffixes=["_predicted", "_ground_truth"],
)
# The raw bounding boxes are dropped from the combined frame.
combined_df = merged_on_level.drop(columns=["bbox_predicted", "bbox_ground_truth"])


In [10]:
# Collapse the ground-truth text to a single space-joined string per category.
# Missing text is replaced by " " first so that ' '.join only sees strings.
# NOTE(review): combined_df is already a join with the predictions, so a
# category with several predicted rows may repeat its ground-truth text
# here -- confirm that categories are unique on the predicted side.
ground_truth_text = combined_df[['text_ground_truth', 'level']].fillna(" ")
tt = ground_truth_text.groupby(['level'], as_index=False).agg({'text_ground_truth': ' '.join})

# Attach the predicted entities again and blank out whatever is still missing.
tt = tt.merge(df, how="outer", left_on="level", right_on="level")
tt = tt.fillna(" ")

# Row-wise similarity between ground-truth text and predicted text.
# NOTE(review): a category missing on both sides compares " " with " " and
# therefore scores 100.
tt["word_similarity"] = [
    fuzzy_score(truth, predicted)
    for truth, predicted in zip(tt["text_ground_truth"], tt["text"])
]
tt

Unnamed: 0,level,text_ground_truth,page,text,bbox,confidence,bbox_formatted,word_similarity
0,Customs Declaration Description,,1.0,I checked that contents above are not dangerou...,"[290, 237, 321, 237, 321, 786, 290, 786]",0.626,"[290, 237, 31, 549]",1
1,Exporter Address,SERIGAYA3-23-16 YOKOHAMA SHI KONAN KU Kanagawa,1.0,SERIGAYA3-23-16 YOKOHAMA SHI KONAN KU Kanagawa,"[915, 246, 1010, 246, 1010, 596, 915, 596]",0.321,"[915, 246, 95, 350]",100
2,Exporter Name,Hideo Ogino,1.0,Hideo Ogino,"[1003, 251, 1039, 251, 1039, 433, 1003, 433]",0.855,"[1003, 251, 36, 182]",100
3,Quantity,,1.0,T shirt 3 rice cracker 1 rice cracker 2 snacks...,"[380, 201, 605, 201, 605, 1190, 380, 1190]",0.265,"[380, 201, 225, 989]",3
4,Recipient Address,10/25-27 SUBWAY ROAD ROOKDAIE SYDNEY NSW2216,1.0,10/25-27 SUBWAY ROAD ROOKDAIE SYDNEY NSW2216,"[853, 972, 907, 972, 907, 1270, 853, 1270]",0.5,"[853, 972, 54, 298]",100
5,Recipient Email Address,,,,,,,100
6,Recipient Mobile Phone Number,053 323 7618,1.0,UTEL 053 323 7618,"[598, 1452, 638, 1452, 638, 1789, 598, 1789]",0.764,"[598, 1452, 40, 337]",83
7,Recipient Name,Khamrar sa Mizue,1.0,Khamrar sa Mizue,"[901, 972, 928, 972, 928, 1167, 901, 1167]",0.864,"[901, 972, 27, 195]",100
8,Value,,1.0,JPY4500 JPY350 JPY400 JPY270 JPY90,"[382, 1375, 580, 1375, 580, 1458, 382, 1458]",0.0,"[382, 1375, 198, 83]",6
9,Weight,,,,,,,100
