In [44]:
import re
import easyocr
import pandas as pd
from pathlib import Path
import random
import cv2
import matplotlib.pyplot as plt
import torch
import numpy as np
from PIL import Image, ImageEnhance
from io import BytesIO




In [45]:

def display_comparison(original_image, preprocessed_image):
	fig, axes = plt.subplots(1, 2, figsize=(12, 6))

	# Original Image
	axes[0].imshow(original_image)
	axes[0].set_title('Original Image')
	axes[0].axis('off')

	# Preprocessed Image
	axes[1].imshow(preprocessed_image, cmap='gray')
	axes[1].set_title('Preprocessed Image')
	axes[1].axis('off')

	plt.tight_layout()
	plt.show()


def preprocess_image(image, target_size=(1200, 900), contrast_factor=1.5):
	open_cv_image = np.array(image)

	# Resize the image
	resized_image = cv2.resize(open_cv_image, target_size, interpolation=cv2.INTER_LINEAR)

	enhancer = ImageEnhance.Contrast(image)
	enhanced_image = enhancer.enhance(contrast_factor)
	open_cv_image = np.array(enhanced_image)

	gray_image = cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2GRAY)

	clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
	clahe_image = clahe.apply(gray_image)

	binary_image = cv2.adaptiveThreshold(clahe_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)

	blurred_image = cv2.medianBlur(binary_image, 5)

	# Edge detection
	edges = cv2.Canny(blurred_image, 100, 200)

	# Convert the preprocessed image back to PIL
	preprocessed_pil_image = Image.fromarray(edges)

	return preprocessed_pil_image, enhanced_image


In [46]:

def extract_text_from_image(image_path, use_cuda=True):
	reader = easyocr.Reader(['en'], gpu=use_cuda)
	result = reader.readtext(image_path)
	return result


def clean_extracted_text(extracted_text):
	cleaned_data = []
	# Patterns

	single_number_unit_pattern = r'.*?(\d+(\.\d+)?|\d+,\d+)\s*(CM|FT|IN|MM|MG|KG|UG|MG|G|OZ|LB|TON|KV|MV|V|W|KW|CL|CU_FT|CU_IN|CUP|DL|FL_OZ|GAL|IMP_GAL|L|UL|ML|PT|QT|YD|H|cm|ft|in|mm|mg|kg|ug|g|oz|lb|ton|kv|mv|v|w|kw|cl|cu_ft|cu_in|cup|dl|fl_oz|gal|imp_gal|l|ul|ml|pt|qt|yd|h).*?'
	range_pattern = r'(\d+(\.\d+)?|\d+,\d+)\s*(CM|FT|IN|MM|MG|KG|UG|MG|G|OZ|LB|TON|KV|MV|V|W|KW|CL|CU_FT|CU_IN|CUP|DL|FL_OZ|GAL|IMP_GAL|L|UL|ML|PT|QT|YD|H|cm|ft|in|mm|mg|kg|ug|g|oz|lb|ton|kv|mv|v|w|kw|cl|cu_ft|cu_in|cup|dl|fl_oz|gal|imp_gal|l|ul|ml|pt|qt|yd|h)\s*to\s*(\d+(\.\d+)?|\d+,\d+)\s*(CM|FT|IN|MM|MG|KG|UG|MG|G|OZ|LB|TON|KV|MV|V|W|KW|CL|CU_FT|CU_IN|CUP|DL|FL_OZ|GAL|IMP_GAL|L|UL|ML|PT|QT|YD|H|cm|ft|in|mm|mg|kg|ug|g|oz|lb|ton|kv|mv|v|w|kw|cl|cu_ft|cu_in|cup|dl|fl_oz|gal|imp_gal|l|ul|ml|pt|qt|yd|h)'
	multiple_numbers_pattern = r'((\d+(\.\d+)?|\d+,\d+)(,\s*\d+(\.\d+)?|\d+,\d+)*?)\s*(CM|FT|IN|MM|MG|KG|UG|MG|G|OZ|LB|TON|KV|MV|V|W|KW|CL|CU_FT|CU_IN|CUP|DL|FL_OZ|GAL|IMP_GAL|L|UL|ML|PT|QT|YD|H|cm|ft|in|mm|mg|kg|ug|g|oz|lb|ton|kv|mv|v|w|kw|cl|cu_ft|cu_in|cup|dl|fl_oz|gal|imp_gal|l|ul|ml|pt|qt|yd|h)'
	bracketed_range_pattern = r'\[\s*(\d+(\.\d+)?|\d+,\d+)\s*,\s*(\d+(\.\d+)?|\d+,\d+)\s*\]\s*(CM|FT|IN|MM|MG|KG|UG|MG|G|OZ|LB|TON|KV|MV|V|W|KW|CL|CU_FT|CU_IN|CUP|DL|FL_OZ|GAL|IMP_GAL|L|UL|ML|PT|QT|YD|H|cm|ft|in|mm|mg|kg|ug|g|oz|lb|ton|kv|mv|v|w|kw|cl|cu_ft|cu_in|cup|dl|fl_oz|gal|imp_gal|l|ul|ml|pt|qt|yd|h)'

	for text in extracted_text:
		match = re.match(range_pattern, text[1])
		if match:
			cleaned_data.append((float(match.group(1).replace(',', '.')), match.group(3)))
			cleaned_data.append((float(match.group(4).replace(',', '.')), match.group(6)))
		else:
			match = re.match(single_number_unit_pattern, text[1])
			if match:
				cleaned_data.append((float(match.group(1).replace(',', '.')), match.group(3)))
			else:
				match = re.match(multiple_numbers_pattern, text[1])
				if match:
					numbers = match.group(1).split(',')
					for number in numbers:
						cleaned_data.append((float(number.strip().replace(',', '.')), match.group(6)))
				else:
					match = re.match(bracketed_range_pattern, text[1])
					if match:
						cleaned_data.append((float(match.group(1).replace(',', '.')), match.group(5)))
						cleaned_data.append((float(match.group(3).replace(',', '.')), match.group(5)))
	return cleaned_data


def map_units(cleaned_data):
	unit_conversion_map = {
		'cm': 'centimetre',
		'CM': 'centimetre',
		'ft': 'foot',
		'FT': 'foot',
		'in': 'inch',
		'IN': 'inch',
		'm': 'metre',
		'M': 'metre',
		'mm': 'millimetre',
		'MM': 'millimetre',
		'yd': 'yard',
		'YD': 'yard',
		'g': 'gram',
		'G': 'gram',
		'kg': 'kilogram',
		'KG': 'kilogram',
		'ug': 'microgram',
		'UG': 'microgram',
		'mg': 'milligram',
		'MG': 'milligram',
		'oz': 'ounce',
		'OZ': 'ounce',
		'lb': 'pound',
		'LB': 'pound',
		'ton': 'ton',
		'TON': 'ton',
		'kv': 'kilovolt',
		'KV': 'kilovolt',
		'mv': 'millivolt',
		'MV': 'millivolt',
		'v': 'volt',
		'V': 'volt',
		'w': 'watt',
		'W': 'watt',
		'kw': 'kilowatt',
		'KW': 'kilowatt',
		'cl': 'centilitre',
		'CL': 'centilitre',
		'cu_ft': 'cubic foot',
		'CU_FT': 'cubic foot',
		'cu_in': 'cubic inch',
		'CU_IN': 'cubic inch',
		'cup': 'cup',
		'CUP': 'cup',
		'dl': 'decilitre',
		'DL': 'decilitre',
		'fl_oz': 'fluid ounce',
		'FL_OZ': 'fluid ounce',
		'gal': 'gallon',
		'GAL': 'gallon',
		'imp_gal': 'imperial gallon',
		'IMP_GAL': 'imperial gallon',
		'l': 'litre',
		'L': 'litre',
		'ul': 'microlitre',
		'UL': 'microlitre',
		'ml': 'millilitre',
		'ML': 'millilitre',
		'pt': 'pint',
		'PT': 'pint',
		'qt': 'quart',
		'QT': 'quart',
		'h': 'hour',
		'H': 'hour'
	}
	allowed_units = set(unit_conversion_map.values())
	mapped_data = []
	for number, unit in cleaned_data:
		if unit in unit_conversion_map:
			mapped_unit = unit_conversion_map[unit]
			if mapped_unit in allowed_units:
				mapped_data.append((number, mapped_unit))
	return mapped_data


def process_images(df):
	extracted_data = []
	cleaned_data = []

	i = 0
	# Iterate over each row in the DataFrame
	for index, row in df.iterrows():
		try:
			image_path = row['image_path']
			# print(image_path, row["image_name"])
			if row["image_name"] in image_path:
				# print("Image path found", row["image_name"], row["image_path"])

				if pd.notna(image_path):  # Check if image path exists

					# print("HERE !!!!!!!!!!")

					# Step 1: Preprocess the image
					original_image = Image.open(image_path)
					preprocessed_image, enhanced_image = preprocess_image(original_image)

					# Save the enhanced image to a temporary file
					temp_image_path = 'temps/temp_enhanced_image.jpg'
					enhanced_image = enhanced_image.convert('RGB')
					enhanced_image.save(temp_image_path)

					# Step 2: Perform OCR on the image and clean the text
					extracted_text = extract_text_from_image(temp_image_path, use_cuda=True)
					cleaned_text = clean_extracted_text(extracted_text)
					mapped_text = map_units(cleaned_text)
					
					# print(f"Mapped = {extracted_text}")
					
					

					torch.cuda.empty_cache()
					torch.cuda.synchronize()

					# Append the results to the lists
					extracted_data.append(mapped_text)
					cleaned_data.append(cleaned_text)

					# Display the original and enhanced images
					# display_comparison(original_image, enhanced_image)
				else:
					cleaned_data.append("")
					extracted_data.append("")
		except Exception as e:
			# print(e)
			cleaned_data.append("")
			extracted_data.append("")
			# print(f"Error processing image {image_path}: {e}")

	
	# Step 4: Add the extracted data as a new column in the DataFrame
	df['extracted_text'] = extracted_data
	df['cleaned_text'] = cleaned_data

	##########################################################################################
	# df.to_csv('outputs/test_out.csv', index=True)				Execute safely
	##########################################################################################
	return df




In [47]:
# from src.utils import download_images
#     # Read the CSV file
# train_df = pd.read_csv(r"dataset/test.csv")
# 
# 
# # Extract the image links
# image_links = train_df['image_link'].tolist()
# # image_links = image_links[]
# 
# 
# # Specify the download folder
# download_folder = 'downs'
# 
# # Call the download_images function
# download_images(image_links, download_folder, allow_multiprocessing=False)
# 


In [48]:
df = pd.read_csv(r"dataset/test.csv")
df.rename(columns={"Unnamed: 0": "index"}, inplace=True)
df = df[["image_link"]]


df['image_name'] = df['image_link'].apply(lambda x: x.split("/")[-1])


download_folder = Path('downloads')
image_paths = list(download_folder.glob('*.jpg'))

# getting the image names from the folder paths
# folder_image_names = {str(image_path).split("/")[-1]: str(image_path) for image_path in image_paths}

# Maping them the DataFrame image names to the corresponding image paths
# df['image_path'] = df['image_name'].map(folder_image_names)
df['image_path'] = "downloads/" + df['image_name']

# Step 6: Check the result
df = df[['image_name', 'image_path']]
# process_images(df)
df

Unnamed: 0,image_name,image_path
0,110EibNyclL.jpg,downloads/110EibNyclL.jpg
1,11TU2clswzL.jpg,downloads/11TU2clswzL.jpg
2,11gHj8dhhrL.jpg,downloads/11gHj8dhhrL.jpg
3,11lshEUmCrL.jpg,downloads/11lshEUmCrL.jpg
4,21+i52HRW4L.jpg,downloads/21+i52HRW4L.jpg
...,...,...
90661,A1q3da5vzbL.jpg,downloads/A1q3da5vzbL.jpg
90662,A1q8C45g+0L.jpg,downloads/A1q8C45g+0L.jpg
90663,A1rVsIzEtkL.jpg,downloads/A1rVsIzEtkL.jpg
90664,A1rdvZ5zDdL.jpg,downloads/A1rdvZ5zDdL.jpg


In [49]:

# import pandas as pd
# train_df = pd.read_csv('dataset/test.csv')
# 
# # Check if the dataset has duplicates
# if train_df.duplicated(subset=['image_link' ]).any():
#     print("Duplicates found. Replacing train.csv with unique entries...")
# 
#     # Remove duplicates
#     train_df.drop_duplicates(subset=['image_link'], inplace=True)
# 
#     # Save the updated DataFrame to the original file location (overwrite train.csv)
#     train_df.to_csv('dataset/test.csv', index=False)
# 
# print(f"Final number of rows: {len(train_df)}")

In [50]:
# out = process_images(df)

In [51]:
# out.sample()

In [52]:
# extract_text_from_image("temps/temp_enhanced_image.jpg")

In [53]:
import torch

print(torch.version.cuda)  # Check the version of CUDA being used
print(torch.cuda.is_available())  # Should return True
print(torch.backends.cudnn.enabled)  # Should return True if CUDNN is working

12.4
True
True


In [54]:
def get_ground_truth(filename: 'str', df_to_check: 'pd.DataFrame'):
	actual_path = f"{filename.split('/')[1]}"

	# print(actual_path)

	return df_to_check[df_to_check["image_name"] == actual_path]

In [55]:
# train_df = pd.read_csv("dataset/train.csv")
# train_df["image_name"] = train_df["image_link"].apply(lambda link: link.split("/")[-1])
# train_df = train_df.drop(["image_link"], axis=1)
# train_df["image_path"] = train_df["image_name"].apply(lambda path: "test/" + path)
# 
# out_df = pd.read_csv("outputs/test_out.csv")
# out_df = out_df.drop(["Unnamed: 0"], axis=1)
# 
# # This means all the images in the out_df are present in the OCR applied images
# list(filter(lambda x: x[0] != x[1], list(zip(out_df["image_name"].to_list(), train_df["image_name"].to_list()))))
# 


# get_ground_truth("downloads/31EvJszFVfL.jpg", train_df)
# get_ground_truth("downloads/31EvJszFVfL.jpg", train_df)
# 
# get_ground_truth("downloads/31EvJszFVfL.jpg", out_df)

In [56]:
# Dropping rows for which the prediction is NaN

# out_df = out_df.dropna(axis=0)

In [57]:

# out_df.columns

In [58]:
# train_df.columns

In [59]:
# out_df["cleaned_text"].to_list()

In [60]:
# out_df.info()


In [61]:
# import ast
# 
# # Changing the cols from str to list
# 
# out_df['extracted_text'] = out_df['extracted_text'].apply(ast.literal_eval)
# out_df['cleaned_text'] = out_df['cleaned_text'].apply(ast.literal_eval)
# 

In [62]:
# out_df.info()

In [63]:
allowed_units = {"centilitre", "centimetre", "cubicfoot", "cubicinch", "cup", "decilitre", "fluidounce", "foot", "gallon", "gram", "imperialgallon", "inch", "kilogram", "kilovolt", "kilowatt", "litre", "metre", "microgram", "microlitre", "milligram", "millilitre", "millimetre", "millivolt", "ounce", "pint", "pound", "quart", "ton", "volt", "watt", "yard"}
# Define the allowed units
allowed_units = {
	"centilitre", "centimetre", "cubicfoot", "cubicinch", "cup", "decilitre", 
	"fluidounce", "foot", "gallon", "gram", "imperialgallon", "inch", "kilogram", 
	"kilovolt", "kilowatt", "litre", "metre", "microgram", "microlitre", 
	"milligram", "millilitre", "millimetre", "millivolt", "ounce", "pint", 
	"pound", "quart", "ton", "volt", "watt", "yard"
}

# Define the abbreviation mapping
mapping = {
	"cl": "centilitre",
	"cm": "centimetre",
	"ft": "foot",
	"in": "inch",
	"g": "gram",
	"kg": "kilogram",
	"l": "litre",
	"m": "metre",
	"mg": "milligram",
	"ml": "millilitre",
	"oz": "ounce",
	"pt": "pint",
	"qt": "quart",
	"lb": "pound",
	"gal": "gallon",
	"cf": "cubicfoot",
	"ci": "cubicinch",
	"floz": "fluidounce",
	"dl": "decilitre",
	"µg": "microgram",
	"µl": "microlitre",
	"mv": "millivolt",
	"kv": "kilovolt",
	"kw": "kilowatt",
	"w": "watt",
	"yd": "yard",
	"t": "ton",
	"imperialgal": "imperialgallon"
}

mapping


def map_shorthand_unit_to_full(pair: 'tuple[str, str]'):
	
	number = pair[0]
	unit = pair[1]

	print(f"Number={number}, unit={unit}")

	actual_unit = mapping.get(unit, "")

	return f"{number} {actual_unit}"

map_shorthand_unit_to_full(("9.8", "g"))
map_shorthand_unit_to_full(("38", "ml"))

Number=9.8, unit=g
Number=38, unit=ml


'38 millilitre'

In [64]:
# import tqdm
# 
# 
# full_match = 0
# partial_match = 0
# 
# for idx, row in tqdm.tqdm(out_df.iterrows(), total=out_df.shape[0]):
# 
# 	row = row.to_dict()
# 
# 	image_path = row["image_path"]
# 	image_name = row["image_name"]
# 	possible = row["extracted_text"]
# 
# 	possible = [f"{pair[0]} {pair[1].lower()}" for pair in possible]
# 
# 	ground_truth = get_ground_truth(image_path, train_df)["entity_value"].tolist()
# 
# 	# print(possible, "=>", ground_truth)
# 
# 
# 	# print(f"{image_name} => {cleaned_text_list} => {ground_truth}")
# 
# 	intersection = set(possible).intersection(set(ground_truth))
# 
# 	# print(intersection)
# 
# 	if(len(possible) == 1 and len(intersection) == 1):
# 		print(f"Full match => {possible} => {ground_truth}")
# 		full_match += 1
# 
# 	elif(len(intersection) == 1):
# 		print(f"Partial match => {possible} => {ground_truth}")
# 		partial_match += 1
# 
# 	
# 
# print("*" * 100)
# print(f"Full accuracy = {full_match / out_df.shape[0] * 100} %")
# print(f"Partial accuracy = {partial_match / out_df.shape[0] * 100} %")
# print("*" * 100)
# 
# 
# 
# 
# 	# if(row != np.nan):


# Now start with testing

In [65]:
import requests

# sample_test = pd.read_csv("dataset/sample_test.csv")
sample_test = pd.read_csv("dataset/test.csv")

sample_test["image_path"] = sample_test["image_link"].apply(lambda link: "downs/" + link.split("/")[-1])
sample_test["image_name"] = sample_test["image_link"].apply(lambda link: link.split("/")[-1])

# sample_test = sample_test.drop(["image_link"], axis=1)
sample_test

Unnamed: 0,index,image_link,group_id,entity_name,image_path,image_name
0,0,https://m.media-amazon.com/images/I/110EibNycl...,156839,height,downs/110EibNyclL.jpg,110EibNyclL.jpg
1,1,https://m.media-amazon.com/images/I/11TU2clswz...,792578,width,downs/11TU2clswzL.jpg,11TU2clswzL.jpg
2,4,https://m.media-amazon.com/images/I/11gHj8dhhr...,792578,depth,downs/11gHj8dhhrL.jpg,11gHj8dhhrL.jpg
3,7,https://m.media-amazon.com/images/I/11lshEUmCr...,156839,height,downs/11lshEUmCrL.jpg,11lshEUmCrL.jpg
4,8,https://m.media-amazon.com/images/I/21+i52HRW4...,478357,width,downs/21+i52HRW4L.jpg,21+i52HRW4L.jpg
...,...,...,...,...,...,...
90661,131281,https://m.media-amazon.com/images/I/A1q3da5vzb...,724618,item_weight,downs/A1q3da5vzbL.jpg,A1q3da5vzbL.jpg
90662,131282,https://m.media-amazon.com/images/I/A1q8C45g+0...,926285,item_weight,downs/A1q8C45g+0L.jpg,A1q8C45g+0L.jpg
90663,131283,https://m.media-amazon.com/images/I/A1rVsIzEtk...,721522,maximum_weight_recommendation,downs/A1rVsIzEtkL.jpg,A1rVsIzEtkL.jpg
90664,131284,https://m.media-amazon.com/images/I/A1rdvZ5zDd...,603688,item_weight,downs/A1rdvZ5zDdL.jpg,A1rdvZ5zDdL.jpg


In [66]:
# !rm test/*

# for link in sample_test["image_link"].to_list():
# 
# 
# 
# 	# continue # Remove to re-execute 
# 
# 	file_name = link.split("/")[-1]
# 	# print(file_name)
# 
# 	img = requests.get(link).content
# 
# 	with open(f"downs/{file_name}", "wb") as f:
# 		f.write(img)

In [None]:
# sample_test_out = process_images(sample_test)
sample_test = sample_test
sample_test_out = process_images(sample_test)

  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  model.load_state_dict(torch.load(model_path, map_location=device))


In [37]:
sample_test_out

Unnamed: 0,index,image_link,group_id,entity_name,image_path,image_name,extracted_text,cleaned_text
0,0,https://m.media-amazon.com/images/I/110EibNycl...,156839,height,downs/110EibNyclL.jpg,110EibNyclL.jpg,[],[]
1,1,https://m.media-amazon.com/images/I/11TU2clswz...,792578,width,downs/11TU2clswzL.jpg,11TU2clswzL.jpg,"[(42.0, centimetre), (200.0, centimetre)]","[(42.0, cm), (200.0, cm)]"
2,4,https://m.media-amazon.com/images/I/11gHj8dhhr...,792578,depth,downs/11gHj8dhhrL.jpg,11gHj8dhhrL.jpg,"[(10.5, centimetre), (90.0, centimetre)]","[(10.5, cm), (90.0, cm)]"
3,7,https://m.media-amazon.com/images/I/11lshEUmCr...,156839,height,downs/11lshEUmCrL.jpg,11lshEUmCrL.jpg,[],[]
4,8,https://m.media-amazon.com/images/I/21+i52HRW4...,478357,width,downs/21+i52HRW4L.jpg,21+i52HRW4L.jpg,,
...,...,...,...,...,...,...,...,...
95,139,https://m.media-amazon.com/images/I/310oxdFmgL...,913156,width,downs/310oxdFmgLL.jpg,310oxdFmgLL.jpg,"[(100.0, millimetre), (11.5, millimetre)]","[(100.0, mm), (11.5, mm)]"
96,141,https://m.media-amazon.com/images/I/310p+AOeZ6...,452717,height,downs/310p+AOeZ6L.jpg,310p+AOeZ6L.jpg,"[(3.0, centimetre), (14.0, centimetre), (3.4, ...","[(3.0, cm), (14.0, cm), (3.4, cm), (4.0, cm)]"
97,142,https://m.media-amazon.com/images/I/310qlaeUSA...,449805,depth,downs/310qlaeUSAL.jpg,310qlaeUSAL.jpg,[],[]
98,144,https://m.media-amazon.com/images/I/310rX4WoAF...,704724,depth,downs/310rX4WoAFL.jpg,310rX4WoAFL.jpg,"[(90.0, centimetre), (3.9, inch)]","[(90.0, cm), (3.9, in)]"


In [41]:
t = sample_test_out["extracted_text"].to_list()

res = []

for row  in t:
	temp = set()
	for pair in row:

		if(pair[1] in allowed_units):
			temp.add(f"{pair[0]} {pair[1]}")

		
	res.append(list(temp))

sample_test["prediction"] = res

sample_test["prediction"] = sample_test["prediction"].apply(lambda lst : "" if(len(lst) == 0) else lst[0])



final_out = sample_test.copy(deep=True)

final_out = final_out[["index", "prediction"]]
# final_out = final_out[["prediction"]]

final_out.to_csv("test_out.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_test["prediction"] = res
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_test["prediction"] = sample_test["prediction"].apply(lambda lst : "" if(len(lst) == 0) else lst[0])


In [39]:
sample_test

Unnamed: 0,index,image_link,group_id,entity_name,image_path,image_name,extracted_text,cleaned_text,prediction
0,0,https://m.media-amazon.com/images/I/110EibNycl...,156839,height,downs/110EibNyclL.jpg,110EibNyclL.jpg,[],[],
1,1,https://m.media-amazon.com/images/I/11TU2clswz...,792578,width,downs/11TU2clswzL.jpg,11TU2clswzL.jpg,"[(42.0, centimetre), (200.0, centimetre)]","[(42.0, cm), (200.0, cm)]",200.0 centimetre
2,4,https://m.media-amazon.com/images/I/11gHj8dhhr...,792578,depth,downs/11gHj8dhhrL.jpg,11gHj8dhhrL.jpg,"[(10.5, centimetre), (90.0, centimetre)]","[(10.5, cm), (90.0, cm)]",90.0 centimetre
3,7,https://m.media-amazon.com/images/I/11lshEUmCr...,156839,height,downs/11lshEUmCrL.jpg,11lshEUmCrL.jpg,[],[],
4,8,https://m.media-amazon.com/images/I/21+i52HRW4...,478357,width,downs/21+i52HRW4L.jpg,21+i52HRW4L.jpg,,,
...,...,...,...,...,...,...,...,...,...
95,139,https://m.media-amazon.com/images/I/310oxdFmgL...,913156,width,downs/310oxdFmgLL.jpg,310oxdFmgLL.jpg,"[(100.0, millimetre), (11.5, millimetre)]","[(100.0, mm), (11.5, mm)]",11.5 millimetre
96,141,https://m.media-amazon.com/images/I/310p+AOeZ6...,452717,height,downs/310p+AOeZ6L.jpg,310p+AOeZ6L.jpg,"[(3.0, centimetre), (14.0, centimetre), (3.4, ...","[(3.0, cm), (14.0, cm), (3.4, cm), (4.0, cm)]",3.4 centimetre
97,142,https://m.media-amazon.com/images/I/310qlaeUSA...,449805,depth,downs/310qlaeUSAL.jpg,310qlaeUSAL.jpg,[],[],
98,144,https://m.media-amazon.com/images/I/310rX4WoAF...,704724,depth,downs/310rX4WoAFL.jpg,310rX4WoAFL.jpg,"[(90.0, centimetre), (3.9, inch)]","[(90.0, cm), (3.9, in)]",90.0 centimetre


In [26]:
# import tqdm
# 
# def sample_accuracy():
# 
# 
# 	full_match = 0
# 	partial_match = 0
# 
# 	for idx, row in tqdm.tqdm(sample_test_out.iterrows(), total=out_df.shape[0]):
# 
# 		row = row.to_dict()
# 
# 		# print(row)
# 
# 		image_path = row["image_path"]
# 		image_name = row["image_name"]
# 		possible = row["extracted_text"]
# 
# 		possible = [f"{pair[0]} {pair[1].lower()}" for pair in possible]
# 
# 		ground_truth = get_ground_truth(image_path, train_df)["entity_value"].tolist()
# 
# 		# print(possible, "=>", ground_truth)
# 
# 
# 		# print(f"{image_name} => {cleaned_text_list} => {ground_truth}")
# 
# 		intersection = set(possible).intersection(set(ground_truth))
# 
# 		# print(intersection)
# 
# 		if(len(possible) == 1 and len(intersection) == 1):
# 			print(f"Full match => {possible} => {ground_truth}")
# 			full_match += 1
# 
# 		elif(len(intersection) == 1):
# 			print(f"Partial match => {possible} => {ground_truth}")
# 			partial_match += 1
# 
# 		
# 
# 	print("*" * 100)
# 	print(f"Full accuracy = {full_match / out_df.shape[0] * 100} %")
# 	print(f"Partial accuracy = {partial_match / out_df.shape[0] * 100} %")
# 	print("*" * 100)
# 
# 
# sample_accuracy()
# 
# # train_df


  9%|▉         | 8/88 [00:00<00:02, 31.76it/s]

Full match => ['500.0 gram'] => ['500.0 gram']
Partial match => ['0.709 gram', '200.0 milligram', '100.0 milligram', '50.0 milligram', '25.0 milligram', '25.0 milligram', '25.0 milligram', '57.0 gram', '25.0 milligram', '10.0 milligram', '0.51 gram', '0.2 gram', '0.09 gram', '25.0 milligram', '5.0 milligram'] => ['0.709 gram']


 18%|█▊        | 16/88 [00:00<00:02, 28.37it/s]

Partial match => ['30.0 kilogram', '30.0 kilogram'] => ['30.0 kilogram']


 31%|███       | 27/88 [00:00<00:02, 29.13it/s]

Partial match => ['0.28 inch', '7.0 millimetre', '0.07 inch', '1.8 millimetre', '2.0 inch', '2.7 gram', '405.0 millimetre'] => ['2.7 gram']
Partial match => ['9.1 centimetre', '36.8 centimetre', '481.0 volt', '48.0 volt', '48.0 volt', '44.0 kilogram'] => ['4.1 kilogram', '48.0 volt']
Partial match => ['9.1 centimetre', '36.8 centimetre', '481.0 volt', '48.0 volt', '48.0 volt', '44.0 kilogram'] => ['4.1 kilogram', '48.0 volt']
Partial match => ['26.0 centimetre', '40.0 centimetre', '158.0 gram'] => ['158.0 gram']
Full match => ['158.0 gram'] => ['158.0 gram']


 41%|████      | 36/88 [00:01<00:01, 27.94it/s]

Partial match => ['50.0 millilitre', '18.55 gram', '50.0 millilitre'] => ['18.55 gram']
Partial match => ['50.0 millilitre', '6.0 hour', '50.0 millilitre', '18.55 gram'] => ['18.55 gram']
Partial match => ['50.0 millilitre', '18.55 gram'] => ['18.55 gram']
Partial match => ['4.0 centimetre', '26.0 gram'] => ['26.0 gram']
Partial match => ['36.0 volt', '135.0 millimetre'] => ['800.0 watt', '36.0 volt']


 49%|████▉     | 43/88 [00:01<00:01, 28.58it/s]

Partial match => ['36.0 volt', '135.0 millimetre'] => ['800.0 watt', '36.0 volt']
Full match => ['330.0 pound'] => ['330.0 pound']
Full match => ['150.0 watt'] => ['150.0 watt']
Partial match => ['150.0 watt', '305.0 millimetre', '32.0 millimetre'] => ['150.0 watt']


 57%|█████▋    | 50/88 [00:01<00:01, 30.27it/s]

Full match => ['30.0 watt'] => ['30.0 watt']
Partial match => ['65.0 watt', '30.0 watt'] => ['30.0 watt']
Partial match => ['15.5 gram', '0.0 centimetre', '3.5 centimetre'] => ['15.5 gram']
****************************************************************************************************
Full accuracy = 5.681818181818182 %
Partial accuracy = 17.045454545454543 %
****************************************************************************************************



