In [1]:
import easyocr
import cv2
from tqdm import tqdm
import os
import csv
import re

reader = easyocr.Reader(['en'], gpu=True)

def extract_text_from_image(image_path):
    image = cv2.imread(image_path)
    results = reader.readtext(image)
    extracted_text = ' '.join([result[1] for result in results])
    return extracted_text

def parse_measurement(text):
    # Expanded pattern to catch more variations
    pattern = r'(\d+(?:\.\d+)?)\s*(gram|centimetre|ounce|kilogram|g|cm|oz|kg|foot|ft|inch|in|metre|m|millimetre|mm|ton|t|volt|v|watt|w|kilovolt|kv|kilowatt|kw|pound|lb|yard|yd|millivolt|mv|microgram|μg)'
    match = re.search(pattern, text.lower())
    if match:
        value, unit = match.groups()
        # Normalize units
        unit_map = {
            'g': 'gram', 'cm': 'centimetre', 'oz': 'ounce', 'kg': 'kilogram',
            'ft': 'foot', 'in': 'inch', 'm': 'metre', 'mm': 'millimetre',
            't': 'ton', 'v': 'volt', 'w': 'watt', 'kv': 'kilovolt',
            'kw': 'kilowatt', 'lb': 'pound', 'yd': 'yard', 'mv': 'millivolt',
            'μg': 'microgram'
        }
        unit = unit_map.get(unit, unit)
        return f"{float(value)} {unit}"
    return ""

image_folder = '../images'
output_file = 'test_out_alterd.csv'

with open(output_file, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['index', 'prediction'])
    
    for index, file in enumerate(tqdm(os.listdir(image_folder))):
        image_path = os.path.join(image_folder, file)
        try:
            extracted_text = extract_text_from_image(image_path)
            prediction = parse_measurement(extracted_text)
        except Exception as e:
            print(f"Error processing {file}: {str(e)}")
            prediction = ""
        writer.writerow([index, prediction])

print(f"CSV file '{output_file}' has been created with the predictions.")

KeyboardInterrupt: 

In [2]:
import torch
print(torch.cuda.is_available())


False


In [6]:
import pandas as pd
import numpy as np

def normalize_prediction(pred):
    if pd.isna(pred) or pred == "":
        return ""
    if isinstance(pred, (int, float)):
        return f"{pred:.2f}"
    parts = str(pred).split()
    if len(parts) != 2:
        return ""
    try:
        value = float(parts[0])
        unit = parts[1]
        return f"{value:.2f} {unit}"
    except ValueError:
        return ""

# Load the predictions and ground truth
predictions_df = pd.read_csv('test_out.csv')
ground_truth_df = pd.read_csv('../dataset/test.csv')  # Adjust the path as needed

# Ensure the dataframes are sorted by index
predictions_df = predictions_df.sort_values('index').reset_index(drop=True)
ground_truth_df = ground_truth_df.sort_values('index').reset_index(drop=True)

# Normalize predictions
predictions_df['normalized_prediction'] = predictions_df['prediction'].apply(normalize_prediction)

# Analyze predictions
total_predictions = len(predictions_df)
non_empty_predictions = (predictions_df['normalized_prediction'] != "").sum()
empty_predictions = total_predictions - non_empty_predictions

print(f"Total number of predictions: {total_predictions}")
print(f"Number of non-empty predictions: {non_empty_predictions}")
print(f"Number of empty predictions: {empty_predictions}")
print(f"Percentage of non-empty predictions: {(non_empty_predictions / total_predictions) * 100:.2f}%")

# Display some example predictions
print("\nExample predictions:")
print(predictions_df[['index', 'normalized_prediction']].head(10))

# Analyze prediction units
if non_empty_predictions > 0:
    unit_counts = predictions_df['normalized_prediction'].apply(lambda x: x.split()[-1] if x else "").value_counts()
    print("\nUnit distribution in predictions:")
    print(unit_counts)

# Join predictions with ground truth metadata
merged_df = pd.merge(ground_truth_df, predictions_df[['index', 'normalized_prediction']], on='index', how='left')

# Display some examples with metadata
print("\nExample predictions with metadata:")
print(merged_df[['index', 'image_link', 'group_id', 'entity_name', 'normalized_prediction']].head(10))

# Analyze predictions by group_id
group_analysis = merged_df.groupby('group_id').agg({
    'normalized_prediction': lambda x: (x != "").sum(),
    'index': 'count'
}).rename(columns={'normalized_prediction': 'non_empty_predictions', 'index': 'total_samples'})
group_analysis['empty_predictions'] = group_analysis['total_samples'] - group_analysis['non_empty_predictions']
group_analysis['non_empty_percentage'] = (group_analysis['non_empty_predictions'] / group_analysis['total_samples']) * 100

print("\nPrediction analysis by group_id:")
print(group_analysis)

Total number of predictions: 54
Number of non-empty predictions: 40
Number of empty predictions: 14
Percentage of non-empty predictions: 74.07%

Example predictions:
   index normalized_prediction
0      0                      
1      1                      
2      2          2100.00 watt
3      3                      
4      4            55.00 watt
5      5             0.00 inch
6      6                      
7      7                      
8      8      50.00 centimetre
9      9            6.75 pound

Unit distribution in predictions:
normalized_prediction
              14
centimetre    11
metre          7
inch           5
watt           4
pound          4
volt           3
gram           3
ton            3
Name: count, dtype: int64

Example predictions with metadata:
   index                                         image_link  group_id  \
0      0  https://m.media-amazon.com/images/I/110EibNycl...    156839   
1      1  https://m.media-amazon.com/images/I/11TU2clswz...    792578   
2 

In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score

def normalize_prediction(pred):
    if pd.isna(pred) or pred == "":
        return ""
    if isinstance(pred, (int, float)):
        return f"{pred:.2f}"
    parts = str(pred).split()
    if len(parts) != 2:
        return ""
    try:
        value = float(parts[0])
        unit = parts[1]
        return f"{value:.2f} {unit}"
    except ValueError:
        return ""


predictions_df = pd.read_csv('test_out3_evening.csv')
ground_truth_df = pd.read_csv('../dataset/test.csv')  # Adjust the path as needed


predictions_df = predictions_df.sort_values('index').reset_index(drop=True)
ground_truth_df = ground_truth_df.sort_values('index').reset_index(drop=True)


predictions_df['normalized_prediction'] = predictions_df['prediction'].apply(normalize_prediction)


merged_df = pd.merge(ground_truth_df, predictions_df[['index', 'normalized_prediction']], on='index', how='left')

# Create binary labels
# Assume that if 'entity_name' is not empty, there should be a measurement
y_true = (merged_df['entity_name'] != "").astype(int)
y_pred = (merged_df['normalized_prediction'] != "").astype(int)

# Calculate F1 score, precision, and recall
f1 = f1_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)

print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}") 
print(f"Recall: {recall:.4f}")

# Additional analysis
total_samples = len(merged_df)
true_positives = ((y_true == 1) & (y_pred == 1)).sum()
false_positives = ((y_true == 0) & (y_pred == 1)).sum()
false_negatives = ((y_true == 1) & (y_pred == 0)).sum()
true_negatives = ((y_true == 0) & (y_pred == 0)).sum()

print(f"\nTotal samples: {total_samples}")
print(f"True Positives: {true_positives}")
print(f"False Positives: {false_positives}")
print(f"False Negatives: {false_negatives}")
print(f"True Negatives: {true_negatives}")

# Display some example comparisons
print("\nExample comparisons:")
comparison_df = merged_df[['index', 'entity_name', 'normalized_prediction']]
comparison_df['expected_measurement'] = y_true
comparison_df['has_prediction'] = y_pred
print(comparison_df.head(10))

F1 Score: 0.7137
Precision: 1.0000
Recall: 0.5549

Total samples: 131187
True Positives: 72796
False Positives: 0
False Negatives: 58391
True Negatives: 0

Example comparisons:
   index entity_name normalized_prediction  expected_measurement  \
0      0      height                                           1   
1      1       width      40.00 centimetre                     1   
2      2      height      10.50 centimetre                     1   
3      3       depth       6.00 centimetre                     1   
4      4       depth           90.00 metre                     1   
5      5      height              1.00 ton                     1   
6      6       width      42.00 centimetre                     1   
7      7      height             4.30 inch                     1   
8      8       width                                           1   
9      9      height                                           1   

   has_prediction  
0               0  
1               1  
2             

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comparison_df['expected_measurement'] = y_true


In [9]:
import pytesseract
import cv2
from tqdm import tqdm
import os
import csv
import re

def extract_text_from_image(image_path):
    image = cv2.imread(image_path)
    text = pytesseract.image_to_string(image)
    return text

def parse_measurement(text):
    # Expanded pattern to catch more variations
    pattern = r'(\d+(?:\.\d+)?)\s*(gram|centimetre|ounce|kilogram|g|cm|oz|kg|foot|ft|inch|in|metre|m|millimetre|mm|ton|t|volt|v|watt|w|kilovolt|kv|kilowatt|kw|pound|lb|yard|yd|millivolt|mv|microgram|μg)'
    match = re.search(pattern, text.lower())
    if match:
        value, unit = match.groups()
        # Normalize units
        unit_map = {
            'g': 'gram', 'cm': 'centimetre', 'oz': 'ounce', 'kg': 'kilogram',
            'ft': 'foot', 'in': 'inch', 'm': 'metre', 'mm': 'millimetre',
            't': 'ton', 'v': 'volt', 'w': 'watt', 'kv': 'kilovolt',
            'kw': 'kilowatt', 'lb': 'pound', 'yd': 'yard', 'mv': 'millivolt',
            'μg': 'microgram'
        }
        unit = unit_map.get(unit, unit)
        return f"{float(value)} {unit}"
    return ""

image_folder = '../images'
output_file = 'test_out_tesseract.csv'

with open(output_file, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['index', 'prediction'])

    for index, file in enumerate(tqdm(os.listdir(image_folder))):
        image_path = os.path.join(image_folder, file)
        try:
            extracted_text = extract_text_from_image(image_path)
            prediction = parse_measurement(extracted_text)
        except Exception as e:
            print(f"Error processing {file}: {str(e)}")
            prediction = ""
        writer.writerow([index, prediction])

print(f"CSV file '{output_file}' has been created with the predictions.")

  7%|▋         | 4/54 [00:00<00:04, 11.69it/s]

Error processing 41-NCxNuBxL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 417NJrPEk+L.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 417SThj+SrL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 41ADVPQgZOL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 11%|█         | 6/54 [00:00<00:03, 14.08it/s]

Error processing 41nblnEkJ3L.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 41o3iis9E7L.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 41pvwR9GbaL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 15%|█▍        | 8/54 [00:00<00:03, 11.76it/s]

Error processing 41uwo4PVnuL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 41ygXRvf8lL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 19%|█▊        | 10/54 [00:00<00:04,  9.87it/s]

Error processing 41zgjN+zW3L.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 51+oHGvSvuL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 24%|██▍       | 13/54 [00:01<00:04,  8.31it/s]

Error processing 51-WIOx5pxL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 510xYFNYQ8L.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 514bY8c4ZIL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 30%|██▉       | 16/54 [00:01<00:04,  8.44it/s]

Error processing 514pScQdlCL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 51BEuVR4ZzL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 33%|███▎      | 18/54 [00:02<00:04,  7.44it/s]

Error processing 51bEy0J5wLL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 51cPZYLk2YL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 37%|███▋      | 20/54 [00:02<00:04,  7.13it/s]

Error processing 51EBBqNOJ1L.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 51fAzxNm+cL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 41%|████      | 22/54 [00:02<00:04,  6.73it/s]

Error processing 51FSlaVlejL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 51H+mX2Wk7L.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 51jTe522S2L.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 48%|████▊     | 26/54 [00:03<00:03,  8.80it/s]

Error processing 51kdBAv6ImL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 51KykmLgc0L.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 51l6c6UcRZL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 52%|█████▏    | 28/54 [00:03<00:02, 10.09it/s]

Error processing 51oaOP8qJlL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 51P0IuT6RsL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 51r7U52rh7L.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 59%|█████▉    | 32/54 [00:03<00:01, 11.07it/s]

Error processing 51Su6zXkAsL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 51tEop-EBJL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 51vwYpDz2tL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 51y79cwGJFL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 613P5cxQH4L.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 65%|██████▍   | 35/54 [00:04<00:02,  6.70it/s]

Error processing 614hn5uX9MS.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 67%|██████▋   | 36/54 [00:04<00:03,  5.89it/s]

Error processing 615Cjzm6pyL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 69%|██████▊   | 37/54 [00:04<00:03,  5.50it/s]

Error processing 61C+fwVD6dL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 70%|███████   | 38/54 [00:05<00:03,  4.20it/s]

Error processing 61E2XRNSdYL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 72%|███████▏  | 39/54 [00:05<00:03,  4.11it/s]

Error processing 61G8bvWOb-L.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 74%|███████▍  | 40/54 [00:05<00:03,  4.24it/s]

Error processing 61lX6IP1SVL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 61O+Yi09tyL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 78%|███████▊  | 42/54 [00:06<00:03,  3.66it/s]

Error processing 71afEPoRGsL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 80%|███████▉  | 43/54 [00:06<00:03,  3.41it/s]

Error processing 71eCfiIG-AL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 81%|████████▏ | 44/54 [00:06<00:03,  3.08it/s]

Error processing 71fWddA0+yL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 85%|████████▌ | 46/54 [00:07<00:02,  3.51it/s]

Error processing 71Qk6hR9-WL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.
Error processing 71ta6wY3HtL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 87%|████████▋ | 47/54 [00:08<00:02,  2.77it/s]

Error processing 71UN1IxKp4L.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 89%|████████▉ | 48/54 [00:08<00:02,  2.60it/s]

Error processing 71UYDq4nfnL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 91%|█████████ | 49/54 [00:08<00:01,  2.94it/s]

Error processing 71v+pim0lfL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 93%|█████████▎| 50/54 [00:09<00:01,  2.83it/s]

Error processing 71WAjPMQDWL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 94%|█████████▍| 51/54 [00:09<00:01,  2.02it/s]

Error processing 81aZ2ozp1GL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 96%|█████████▋| 52/54 [00:10<00:01,  1.86it/s]

Error processing 81IYdOV0mVL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


 98%|█████████▊| 53/54 [00:11<00:00,  1.49it/s]

Error processing 81PG3ea0MOL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.


100%|██████████| 54/54 [00:12<00:00,  4.45it/s]

Error processing 81qUmRUUTTL.jpg: tesseract is not installed or it's not in your PATH. See README file for more information.
CSV file 'test_out_tesseract.csv' has been created with the predictions.





In [8]:
!pip install pytesseract



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip




In [4]:
import pandas as pd

# Load the ground truth and model output CSVs
test_df = pd.read_csv('../dataset/test.csv')
test_out_df = pd.read_csv('test_out3_all_indexes.csv')

# Ensure both CSVs have the same length and the same columns
if len(test_df) != len(test_out_df):
    raise ValueError("Mismatch between test.csv and test_out.csv row counts.")

# Assuming the column for ground truth is 'GT' and for output is 'OUT'
GT = test_df['GT']
OUT = test_out_df['OUT']

# Initialize counts
true_positives = 0
false_positives = 0
false_negatives = 0
true_negatives = 0

# Iterate over the dataset
for gt, out in zip(GT, OUT):
    if out != "" and gt != "" and out == gt:
        true_positives += 1
    elif out != "" and gt != "" and out != gt:
        false_positives += 1
    elif out != "" and gt == "":
        false_positives += 1
    elif out == "" and gt != "":
        false_negatives += 1
    elif out == "" and gt == "":
        true_negatives += 1

# Calculate Precision and Recall
precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

# Calculate F1 Score
if precision + recall == 0:
    f1_score = 0
else:
    f1_score = 2 * (precision * recall) / (precision + recall)

# Output results
print(f"True Positives: {true_positives}")
print(f"False Positives: {false_positives}")
print(f"False Negatives: {false_negatives}")
print(f"True Negatives: {true_negatives}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")


KeyError: 'GT'

In [None]:
# Let's load and inspect the two CSV files to understand their structure, so we can implement the F1 score algorithm based on the provided logic.

import pandas as pd

# Load the two files to inspect their contents
ground_truth_file = '/mnt/data/sample_test.csv'
output_file = '/mnt/data/sample_test_out.csv'

# Load both files
ground_truth_df = pd.read_csv(ground_truth_file)
output_df = pd.read_csv(output_file)

# Display the first few rows of both files to understand their structure
ground_truth_df.head(), output_df.head()


In [8]:
# Merge the ground truth and prediction dataframes on the index
import pandas as pd

# Load the two files to inspect their contents
ground_truth_file = '../dataset/test.csv'
output_file = 'test_out3_all_indexes.csv'

# Load both files
ground_truth_df = pd.read_csv(ground_truth_file)
output_df = pd.read_csv(output_file)

# Display the first few rows of both files to understand their structure
ground_truth_df.head(), output_df.head()


merged_df = pd.merge(ground_truth_df[['index', 'entity_name']], output_df[['index', 'prediction']], on='index')

# Initialize counts for TP, FP, FN, and TN
TP = FP = FN = TN = 0

# Iterate over the merged dataframe and apply the logic
for _, row in merged_df.iterrows():
    GT = row['entity_name']
    OUT = row['prediction']
    
    if pd.notna(GT) and pd.notna(OUT):
        if GT == OUT:
            TP += 1  # True Positive: OUT != "" and GT != "" and OUT == GT
        else:
            FP += 1  # False Positive: OUT != "" and GT != "" and OUT != GT
    elif pd.isna(GT) and pd.notna(OUT):
        FP += 1  # False Positive: OUT != "" and GT == ""
    elif pd.notna(GT) and pd.isna(OUT):
        FN += 1  # False Negative: OUT == "" and GT != ""
    elif pd.isna(GT) and pd.isna(OUT):
        TN += 1  # True Negative: OUT == "" and GT == ""

# Calculate Precision, Recall, and F1 score
precision = TP / (TP + FP) if (TP + FP) > 0 else 0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

TP, FP, FN, TN, precision, recall, f1_score

print(f"True Positives: {true_positives}")
print(f"False Positives: {false_positives}")
print(f"False Negatives: {false_negatives}")
print(f"True Negatives: {true_negatives}")
print(f"Precision: {precision:.10f}")  # Updated to 10 decimal points
print(f"Recall: {recall:.10f}")        # Updated to 10 decimal points
print(f"F1 Score: {f1_score:.10f}") 

True Positives: 72852
False Positives: 0
False Negatives: 58335
True Negatives: 0
Precision: 0.0000000000
Recall: 0.0000000000
F1 Score: 0.0000000000


In [9]:
import pandas as pd

# Load the ground truth and model output CSVs
test_df = pd.read_csv('../dataset/test.csv')  # Adjust the path as necessary
test_out_df = pd.read_csv('test_out3_all_indexes.csv')  # Adjust the path as necessary

# Ensure both CSVs have the same length and the same columns
if len(test_df) != len(test_out_df):
    raise ValueError("Mismatch between test.csv and test_out.csv row counts.")

# Change the variable assignments to match your DataFrame structure
GT = test_df['entity_name']  # Assuming 'entity_name' is the column for ground truth
OUT = test_out_df['prediction']  # Assuming 'prediction' is the column for model output

# Initialize counts
true_positives = 0
false_positives = 0
false_negatives = 0
true_negatives = 0

# Iterate over the dataset
for gt, out in zip(GT, OUT):
    if out != "" and gt != "" and out == gt:
        true_positives += 1
    elif out != "" and gt != "" and out != gt:
        false_positives += 1
    elif out != "" and gt == "":
        false_positives += 1
    elif out == "" and gt != "":
        false_negatives += 1
    elif out == "" and gt == "":
        true_negatives += 1

# Debugging counts
print(f"TP: {true_positives}, FP: {false_positives}, FN: {false_negatives}")

# Calculate Precision and Recall
precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

# Calculate F1 Score
if precision + recall == 0:
    f1_score = 0
else:
    f1_score = 2 * (precision * recall) / (precision + recall)

# Output results
print(f"True Positives: {true_positives}")
print(f"False Positives: {false_positives}")
print(f"False Negatives: {false_negatives}")
print(f"True Negatives: {true_negatives}")
print(f"Precision: {precision:.10f}")  # Updated to 10 decimal points
print(f"Recall: {recall:.10f}")        # Updated to 10 decimal points
print(f"F1 Score: {f1_score:.10f}")    # Updated to 10 decimal points

TP: 0, FP: 131187, FN: 0
True Positives: 0
False Positives: 131187
False Negatives: 0
True Negatives: 0
Precision: 0.0000000000
Recall: 0.0000000000
F1 Score: 0.0000000000


In [8]:
import pandas as pd

# Load the ground truth and model output CSVs
test_df = pd.read_csv('../dataset/sample_test.csv')
test_out_df = pd.read_csv('../dataset/sample_test_out.csv')

# Debugging: Print the first few rows of the input dataframes
print("Ground Truth DataFrame:")
print(test_df.head())
print("\nPrediction DataFrame:")
print(test_out_df.head())

# Merge the dataframes on the 'index' column
merged_df = pd.merge(test_df[['index', 'entity_name']], test_out_df, on='index')

# Debugging: Print the first few rows of the merged dataframe
print("\nMerged DataFrame:")
print(merged_df.head())

# Extract the ground truth and predictions
GT = merged_df['entity_name']
OUT = merged_df['prediction']

# Initialize counts using vectorized operations
true_positives = ((OUT != "") & (GT != "") & (OUT == GT)).sum()
false_positives = ((OUT != "") & (GT != "") & (OUT != GT)).sum() + ((OUT != "") & (GT == "")).sum()
false_negatives = ((OUT == "") & (GT != "")).sum()
true_negatives = ((OUT == "") & (GT == "")).sum()

# Debugging: Print the counts
print(f"\nTrue Positives (TP): {true_positives}")
print(f"False Positives (FP): {false_positives}")
print(f"False Negatives (FN): {false_negatives}")
print(f"True Negatives (TN): {true_negatives}")

# Calculate Precision and Recall
precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

# Calculate F1 Score
if precision + recall == 0:
    f1_score = 0
else:
    f1_score = 2 * (precision * recall) / (precision + recall)

# Output results
print(f"\nPrecision: {precision:.10f}")
print(f"Recall: {recall:.10f}")
print(f"F1 Score: {f1_score:.10f}")

Ground Truth DataFrame:
   index                                         image_link  group_id  \
0      0  https://m.media-amazon.com/images/I/41-NCxNuBx...    658003   
1      1  https://m.media-amazon.com/images/I/41-NCxNuBx...    658003   
2      2  https://m.media-amazon.com/images/I/417NJrPEk+...    939426   
3      3  https://m.media-amazon.com/images/I/417SThj+Sr...    276700   
4      4  https://m.media-amazon.com/images/I/417SThj+Sr...    276700   

                     entity_name  
0                          width  
1                          depth  
2  maximum_weight_recommendation  
3                        voltage  
4                        wattage  

Prediction DataFrame:
   index        prediction
0      0         21.9 foot
1      1           10 foot
2      2               NaN
3      3   289.52 kilovolt
4      4  1078.99 kilowatt

Merged DataFrame:
   index                    entity_name        prediction
0      0                          width         21.9 foot
1      