In [1]:
import pandas as pd

# Example data (replace with actual file paths)
ground_truth_path = "ground_truth.xlsx"
model_output_path_gpt4 = "raw_results_chatgpt4_statistic.csv"
model_output_path_gpt4omini = "raw_results_chatgpt4omini.csv"
model_output_path_llama31 = "raw_results_llama3_1.csv"


# Load data
ground_truth = pd.read_excel(ground_truth_path)
model_output_gpt4 = pd.read_csv(model_output_path_gpt4)
model_output_gpt4omini = pd.read_csv(model_output_path_gpt4omini)
model_output_llama31 = pd.read_csv(model_output_path_llama31)



# Normalize attribute names
def normalize(data):
    data["Attribute"] = data["Attribute"].str.lower().str.strip()
    return data

ground_truth = normalize(ground_truth)
model_output_gpt4 = normalize(model_output_gpt4)
model_output_gpt4omini = normalize(model_output_gpt4omini)
model_output_llama31 = normalize(model_output_llama31)

gpt4

In [2]:
# Identify true positives, false negatives, and false positives
true_positives_gpt4 = model_output_gpt4[model_output_gpt4["Attribute"].isin(ground_truth["Attribute"])]
false_negatives_gpt4 = ground_truth[~ground_truth["Attribute"].isin(model_output_gpt4["Attribute"])]
false_positives_gpt4 = model_output_gpt4[~model_output_gpt4["Attribute"].isin(ground_truth["Attribute"])]

In [3]:
# Calculate metrics
precision_gpt4 = len(true_positives_gpt4) / (len(true_positives_gpt4) + len(false_positives_gpt4)) if len(true_positives_gpt4) + len(false_positives_gpt4) > 0 else 0
recall_gpt4 = len(true_positives_gpt4) / (len(true_positives_gpt4) + len(false_negatives_gpt4)) if len(true_positives_gpt4) + len(false_negatives_gpt4) > 0 else 0
f1_score_gpt4 = 2 * (precision_gpt4 * recall_gpt4) / (precision_gpt4 + recall_gpt4) if precision_gpt4 + recall_gpt4 > 0 else 0
coverage_gpt4 = model_output_gpt4["Attribute"].nunique() / ground_truth["Attribute"].nunique()

print(f"Precision_gpt4: {precision_gpt4:.2f}")
print(f"Recall_gpt4: {recall_gpt4:.2f}")
print(f"F1-Score_gpt4: {f1_score_gpt4:.2f}")
print(f"Coverage_gpt4: {coverage_gpt4:.2f}")

Precision_gpt4: 0.05
Recall_gpt4: 0.59
F1-Score_gpt4: 0.09
Coverage_gpt4: 15.05


In [4]:
# Merge datasets for comparison
comparison_gpt4 = ground_truth.merge(model_output_gpt4, on=["Category", "Attribute"], suffixes=("_gt", "_model"))

# Calculate deviations
comparison_gpt4["Deviation (Count)"] = comparison_gpt4["Count_model"] - comparison_gpt4["Count_gt"]
comparison_gpt4["Deviation (Percentage)"] = comparison_gpt4["Percentage_model"] - comparison_gpt4["Percentage_gt"]

print(comparison_gpt4)

   Category      Attribute  Count_gt  Percentage_gt  Count_model  \
0   grocery   packing_type        32       6.694561            1   
1   grocery   packing_type        32       6.694561            1   
2   grocery         flavor        23       4.811715           10   
3   grocery         flavor        23       4.811715            1   
4   grocery   product_type        19       3.974895            1   
5   grocery          brand        57      11.924686           24   
6   grocery         source        57      11.924686            1   
7   grocery        organic         2       0.418410            1   
8   grocery     ingredient        14       2.928870            1   
9   grocery          color         1       0.209205            1   
10  grocery         series        28       5.857741            3   
11     home       material       171       8.901614           77   
12     home       material       171       8.901614            1   
13     home         height        57       2.967

gpt4omini

In [5]:
# Identify true positives, false negatives, and false positives
true_positives_gpt4omini = model_output_gpt4omini[model_output_gpt4omini["Attribute"].isin(ground_truth["Attribute"])]
false_negatives_gpt4omini = ground_truth[~ground_truth["Attribute"].isin(model_output_gpt4omini["Attribute"])]
false_positives_gpt4omini = model_output_gpt4omini[~model_output_gpt4omini["Attribute"].isin(ground_truth["Attribute"])]

In [6]:
# Calculate metrics
precision_gpt4omini = len(true_positives_gpt4omini) / (len(true_positives_gpt4omini) + len(false_positives_gpt4omini)) if len(true_positives_gpt4omini) + len(false_positives_gpt4omini) > 0 else 0
recall_gpt4omini = len(true_positives_gpt4omini) / (len(true_positives_gpt4omini) + len(false_negatives_gpt4omini)) if len(true_positives_gpt4omini) + len(false_negatives_gpt4omini) > 0 else 0
f1_score_gpt4omini = 2 * (precision_gpt4omini * recall_gpt4omini) / (precision_gpt4omini + recall_gpt4omini) if precision_gpt4omini + recall_gpt4omini > 0 else 0
coverage_gpt4omini = model_output_gpt4omini["Attribute"].nunique() / ground_truth["Attribute"].nunique()

print(f"Precision_gpt4omini: {precision_gpt4omini:.2f}")
print(f"Recall_gpt4omini: {recall_gpt4omini:.2f}")
print(f"F1-Score_gpt4omini: {f1_score_gpt4omini:.2f}")
print(f"Coverage_gpt4omini: {coverage_gpt4omini:.2f}")

Precision_gpt4omini: 0.18
Recall_gpt4omini: 0.15
F1-Score_gpt4omini: 0.16
Coverage_gpt4omini: 0.82


In [7]:
# Merge datasets for comparison
comparison_gpt4omini = ground_truth.merge(model_output_gpt4omini, on=["Category", "Attribute"], suffixes=("_gt", "_model"))

# Calculate deviations
comparison_gpt4omini["Deviation (Count)"] = comparison_gpt4omini["Count_model"] - comparison_gpt4omini["Count_gt"]
comparison_gpt4omini["Deviation (Percentage)"] = comparison_gpt4omini["Percentage_model"] - comparison_gpt4omini["Percentage_gt"]

print(comparison_gpt4omini)

   Category      Attribute  Count_gt  Percentage_gt  Count_model  \
0   grocery          brand        57      11.924686            1   
1      home       material       171       8.901614            1   
2      home       capacity        77       4.008329            3   
3      home          color        74       3.852160            1   
4      home  certification        77       4.008329            1   
5      home            btu        22       1.145237            1   
6   jewelry   product_type        51       6.684142            1   
7   jewelry          brand        96      12.581913            1   
8   jewelry   model_number       130      17.038008            2   
9    office       capacity        45       2.101822            2   
10   office       material       118       5.511443            1   
11   office  pack_quantity        93       4.343765            1   
12   office          brand       244      11.396544            7   
13   office   paper_weight        19       0.887

llama3.1

In [None]:
# Identify true positives, false negatives, and false positives
true_positives_llama31 = model_output_llama31[model_output_llama31["Attribute"].isin(ground_truth["Attribute"])]
false_negatives_llama31 = ground_truth[~ground_truth["Attribute"].isin(model_output_llama31["Attribute"])]
false_positives_llama31 = model_output_llama31[~model_output_llama31["Attribute"].isin(ground_truth["Attribute"])]

In [None]:
# Calculate metrics
precision_llama31 = len(true_positives_llama31) / (len(true_positives_llama31) + len(false_positives_llama31)) if len(true_positives_llama31) + len(false_positives_llama31) > 0 else 0
recall_llama31 = len(true_positives_llama31) / (len(true_positives_llama31) + len(false_negatives_llama31)) if len(true_positives_llama31) + len(false_negatives_llama31) > 0 else 0
f1_score_llama31 = 2 * (precision_llama31 * recall_llama31) / (precision_llama31 + recall_llama31) if precision_llama31 + recall_llama31 > 0 else 0
coverage_llama31 = model_output_llama31["Attribute"].nunique() / ground_truth["Attribute"].nunique()

print(f"Precision_llama31: {precision_llama31:.2f}")
print(f"Recall_llama31: {recall_llama31:.2f}")
print(f"F1-Score_llama31: {f1_score_llama31:.2f}")
print(f"Coverage_llama31: {coverage_llama31:.2f}")

In [None]:
# Merge datasets for comparison
comparison_llama31 = ground_truth.merge(model_output_llama31, on=["Category", "Attribute"], suffixes=("_gt", "_model"))

# Calculate deviations
comparison_llama31["Deviation (Count)"] = comparison_llama31["Count_model"] - comparison_llama31["Count_gt"]
comparison_llama31["Deviation (Percentage)"] = comparison_llama31["Percentage_model"] - comparison_llama31["Percentage_gt"]

print(comparison_llama31)