In [2]:
import pandas as pd

# Example data (replace with actual file paths)
ground_truth_path = "ground_truth.xlsx"
model_output_path_dsv3 = "DeepSeekV3_raw_Results.csv"



# Load data
ground_truth = pd.read_excel(ground_truth_path)
model_output_dsv3 = pd.read_csv(model_output_path_dsv3)



# Normalize attribute names
def normalize(data):
    data["Attribute"] = data["Attribute"].str.lower().str.strip()
    return data

ground_truth = normalize(ground_truth)
model_output_dsv3 = normalize(model_output_dsv3)


In [3]:
# Identify true positives, false negatives, and false positives
true_positives_dsv3 = model_output_dsv3[model_output_dsv3["Attribute"].isin(ground_truth["Attribute"])]
false_negatives_dsv3 = ground_truth[~ground_truth["Attribute"].isin(model_output_dsv3["Attribute"])]
false_positives_dsv3 = model_output_dsv3[~model_output_dsv3["Attribute"].isin(ground_truth["Attribute"])]

In [4]:
# Calculate metrics
precision_dsv3 = len(true_positives_dsv3 ) / (len(true_positives_dsv3 ) + len(false_positives_dsv3)) if len(true_positives_dsv3) + len(false_positives_dsv3) > 0 else 0
recall_dsv3 = len(true_positives_dsv3) / (len(true_positives_dsv3) + len(false_negatives_dsv3)) if len(true_positives_dsv3) + len(false_negatives_dsv3) > 0 else 0
f1_score_dsv3 = 2 * (precision_dsv3 * recall_dsv3) / (precision_dsv3+ recall_dsv3) if precision_dsv3 + recall_dsv3 > 0 else 0
coverage_dsv3 = model_output_dsv3["Attribute"].nunique() / ground_truth["Attribute"].nunique()

print(f"Precision_dsv3: {precision_dsv3:.2f}")
print(f"Recall_dsv3: {recall_dsv3:.2f}")
print(f"F1-Score_dsv3: {f1_score_dsv3:.2f}")
print(f"Coverage_dsv3: {coverage_dsv3:.2f}")

Precision_dsv3: 0.09
Recall_dsv3: 0.61
F1-Score_dsv3: 0.16
Coverage_dsv3: 8.30


In [5]:
# Merge datasets for comparison
comparison_dsv3 = ground_truth.merge(model_output_dsv3, on=["Category", "Attribute"], suffixes=("_gt", "_model"))

# Calculate deviations
comparison_dsv3["Deviation (Count)"] = comparison_dsv3["Count_model"] - comparison_dsv3["Count_gt"]
comparison_dsv3["Deviation (Percentage)"] = comparison_dsv3["Percentage_model"] - comparison_dsv3["Percentage_gt"]

print(comparison_dsv3)

   Category       Attribute  Count_gt  Percentage_gt  Count_model  \
0   grocery    packing_type        32       6.694561            5   
1   grocery          flavor        23       4.811715           12   
2   grocery    product_type        19       3.974895            4   
3   grocery           brand        57      11.924686           42   
4   grocery         organic         2       0.418410            1   
..      ...             ...       ...            ...          ...   
64   office        warranty         5       0.233536            5   
65   office         made_in         3       0.140121            1   
66   office  sustainability        21       0.980850            1   
67   office   compatibility        22       1.027557           44   
68   office   compatibility        22       1.027557            1   

    Percentage_model  Deviation (Count)  Deviation (Percentage)  
0               1.40                -27               -5.294561  
1               3.35                -11

gpt4omini

In [5]:
# Identify true positives, false negatives, and false positives
true_positives_gpt4omini = model_output_gpt4omini[model_output_gpt4omini["Attribute"].isin(ground_truth["Attribute"])]
false_negatives_gpt4omini = ground_truth[~ground_truth["Attribute"].isin(model_output_gpt4omini["Attribute"])]
false_positives_gpt4omini = model_output_gpt4omini[~model_output_gpt4omini["Attribute"].isin(ground_truth["Attribute"])]

In [6]:
# Calculate metrics
precision_gpt4omini = len(true_positives_gpt4omini) / (len(true_positives_gpt4omini) + len(false_positives_gpt4omini)) if len(true_positives_gpt4omini) + len(false_positives_gpt4omini) > 0 else 0
recall_gpt4omini = len(true_positives_gpt4omini) / (len(true_positives_gpt4omini) + len(false_negatives_gpt4omini)) if len(true_positives_gpt4omini) + len(false_negatives_gpt4omini) > 0 else 0
f1_score_gpt4omini = 2 * (precision_gpt4omini * recall_gpt4omini) / (precision_gpt4omini + recall_gpt4omini) if precision_gpt4omini + recall_gpt4omini > 0 else 0
coverage_gpt4omini = model_output_gpt4omini["Attribute"].nunique() / ground_truth["Attribute"].nunique()

print(f"Precision_gpt4omini: {precision_gpt4omini:.2f}")
print(f"Recall_gpt4omini: {recall_gpt4omini:.2f}")
print(f"F1-Score_gpt4omini: {f1_score_gpt4omini:.2f}")
print(f"Coverage_gpt4omini: {coverage_gpt4omini:.2f}")

Precision_gpt4omini: 0.18
Recall_gpt4omini: 0.15
F1-Score_gpt4omini: 0.16
Coverage_gpt4omini: 0.82


In [7]:
# Merge datasets for comparison
comparison_gpt4omini = ground_truth.merge(model_output_gpt4omini, on=["Category", "Attribute"], suffixes=("_gt", "_model"))

# Calculate deviations
comparison_gpt4omini["Deviation (Count)"] = comparison_gpt4omini["Count_model"] - comparison_gpt4omini["Count_gt"]
comparison_gpt4omini["Deviation (Percentage)"] = comparison_gpt4omini["Percentage_model"] - comparison_gpt4omini["Percentage_gt"]

print(comparison_gpt4omini)

   Category      Attribute  Count_gt  Percentage_gt  Count_model  \
0   grocery          brand        57      11.924686            1   
1      home       material       171       8.901614            1   
2      home       capacity        77       4.008329            3   
3      home          color        74       3.852160            1   
4      home  certification        77       4.008329            1   
5      home            btu        22       1.145237            1   
6   jewelry   product_type        51       6.684142            1   
7   jewelry          brand        96      12.581913            1   
8   jewelry   model_number       130      17.038008            2   
9    office       capacity        45       2.101822            2   
10   office       material       118       5.511443            1   
11   office  pack_quantity        93       4.343765            1   
12   office          brand       244      11.396544            7   
13   office   paper_weight        19       0.887

llama3.1

In [8]:
# Identify true positives, false negatives, and false positives
true_positives_llama31 = model_output_llama31[model_output_llama31["Attribute"].isin(ground_truth["Attribute"])]
false_negatives_llama31 = ground_truth[~ground_truth["Attribute"].isin(model_output_llama31["Attribute"])]
false_positives_llama31 = model_output_llama31[~model_output_llama31["Attribute"].isin(ground_truth["Attribute"])]

In [9]:
# Calculate metrics
precision_llama31 = len(true_positives_llama31) / (len(true_positives_llama31) + len(false_positives_llama31)) if len(true_positives_llama31) + len(false_positives_llama31) > 0 else 0
recall_llama31 = len(true_positives_llama31) / (len(true_positives_llama31) + len(false_negatives_llama31)) if len(true_positives_llama31) + len(false_negatives_llama31) > 0 else 0
f1_score_llama31 = 2 * (precision_llama31 * recall_llama31) / (precision_llama31 + recall_llama31) if precision_llama31 + recall_llama31 > 0 else 0
coverage_llama31 = model_output_llama31["Attribute"].nunique() / ground_truth["Attribute"].nunique()

print(f"Precision_llama31: {precision_llama31:.2f}")
print(f"Recall_llama31: {recall_llama31:.2f}")
print(f"F1-Score_llama31: {f1_score_llama31:.2f}")
print(f"Coverage_llama31: {coverage_llama31:.2f}")

Precision_llama31: 0.04
Recall_llama31: 0.42
F1-Score_llama31: 0.08
Coverage_llama31: 12.02


In [10]:
# Merge datasets for comparison
comparison_llama31 = ground_truth.merge(model_output_llama31, on=["Category", "Attribute"], suffixes=("_gt", "_model"))

# Calculate deviations
comparison_llama31["Deviation (Count)"] = comparison_llama31["Count_model"] - comparison_llama31["Count_gt"]
comparison_llama31["Deviation (Percentage)"] = comparison_llama31["Percentage_model"] - comparison_llama31["Percentage_gt"]

print(comparison_llama31)

   Category       Attribute  Count_gt  Percentage_gt  Count_model  \
0   grocery          flavor        23       4.811715            6   
1   grocery           brand        57      11.924686           16   
2   grocery           color         1       0.209205            1   
3   grocery          series        28       5.857741            3   
4      home        material       171       8.901614           73   
5      home          height        57       2.967205           11   
6      home        capacity        77       4.008329           30   
7      home           gauge        40       2.082249            6   
8      home          length        56       2.915148           19   
9      home           shape        28       1.457574           10   
10     home         voltage        26       1.353462            8   
11     home           color        74       3.852160           45   
12     home           width        75       3.904217           57   
13     home           depth       