In [2]:
import pandas as pd

# Load the ground truth and mapping table
ground_truth_path = "ground_truth.xlsx"
mapping_table_path = "Manual_Mapping_Table.xlsx"
ground_truth = pd.read_excel(ground_truth_path)
manual_mapping_table = pd.read_excel(mapping_table_path)

# Load the model outputs
chatgpt4_output = pd.read_csv("raw_results_chatgpt4_statistic.csv")
chatgpt4omini_output = pd.read_csv("raw_results_chatgpt4omini.csv")
llama3_output = pd.read_csv("raw_results_llama3_1.csv")

# Normalize data for consistency
manual_mapping_table["Extracted Attribute"] = manual_mapping_table["Extracted Attribute"].str.lower().str.strip()
manual_mapping_table["Mapped Attribute"] = manual_mapping_table["Mapped Attribute"].str.lower().str.strip()
manual_mapping_table["Category"] = manual_mapping_table["Category"].str.lower().str.strip()

ground_truth["Attribute"] = ground_truth["Attribute"].str.lower().str.strip()
ground_truth["Category"] = ground_truth["Category"].str.lower().str.strip()

for output in [chatgpt4_output, chatgpt4omini_output, llama3_output]:
    output["Attribute"] = output["Attribute"].str.lower().str.strip()
    output["Category"] = output["Category"].str.lower().str.strip()

# Apply manual mapping to model outputs
def apply_mapping(model_output, mapping_table):
    """
    Map extracted attributes in model output to ground truth attributes using the consolidated mapping table.
    """
    # Merge model output with the mapping table
    mapped_output = model_output.merge(
        mapping_table,
        left_on=["Category", "Attribute"],
        right_on=["Category", "Extracted Attribute"],
        how="left"
    )
    
    # Use the 'Mapped Attribute' if matched, or retain the original attribute if not
    mapped_output["Mapped Attribute"] = mapped_output["Mapped Attribute"].fillna(mapped_output["Attribute"])
    
    # Keep only relevant columns
    mapped_output = mapped_output[["Category", "Mapped Attribute", "Count", "Percentage"]]

    return mapped_output

# Apply the mapping function to each model output
mapped_chatgpt4 = apply_mapping(chatgpt4_output, manual_mapping_table)
mapped_chatgpt4omini = apply_mapping(chatgpt4omini_output, manual_mapping_table)
mapped_llama3 = apply_mapping(llama3_output, manual_mapping_table)

# Calculate evaluation metrics
def calculate_metrics(mapped_output, ground_truth):
    """
    Calculate Precision, Recall, F1-Score, and Coverage based on mapped attributes.
    """
    # True positives
    true_positives = mapped_output[mapped_output["Mapped Attribute"].isin(ground_truth["Attribute"])]
    
    # False positives
    false_positives = mapped_output[~mapped_output["Mapped Attribute"].isin(ground_truth["Attribute"])]
    
    # False negatives
    false_negatives = ground_truth[~ground_truth["Attribute"].isin(mapped_output["Mapped Attribute"])]

    # Calculate metrics
    precision = len(true_positives) / (len(true_positives) + len(false_positives)) if len(true_positives) + len(false_positives) > 0 else 0
    recall = len(true_positives) / (len(true_positives) + len(false_negatives)) if len(true_positives) + len(false_negatives) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
    coverage = mapped_output["Mapped Attribute"].nunique() / ground_truth["Attribute"].nunique()

    return precision, recall, f1_score, coverage

# Evaluate metrics for each model
metrics_chatgpt4 = calculate_metrics(mapped_chatgpt4, ground_truth)
metrics_chatgpt4omini = calculate_metrics(mapped_chatgpt4omini, ground_truth)
metrics_llama3 = calculate_metrics(mapped_llama3, ground_truth)

# Print results
print(f"GPT-4 Metrics: Precision: {metrics_chatgpt4[0]:.2f}, Recall: {metrics_chatgpt4[1]:.2f}, F1-Score: {metrics_chatgpt4[2]:.2f}, Coverage: {metrics_chatgpt4[3]:.2f}")
print(f"GPT-4o Mini Metrics: Precision: {metrics_chatgpt4omini[0]:.2f}, Recall: {metrics_chatgpt4omini[1]:.2f}, F1-Score: {metrics_chatgpt4omini[2]:.2f}, Coverage: {metrics_chatgpt4omini[3]:.2f}")
print(f"LLaMA 3.1 Metrics: Precision: {metrics_llama3[0]:.2f}, Recall: {metrics_llama3[1]:.2f}, F1-Score: {metrics_llama3[2]:.2f}, Coverage: {metrics_llama3[3]:.2f}")


GPT-4 Metrics: Precision: 0.19, Recall: 0.93, F1-Score: 0.31, Coverage: 13.85
GPT-4o Mini Metrics: Precision: 0.38, Recall: 0.32, F1-Score: 0.35, Coverage: 0.77
LLaMA 3.1 Metrics: Precision: 0.20, Recall: 0.90, F1-Score: 0.33, Coverage: 10.80


In [3]:
mapped_chatgpt4.to_csv('mapped_chatgpt4.csv')

In [None]:
mapped_chatgpt4omini.to_csv('mapped_chatgpt4omini.csv')
mapped_llama3.to_csv('mapped_llama3.csv')