In [1]:
import pandas as pd

# Load the model outputs
gpt4_outputs = pd.read_csv("raw_results_chatgpt4_statistic.csv")
gpt4omini_outputs = pd.read_csv("raw_results_chatgpt4omini.csv")
llama3_outputs = pd.read_csv("raw_results_llama3_1.csv")

# Load the manual mapping table
mapping_table = pd.read_excel("Manual_Mapping_Table.xlsx")

# Load the ground truth
ground_truth = pd.read_excel("ground_truth.xlsx")


In [2]:
import Levenshtein

def calculate_similarity(attribute, ground_truth_attributes, threshold=0.8):
    """
    Calculate the similarity between an extracted attribute and ground truth attributes.
    Return the best match if similarity exceeds the threshold.
    """
    best_match = None
    highest_similarity = 0
    
    for gt_attribute in ground_truth_attributes:
        # Calculate normalized Levenshtein similarity
        similarity = 1 - Levenshtein.distance(attribute, gt_attribute) / max(len(attribute), len(gt_attribute))
        if similarity > highest_similarity:
            highest_similarity = similarity
            best_match = gt_attribute
    
    # Only return the match if it meets the threshold
    if highest_similarity >= threshold:
        return best_match, highest_similarity
    return None, highest_similarity


In [3]:
def evaluate_model(model_outputs, ground_truth_attributes, threshold=0.8):
    results = []
    
    for index, row in model_outputs.iterrows():
        extracted_attribute = row["Attribute"]  # Replace "Attribute" with your column name
        category = row["Category"]  # Replace "Category" with your column name
        
        # Filter ground truth attributes by category
        gt_attributes = ground_truth[ground_truth["Category"] == category]["Attribute"]
        
        # Calculate similarity
        best_match, similarity = calculate_similarity(extracted_attribute, gt_attributes, threshold)
        results.append({
            "Extracted Attribute": extracted_attribute,
            "Best Match": best_match,
            "Similarity": similarity,
            "Category": category
        })
    
    return pd.DataFrame(results)

# Evaluate each model
gpt4_similarity_results = evaluate_model(gpt4_outputs, ground_truth, threshold=0.8)
gpt4omini_similarity_results = evaluate_model(gpt4omini_outputs, ground_truth, threshold=0.8)
llama3_similarity_results = evaluate_model(llama3_outputs, ground_truth, threshold=0.8)

# Save results to files for analysis
gpt4_similarity_results.to_csv("gpt4_similarity_results.csv", index=False)
gpt4omini_similarity_results.to_csv("gpt4omini_similarity_results.csv", index=False)
llama3_similarity_results.to_csv("llama3_similarity_results.csv", index=False)


In [4]:
def analyze_thresholds(model_outputs, ground_truth_attributes, thresholds):
    threshold_results = []
    
    for threshold in thresholds:
        # Evaluate model at current threshold
        results = evaluate_model(model_outputs, ground_truth_attributes, threshold)
        
        # Calculate metrics
        matched = results["Best Match"].notna().sum()
        total_extracted = len(results)
        total_ground_truth = len(ground_truth_attributes)
        
        precision = matched / total_extracted if total_extracted > 0 else 0
        recall = matched / total_ground_truth if total_ground_truth > 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
        
        threshold_results.append({
            "Threshold": threshold,
            "Precision": precision,
            "Recall": recall,
            "F1-Score": f1_score
        })
    
    return pd.DataFrame(threshold_results)

thresholds = [0.7, 0.75, 0.8, 0.85, 0.9]
gpt4_threshold_analysis = analyze_thresholds(gpt4_outputs, ground_truth, thresholds)
print(gpt4_threshold_analysis)


   Threshold  Precision    Recall  F1-Score
0       0.70   0.056812  0.809917  0.106176
1       0.75   0.052754  0.752066  0.098592
2       0.80   0.041739  0.595041  0.078007
3       0.85   0.019710  0.280992  0.036836
4       0.90   0.008696  0.123967  0.016251


In [5]:
gpt4omini_threshold_analysis = analyze_thresholds(gpt4omini_outputs, ground_truth, thresholds)
print(gpt4omini_threshold_analysis)

   Threshold  Precision    Recall  F1-Score
0       0.70       0.23  0.190083  0.208145
1       0.75       0.22  0.181818  0.199095
2       0.80       0.16  0.132231  0.144796
3       0.85       0.14  0.115702  0.126697
4       0.90       0.14  0.115702  0.126697


In [6]:
llama3_threshold_analysis = analyze_thresholds(llama3_outputs, ground_truth, thresholds)
print(llama3_threshold_analysis)

   Threshold  Precision    Recall  F1-Score
0       0.70   0.044355  0.454545  0.080823
1       0.75   0.041129  0.421488  0.074945
2       0.80   0.032258  0.330579  0.058780
3       0.85   0.014516  0.148760  0.026451
4       0.90   0.004839  0.049587  0.008817


In [12]:
# Extract unique attributes for each model
gpt4_attributes = list(set(gpt4_outputs["Attribute"]))
gpt4omini_attributes = list(set(gpt4omini_outputs["Attribute"]))
llama3_attributes = list(set(llama3_outputs["Attribute"]))

print(f"GPT-4: {len(gpt4_attributes)} attributes")
print(f"GPT-4o Mini: {len(gpt4omini_attributes)} attributes")
print(f"LLaMA 3.1: {len(llama3_attributes)} attributes")

NameError: name 'gpt4_output' is not defined