In [None]:
import pandas as pd
from datasets import Dataset
from dawid_skene_model import list2array, DawidSkeneModel
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from collections import Counter
import os

In [None]:
# Load the comparison table. Later cells read one column per model name plus
# the "winner" column, which is used as the ground truth.
qa = pd.read_csv("updated_faq.csv")
display(qa)

In [None]:
# Agreement of each individual model with the ground truth ("winner" column).
# A boolean "Agreement <model>-Groundtruth" column is stored in qa for every
# model, and its mean is printed as a percentage.
model_columns = ["GPT", "DistilBERT", "MiniSBERT", "Llama", "Gemma"]
print("\nAgreement of each model with the groundtruth:")
for model in model_columns:
    agreement_column = f"Agreement {model}-Groundtruth"
    print(f"{model}:")
    qa[agreement_column] = qa[model].eq(qa["winner"])
    print(100 * qa[agreement_column].mean(), "\n")

# Panel of 3 elements

In [None]:
# HELPER FUNCTIONS
def dataframe_to_dataset_list(df, model_columns):
    """Convert the per-model answers in ``df`` into a nested list of numeric labels.

    Args:
        df: DataFrame holding one column per name in ``model_columns``.
        model_columns: Column names to read, in the order they should appear
            in each task.

    Returns:
        A list with one entry per row of ``df``. Each entry is a list with one
        single-element list per model: ``[0]`` when the model's answer equals
        ``"model_b"``, ``[1]`` for any other value.
    """
    def encode(answer):
        # Only the exact string "model_b" maps to class 0; everything else is class 1.
        if answer == "model_b":
            return [0]
        return [1]

    return [
        [encode(record[column]) for column in model_columns]
        for _, record in df.iterrows()
    ]

In [None]:
import pandas as pd
from collections import Counter

def majority_voting_analysis(qa, model_columns):
    """Aggregate a panel's votes by majority and by Dawid & Skene, then score both.

    The function mutates ``qa`` in place, adding (or overwriting) these columns:
    ``"Majority Voting"``, ``"Dawid & Skene Multi-Class"``,
    ``"Agreement D&S-Groundtruth"`` and ``"Agreement Majority-Groundtruth"``.

    Args:
        qa: DataFrame with one column per name in ``model_columns`` and a
            ``"winner"`` column used as ground truth.
        model_columns: Names of the model columns forming the voting panel.

    Returns:
        A two-row DataFrame with the percentage of rows on which each
        aggregation method equals ``qa["winner"]``.
    """
    def pick_most_voted(row):
        # Counter.most_common keeps first-encountered order among equal counts,
        # so a split vote goes to the label seen first in model_columns order.
        tally = Counter(row[name] for name in model_columns)
        return tally.most_common(1)[0][0]

    # Majority vote per row
    qa["Majority Voting"] = qa.apply(pick_most_voted, axis=1)

    # Two classes: 0 for "model_b", 1 for anything else (see dataframe_to_dataset_list)
    class_num = 2
    dataset_tensor = list2array(class_num, dataframe_to_dataset_list(qa, model_columns))

    # Fit the Dawid & Skene model; only the last of the four outputs is used here
    ds_model = DawidSkeneModel(class_num=class_num, max_iter=40, tolerance=1e-5)
    _, _, _, predict_label = ds_model.run(dataset_tensor)

    # Map each per-task result back to a label.
    # NOTE(review): presumably predict_label holds one per-class score vector per
    # task, with index 1 matching the [1] encoding — confirm against the library.
    qa["Dawid & Skene Multi-Class"] = [
        "model_a" if scores[1] > 0.5 else "model_b" for scores in predict_label
    ]

    # Row-wise agreement with the ground truth
    qa["Agreement D&S-Groundtruth"] = qa["Dawid & Skene Multi-Class"] == qa["winner"]
    qa["Agreement Majority-Groundtruth"] = qa["Majority Voting"] == qa["winner"]

    # Summary table: method name paired with the agreement column it is scored on
    scored = [
        ("Majority Voting", "Agreement Majority-Groundtruth"),
        ("Dawid & Skene Multi-Class", "Agreement D&S-Groundtruth"),
    ]
    return pd.DataFrame({
        "Metodo": [method for method, _ in scored],
        "Concordanza con Groundtruth (%)": [qa[column].mean() * 100 for _, column in scored],
    })

In [None]:
from sklearn.metrics import cohen_kappa_score
from scipy.stats import pearsonr, kendalltau
import pandas as pd

def evaluate_models(qa):
    """Compare the two aggregated predictions in ``qa`` against the ground truth.

    Reads the ``"winner"``, ``"Majority Voting"`` and
    ``"Dawid & Skene Multi-Class"`` columns, encodes their labels as integers
    and computes Cohen's kappa, Pearson correlation, Kendall tau and the number
    of exact matches for each prediction column.

    Args:
        qa: DataFrame containing the three columns named above (the prediction
            columns are written by ``majority_voting_analysis``).

    Returns:
        A DataFrame with columns ``"Metric"``, ``"Majority Voting"`` and
        ``"Dawid & Skene"``; the last row holds ``"<matches>/<total>"`` strings.

    Raises:
        KeyError: If any of the three required columns is missing.
    """
    truth_labels = qa["winner"]
    mv_labels = qa["Majority Voting"]
    ds_labels = qa["Dawid & Skene Multi-Class"]

    # Build the label -> integer mapping from every label seen in any of the
    # three columns. Mapping from the ground truth alone turned a prediction
    # label absent from "winner" into NaN, which then reached the metrics.
    # Ground-truth labels come first, so their codes are the same as before.
    observed_labels = pd.unique(
        pd.concat([truth_labels, mv_labels, ds_labels], ignore_index=True)
    )
    label_mapping = {label: idx for idx, label in enumerate(observed_labels)}

    ground_truth = truth_labels.map(label_mapping)
    majority_voting_preds = mv_labels.map(label_mapping)
    ds_preds = ds_labels.map(label_mapping)

    # Cohen's kappa
    kappa_mv = cohen_kappa_score(majority_voting_preds, ground_truth)
    kappa_ds = cohen_kappa_score(ds_preds, ground_truth)

    # Pearson and Kendall tau are computed on the integer codes. The codes follow
    # first-appearance order, so with more than two labels these two values
    # depend on an arbitrary ordering of the labels.
    pearson_mv, _ = pearsonr(majority_voting_preds, ground_truth)
    pearson_ds, _ = pearsonr(ds_preds, ground_truth)

    kendall_mv, _ = kendalltau(majority_voting_preds, ground_truth)
    kendall_ds, _ = kendalltau(ds_preds, ground_truth)

    # Exact matches
    mv_matches = (majority_voting_preds == ground_truth).sum()
    ds_matches = (ds_preds == ground_truth).sum()

    total = len(qa)

    evaluation_results = {
        "Metric": ["Cohen’s Kappa", "Pearson Correlation", "Kendall-Tau", "Exact Matches"],
        "Majority Voting": [kappa_mv, pearson_mv, kendall_mv, f"{mv_matches}/{total}"],
        "Dawid & Skene": [kappa_ds, pearson_ds, kendall_ds, f"{ds_matches}/{total}"]
    }

    # Returned as a DataFrame so the notebook renders it as a table
    return pd.DataFrame(evaluation_results)

In [None]:
# Three-model panel. majority_voting_analysis writes its prediction columns
# into qa, which evaluate_models then reads.
model_columns = ["GPT", "Llama", "Gemma"] # different foundation models, as in the papers
summary = majority_voting_analysis(qa, model_columns)
display(summary)

# Last expression of the cell: the metrics table is shown as the cell output.
evaluate_models(qa)

In [None]:
# Three-model panel. Running this overwrites the prediction and agreement
# columns that a previous panel run stored in qa.
model_columns = ["MiniGptBased", "Llama", "GPT"] # best performing models, without Mixtral
summary = majority_voting_analysis(qa, model_columns)
display(summary)

# Last expression of the cell: the metrics table is shown as the cell output.
evaluate_models(qa)

In [None]:
# Four-model panel. With an even panel a split vote is resolved by
# Counter.most_common in majority_voting_analysis, i.e. in favour of the label
# given by the earliest-listed model, so the order of this list matters.
model_columns = ["Mixtral", "MiniGptBased", "Llama", "GPT"] # best performing models
summary = majority_voting_analysis(qa, model_columns)
display(summary)

# Last expression of the cell: the metrics table is shown as the cell output.
evaluate_models(qa)

In [None]:
# Four-model panel. With an even panel a split vote is resolved by
# Counter.most_common in majority_voting_analysis, i.e. in favour of the label
# given by the earliest-listed model, so the order of this list matters.
model_columns = ["MiniSBERT", "DistilBERT", "Gemma", "DeepHaiku"] # worst performing models
summary = majority_voting_analysis(qa, model_columns)
display(summary)

# Last expression of the cell: the metrics table is shown as the cell output.
evaluate_models(qa)

In [None]:
# Nine-model panel. Running this overwrites the prediction and agreement
# columns that a previous panel run stored in qa.
model_columns = ["Mixtral", "Llama", "Gemma", "MiniGptBased", "DistilBERT", "MiniSBERT", "GPT", "DeepHaiku", "PalmBonsai"]
summary = majority_voting_analysis(qa, model_columns)
display(summary)

# Last expression of the cell: the metrics table is shown as the cell output.
evaluate_models(qa)

In [None]:
# Eight-model panel: the nine-model list without "Mixtral". With an even panel
# a split vote is resolved by Counter.most_common in majority_voting_analysis,
# i.e. in favour of the label given by the earliest-listed model.
model_columns = ["Llama", "Gemma", "MiniGptBased", "DistilBERT", "MiniSBERT", "GPT", "DeepHaiku", "PalmBonsai"]
summary = majority_voting_analysis(qa, model_columns)
display(summary)

# Last expression of the cell: the metrics table is shown as the cell output.
evaluate_models(qa)