In [13]:
import pandas as pd
import os

# Correct paths: one output CSV per model. These are relative paths, and
# load_and_tag_model derives each model tag from the file name, so renaming a
# file changes the value stored in the `model` column.
file_paths = [
    "output_v2_gemma3.csv",
    "output_v2_llama3.csv",
    "output_v2_qwen.csv"
]

# Load and tag each file
def load_and_tag_model(file_path):
    """Read one model-output CSV and tag every row with the model's name.

    The model name comes from the file name: the final extension is dropped
    and everything after the first ``output_`` marker is kept, so
    ``output_v2_gemma3.csv`` is tagged ``v2_gemma3``. If the file name has no
    ``output_`` marker, the whole extension-less name is used as the tag.

    Args:
        file_path: Path to the CSV file to load.

    Returns:
        The DataFrame read from ``file_path`` with an added ``model`` column
        holding the derived model name on every row.
    """
    # Strip only the trailing extension; a blanket replace(".csv", "") would
    # also delete ".csv" occurring in the middle of a name.
    stem, _extension = os.path.splitext(os.path.basename(file_path))

    # partition splits on the FIRST marker only, so a model name that itself
    # contains "output_" is kept whole instead of being truncated.
    _before, marker, after = stem.partition("output_")
    model_name = after if marker else stem

    # NOTE(review): the preview of the combined frame shows an "Unnamed: 0"
    # column, which suggests the CSVs were saved with their index; confirm
    # whether index_col=0 is wanted here.
    df = pd.read_csv(file_path)
    df["model"] = model_name
    return df

# Read every per-model CSV (each one tagged with its model name), then stack
# the frames into a single table with a fresh, continuous row index.
dfs = list(map(load_and_tag_model, file_paths))
df_all = pd.concat(dfs, ignore_index=True)

In [14]:
# Peek at the first five rows of the combined frame to sanity-check the load.
df_all.head(n=5)

Unnamed: 0,src_lang,tgt_lang,src_text,mt_text,hall_spans,class_hall,src_text_normalized,mt_text_normalized,direction,score_comet_qe,score_labse,score_laser,score_xnli,prediction,prediction_class,model
0,spa_Latn,eng_Latn,Si deseas aprender cómo lanzar un búmeran y qu...,If you want to learn how to throw a buffalo an...,If you want to learn how to throw a <<<buffalo...,2_Small_hallucination,Si deseas aprender cómo lanzar un búmeran y qu...,If you want to learn how to throw a buffalo an...,spa_Latn_eng_Latn,1.28336,-0.842943,-0.90595,-0.99853,"Okay, I understand the process. Please provide...",,v2_gemma3
1,spa_Latn,eng_Latn,"«Si se empieza a comercializar, deberíamos ten...","""If it's going to be commercialized, we should...","""If it's going to be commercialized, we should...",1_No_hallucination,"«Si se empieza a comercializar, deberíamos ten...","""If it's going to be commercialized, we should...",spa_Latn_eng_Latn,1.12454,-0.604786,-0.723758,-0.126141,"If it’s going to be commercialized, we should ...",4_Full_hallucination,v2_gemma3
2,spa_Latn,eng_Latn,"En la actualidad, brindar hospedaje y desayuno...","Today, providing luxurious accommodation and b...","Today, providing luxurious accommodation and b...",1_No_hallucination,"En la actualidad, brindar hospedaje y desayuno...","Today, providing luxurious accommodation and b...",spa_Latn_eng_Latn,0.568068,-0.93106,-0.894328,-0.999419,If you want to learn how to throw a <<<buffalo...,2_Small_hallucination,v2_gemma3
3,spa_Latn,eng_Latn,A ver como opinan los otro bilbiotecarios de l...,Let's see what the other fucked-up bilbiotecar...,Let's see what the other <<<fucked-up>>> bilbi...,3_Partial_hallucination,A ver como opinan los otro bilbiotecarios de l...,Let's see what the other fucked-up bilbiotecar...,spa_Latn_eng_Latn,1.824201,-0.883533,-0.921163,-0.995188,If you want to see what the other <<<fucked-up...,4_Full_hallucination,v2_gemma3
4,spa_Latn,eng_Latn,Siempre se ha llamado y seguirá llamando BANDO...,He's always called and he's always calling.,He's always called and he's always calling.,1_No_hallucination,Siempre se ha llamado y seguirá llamando BANDO...,He's always called and he's always calling.,spa_Latn_eng_Latn,1.973086,-0.680285,-0.559084,-0.004222,If you want to learn how to throw a <<<buffalo...,2_Small_hallucination,v2_gemma3


In [12]:
# ---------------------
# Per-class accuracy
# ---------------------
# NOTE(review): this cell's recorded execution count precedes the loading
# cell's, and its saved output lists model tags the loader does not produce.
# Re-run it after loading so the output reflects the current df_all.

# The combined frame previewed after loading stores the annotated label in
# `class_hall`, while this cell was written against a column named
# `ground_truth_classification`. Accept either so the cell runs on a fresh
# kernel instead of failing with an unexplained KeyError.
TRUTH_LABEL = 'ground_truth_classification'
truth_col = next(
    (col for col in (TRUTH_LABEL, 'class_hall') if col in df_all.columns),
    None,
)
if truth_col is None:
    raise KeyError(
        "df_all has neither 'ground_truth_classification' nor 'class_hall'; "
        "cannot compute per-class accuracy"
    )

# Flag whether each prediction matches the annotated class. A missing
# prediction_class (NaN) never compares equal to a label, so it counts as a miss.
df_all['correct_prediction'] = df_all['prediction_class'] == df_all[truth_col]

# Work on a renamed copy so the tables below always expose the label under
# 'ground_truth_classification', whichever source column was found.
scored = df_all.rename(columns={truth_col: TRUTH_LABEL})

# Row counts per (model, true class, hit/miss), with per-class totals and shares
grouped = (
    scored.groupby(['model', TRUTH_LABEL, 'correct_prediction'])
    .size()
    .reset_index(name='count')
)
grouped['total'] = grouped.groupby(['model', TRUTH_LABEL])['count'].transform('sum')
grouped['percentage'] = (grouped['count'] / grouped['total']) * 100

# Accuracy is the share of hits per (model, true class). Taking the mean of the
# boolean flag keeps classes with zero hits in the table as 0.0; filtering the
# breakdown on correct_prediction == True would silently drop them.
accuracy_by_class = (
    scored.groupby(['model', TRUTH_LABEL])['correct_prediction']
    .mean()
    .mul(100)
    .reset_index(name='accuracy_percent')
)

# Show results
print(accuracy_by_class.sort_values(by=['model', TRUTH_LABEL]))


             model ground_truth_classification  accuracy_percent
0         gemma22b          1_No_hallucination             100.0
1         gemma22b       2_Small_hallucination             100.0
2         gemma22b     3_Partial_hallucination             100.0
3         gemma22b        4_Full_hallucination             100.0
4   llama3.1latest          1_No_hallucination             100.0
5   llama3.1latest       2_Small_hallucination             100.0
6   llama3.1latest     3_Partial_hallucination             100.0
7   llama3.1latest        4_Full_hallucination             100.0
8        qwen2.53b          1_No_hallucination             100.0
9        qwen2.53b       2_Small_hallucination             100.0
10       qwen2.53b     3_Partial_hallucination             100.0
11       qwen2.53b        4_Full_hallucination             100.0
