In [13]:
import pandas as pd
import os

# Input CSV paths (relative). load_and_tag_model derives each model label
# from the part of the file name that follows "output_".
file_paths = [
    "output_v2_gemma3.csv",
    "output_v2_llama3.csv",
    "output_v2_qwen.csv"
]

# Load a single CSV and tag its rows with the model label taken from the file name
def load_and_tag_model(file_path):
    """Read one CSV and add a ``model`` column derived from its file name.

    The label is the text that follows the first ``"output_"`` in the base
    name of ``file_path``, with every ``".csv"`` occurrence removed
    (``"output_v2_gemma3.csv"`` -> ``"v2_gemma3"``).

    Parameters
    ----------
    file_path : str
        Path to the CSV file; only its base name is used for the label.

    Returns
    -------
    pandas.DataFrame
        The file's contents with a ``model`` column holding the label on
        every row (an existing ``model`` column is overwritten).

    Raises
    ------
    IndexError
        If the base name does not contain ``"output_"``. This is raised
        before the file is read.
    """
    base_name = os.path.basename(file_path)
    # Element 1 is whatever sits between the first and second "output_" marker
    pieces = base_name.split("output_")
    label = pieces[1].replace(".csv", "")
    return pd.read_csv(file_path).assign(model=label)

# Read every listed file, then stack the tagged frames into a single table
# with a fresh 0..n-1 row index
dfs = list(map(load_and_tag_model, file_paths))
df_all = pd.concat(dfs, axis=0, ignore_index=True)

In [12]:
# ---------------------
# Per-class accuracy
# ---------------------
# NOTE(review): the output recorded below this cell lists model labels such as
# "gemma22b", which the loading cell would not derive from the current file
# names, and the cells' execution counts are out of order. Re-run the notebook
# top to bottom before trusting the printed table.

# Flag each row whose predicted class equals the ground-truth class
df_all['correct_prediction'] = df_all['prediction_class'] == df_all['ground_truth_classification']

# Row counts per (model, true class, correct/incorrect)
grouped = (
    df_all.groupby(['model', 'ground_truth_classification', 'correct_prediction'])
    .size()
    .reset_index(name='count')
)

# Total rows per (model, true class), broadcast back onto every count row
grouped['total'] = grouped.groupby(['model', 'ground_truth_classification'])['count'].transform('sum')

# Share of each correct/incorrect bucket within its (model, true class) group
grouped['percentage'] = (grouped['count'] / grouped['total']) * 100

# Accuracy = share of correct rows within each (model, true class) group.
# Taking the mean of the boolean flag, instead of keeping only the
# correct_prediction == True rows of `grouped`, keeps a class with zero correct
# predictions in the table at 0.0 rather than silently dropping it.
accuracy_by_class = (
    df_all.groupby(['model', 'ground_truth_classification'])['correct_prediction']
    .mean()
    .mul(100)
    .reset_index(name='accuracy_percent')
)

# Show the results
print(accuracy_by_class.sort_values(by=['model', 'ground_truth_classification']))


             model ground_truth_classification  accuracy_percent
0         gemma22b          1_No_hallucination             100.0
1         gemma22b       2_Small_hallucination             100.0
2         gemma22b     3_Partial_hallucination             100.0
3         gemma22b        4_Full_hallucination             100.0
4   llama3.1latest          1_No_hallucination             100.0
5   llama3.1latest       2_Small_hallucination             100.0
6   llama3.1latest     3_Partial_hallucination             100.0
7   llama3.1latest        4_Full_hallucination             100.0
8        qwen2.53b          1_No_hallucination             100.0
9        qwen2.53b       2_Small_hallucination             100.0
10       qwen2.53b     3_Partial_hallucination             100.0
11       qwen2.53b        4_Full_hallucination             100.0
