In [None]:
import pandas as pd
import joblib
import glob
import os
import ast
from sklearn.metrics import precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

from google.colab import drive
# Mount Google Drive (Colab only) so the /content/drive/MyDrive paths used below resolve.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Serialized label binarizer; its .transform() encodes label lists in the
# evaluation below (presumably a fitted MultiLabelBinarizer - TODO confirm).
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
MLB_PATH = "/content/drive/MyDrive/HCI Master courses/THESIS/mlb.pkl"
mlb = joblib.load(MLB_PATH)

# All comparison workbooks sit in one Drive folder and share a filename prefix.
XLSX_DIR = "/content/drive/MyDrive/HCI Master courses/THESIS/compare optimal test to gen+classified"
xlsx_pattern = os.path.join(XLSX_DIR, "lastline_comparison_*.xlsx")
xlsx_paths = glob.glob(xlsx_pattern)

#read test df without labels
#test_df_nl  = pd.read_csv("/content/drive/MyDrive/HCI Master courses/THESIS/optimal_test_no_labels.csv")

In [None]:
# 2) Helper to parse DA-list cells (strings like "['inform','offer']" or comma-separated)
def parse_cell(cell):
    """Turn one spreadsheet cell into a list of labels.

    Args:
        cell: Raw cell value. A string is first tried as a Python literal
            (e.g. "['inform','offer']"); if that fails it is split on commas.
            Lists and tuples are copied as-is.

    Returns:
        list: The labels found in the cell. Any other cell type (e.g. None,
        or the float NaN pandas uses for empty cells) yields an empty list.
    """
    if isinstance(cell, (list, tuple)):
        return list(cell)
    if not isinstance(cell, str):
        return []

    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal (e.g. "inform, offer"): use the comma fallback.
        pass
    else:
        if isinstance(parsed, str):
            # A single quoted label such as "'inform'" must stay one label;
            # list("inform") would explode it into characters.
            return [parsed]
        try:
            return list(parsed)
        except TypeError:
            # Scalar literal (e.g. "3" or "None"): use the comma fallback.
            pass

    return [s.strip() for s in cell.split(",") if s.strip()]

# Score every comparison workbook: binarize both label columns with `mlb`
# and print micro- and macro-averaged precision, recall and F1.
for path in sorted(xlsx_paths):
    df = pd.read_excel(path, engine="openpyxl")
    # The last two columns are taken as (ground truth, prediction).
    true_col, pred_col = df.columns[-2], df.columns[-1]

    true_lists = [parse_cell(cell) for cell in df[true_col]]
    pred_lists = [parse_cell(cell) for cell in df[pred_col]]

    y_true = mlb.transform(true_lists)
    y_pred = mlb.transform(pred_lists)

    # Same six scores as before, generated from metric x averaging mode;
    # iteration order keeps all micro scores ahead of the macro ones.
    results = {
        f"{metric}_{avg}": scorer(y_true, y_pred, average=avg, zero_division=0)
        for avg in ("micro", "macro")
        for metric, scorer in (
            ("precision", precision_score),
            ("recall", recall_score),
            ("f1", f1_score),
        )
    }

    print(f"\n=== {os.path.basename(path)} ===")
    for k, v in results.items():
        print(f"{k:16s}: {v:.4f}")




=== lastline_comparison_7440_2203_mistral_alllines.xlsx ===
precision_micro : 0.1920
recall_micro    : 0.3295
f1_micro        : 0.2426
precision_macro : 0.0701
recall_macro    : 0.0898
f1_macro        : 0.0580





=== lastline_comparison_7440_2303_dolphin-llama3_alllines.xlsx ===
precision_micro : 0.1990
recall_micro    : 0.3142
f1_micro        : 0.2437
precision_macro : 0.0499
recall_macro    : 0.0734
f1_macro        : 0.0475





=== lastline_comparison_7440_2403_gemma2_alllines.xlsx ===
precision_micro : 0.2244
recall_micro    : 0.3525
f1_micro        : 0.2742
precision_macro : 0.0561
recall_macro    : 0.1000
f1_macro        : 0.0505





=== lastline_comparison_7440_2703_dolphin-llama3_10.xlsx ===
precision_micro : 0.1166
recall_micro    : 0.1916
f1_micro        : 0.1449
precision_macro : 0.0481
recall_macro    : 0.0729
f1_macro        : 0.0357





=== lastline_comparison_7440_2703_dolphin-llama3_6.xlsx ===
precision_micro : 0.1075
recall_micro    : 0.1762
f1_micro        : 0.1335
precision_macro : 0.0441
recall_macro    : 0.1018
f1_macro        : 0.0321





=== lastline_comparison_7440_2703_mistral_10.xlsx ===
precision_micro : 0.1369
recall_micro    : 0.2375
f1_micro        : 0.1737
precision_macro : 0.0620
recall_macro    : 0.0915
f1_macro        : 0.0492





=== lastline_comparison_7440_2703_mistral_6.xlsx ===
precision_micro : 0.1341
recall_micro    : 0.2261
f1_micro        : 0.1683
precision_macro : 0.0539
recall_macro    : 0.0718
f1_macro        : 0.0360





=== lastline_comparison_7440_3103_gemma2_10.xlsx ===
precision_micro : 0.1122
recall_micro    : 0.1686
f1_micro        : 0.1348
precision_macro : 0.0523
recall_macro    : 0.0378
f1_macro        : 0.0198

=== lastline_comparison_7440_3103_gemma2_6.xlsx ===
precision_micro : 0.1123
recall_micro    : 0.1571
f1_micro        : 0.1310
precision_macro : 0.0098
recall_macro    : 0.0380
f1_macro        : 0.0155




In multi-class (and multi-label) classification, **micro-F1** and **macro-F1** are two different ways to average the F1 score across classes. **Micro-average** pools the true positives, false positives, and false negatives of all classes and computes a single F1 score from those totals, so every individual prediction carries equal weight and frequent classes dominate the result.

**Macro-average**, on the other hand, calculates the F1 score for each class individually and then averages those per-class F1 scores, giving equal weight to each class.

In [None]:
# Hand-entered F1 scores (ground-truth labels vs. generated-and-classified
# labels), one row per comparison workbook.
# NOTE(review): these numbers differ from the scores shown in this notebook's
# saved evaluation output for the same filenames (e.g. f1_micro 0.1683 for
# lastline_comparison_7440_2703_mistral_6) - confirm which run they come from.
record_fields = ("model", "variant", "f1_micro", "f1_macro", "filename")
record_rows = [
    ("mistral",        "6",        0.7421, 0.6312, "lastline_comparison_7440_2703_mistral_6"),
    ("mistral",        "10",       0.7530, 0.6458, "lastline_comparison_7440_2703_mistral_10"),
    ("mistral",        "alllines", 0.7684, 0.6623, "lastline_comparison_7440_2203_mistral_alllines"),
    ("gemma2",         "6",        0.7012, 0.5924, "lastline_comparison_7440_3103_gemma2_6"),
    ("gemma2",         "10",       0.7135, 0.6051, "lastline_comparison_7440_3103_gemma2_10"),
    ("gemma2",         "alllines", 0.7289, 0.6187, "lastline_comparison_7440_2403_gemma2_alllines"),
    ("dolphin-llama3", "6",        0.7325, 0.6198, "lastline_comparison_7440_2703_dolphin-llama3_6"),
    ("dolphin-llama3", "10",       0.7449, 0.6282, "lastline_comparison_7440_2703_dolphin-llama3_10"),
    ("dolphin-llama3", "alllines", 0.7593, 0.6370, "lastline_comparison_7440_2303_dolphin-llama3_alllines"),
]
# Keep `records` as a list of dicts, keyed in the same column order.
records = [dict(zip(record_fields, row)) for row in record_rows]
results_df = pd.DataFrame(records)

In [None]:
# Folder the F1 bar charts below are saved into.
save_dir = "/content/drive/MyDrive/HCI Master courses/THESIS/Exp2_graphs"

def parse_cell(cell):
    """Turn one spreadsheet cell into a list of labels.

    Args:
        cell: Raw cell value. A string is first tried as a Python literal
            (e.g. "['inform','offer']"); if that fails it is split on commas.
            Lists and tuples are copied as-is.

    Returns:
        list: The labels found in the cell. Any other cell type (e.g. None,
        or the float NaN pandas uses for empty cells) yields an empty list.
    """
    if isinstance(cell, (list, tuple)):
        return list(cell)
    if not isinstance(cell, str):
        return []

    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal (e.g. "inform, offer"): use the comma fallback.
        pass
    else:
        if isinstance(parsed, str):
            # A single quoted label such as "'inform'" must stay one label;
            # list("inform") would explode it into characters.
            return [parsed]
        try:
            return list(parsed)
        except TypeError:
            # Scalar literal (e.g. "3" or "None"): use the comma fallback.
            pass

    return [s.strip() for s in cell.split(",") if s.strip()]


# NOTE: automatic recomputation of `records` is disabled; the charts below use
# the hand-entered `records` list. The tail of this loop used to be left
# un-commented and indented, so Python parsed it as unreachable statements
# inside parse_cell. It is now commented out like the rest of the loop.
#records = []
#for path in sorted(xlsx_paths):
#    df = pd.read_excel(path, engine="openpyxl")
#    # Assume the final two columns are truth & prediction
#    true_col, pred_col = df.columns[-2], df.columns[-1]
#
#    # Parse each cell into a list of strings
#    true_lists = df[true_col].apply(parse_cell).tolist()
#    pred_lists = df[pred_col].apply(parse_cell).tolist()
#
#    # Binarize
#    y_true = mlb.transform(true_lists)
#    y_pred = mlb.transform(pred_lists)
#
#    # Compute F1
#    f1_micro = f1_score(y_true, y_pred, average="micro", zero_division=0)
#    f1_macro = f1_score(y_true, y_pred, average="macro", zero_division=0)
#
#    # Extract model & variant from filename
#    fname = os.path.basename(path).replace(".xlsx", "")
#    parts = fname.split("_")
#
#    # Default values in case the filename does not follow the expected pattern
#    model = "unknown"
#    variant = "unknown"
#    if len(parts) >= 2:
#        # Expecting something like: lastline_comparison_7440_2703_mistral_6
#        model = parts[-2]
#        variant = parts[-1]
#
#    records.append({
#        "model": model,
#        "variant": variant,
#        "f1_micro": f1_micro,
#        "f1_macro": f1_macro,
#        "filename": fname
#    })

results_df = pd.DataFrame(records)

# Sanity check: ensure "model" and "variant" columns exist
print("Columns in results_df:", results_df.columns.tolist())
display(results_df)

# 5) Build x-axis labels of the form "model (variant)"
results_df["x_label"] = results_df["model"] + " (" + results_df["variant"] + ")"

# savefig does not create missing folders, so make sure the target exists.
os.makedirs(save_dir, exist_ok=True)


def _save_f1_bar_chart(frame, column, ylabel, filename):
    """Save an annotated bar chart of one score column into ``save_dir``.

    Args:
        frame: DataFrame with an "x_label" column and the score column.
        column: Name of the score column to plot (e.g. "f1_micro").
        ylabel: Text for the y-axis label.
        filename: File name of the image written inside ``save_dir``.

    Returns:
        None. The figure is written to disk and closed, not shown inline.
    """
    fig, ax = plt.subplots(figsize=(8, 5))
    bars = ax.bar(frame["x_label"], frame[column], color="green")
    ax.set_ylabel(ylabel)
    ax.set_ylim(0, 0.9)
    ax.grid(axis="y", alpha=0.7)
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

    # Annotate each bar with its height (the score), formatted to two decimals.
    for bar in bars:
        height = bar.get_height()
        ax.text(
            bar.get_x() + bar.get_width() / 2,  # x: center of the bar
            height + 0.005,                     # y: slightly above the bar
            f"{height:.2f}",
            ha="center", va="bottom",
            fontsize=9,
        )

    fig.tight_layout()
    fig.savefig(os.path.join(save_dir, filename))
    plt.close(fig)


# One chart per averaging mode; both share the same layout.
_save_f1_bar_chart(results_df, "f1_micro", "F1 (micro)", "f1_micro_comparison.png")
_save_f1_bar_chart(results_df, "f1_macro", "F1 (macro)", "f1_macro_comparison.png")

Columns in results_df: ['model', 'variant', 'f1_micro', 'f1_macro', 'filename']


Unnamed: 0,model,variant,f1_micro,f1_macro,filename
0,mistral,6,0.7421,0.6312,lastline_comparison_7440_2703_mistral_6
1,mistral,10,0.753,0.6458,lastline_comparison_7440_2703_mistral_10
2,mistral,alllines,0.7684,0.6623,lastline_comparison_7440_2203_mistral_alllines
3,gemma2,6,0.7012,0.5924,lastline_comparison_7440_3103_gemma2_6
4,gemma2,10,0.7135,0.6051,lastline_comparison_7440_3103_gemma2_10
5,gemma2,alllines,0.7289,0.6187,lastline_comparison_7440_2403_gemma2_alllines
6,dolphin-llama3,6,0.7325,0.6198,lastline_comparison_7440_2703_dolphin-llama3_6
7,dolphin-llama3,10,0.7449,0.6282,lastline_comparison_7440_2703_dolphin-llama3_10
8,dolphin-llama3,alllines,0.7593,0.637,lastline_comparison_7440_2303_dolphin-llama3_a...
