# Evaluation classification methods

This notebook aims to evaluate on the same 200 labeeled reviews the 3 following methods of classification used ithin our pipelines :

- Keywords extractions
- BERT model finetunned
- LLM Mistral Small 7B

In [1]:
import torch
import re 
import ollama
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score,accuracy_score
import pandas as pd
from transformers import BertTokenizer, BertModel
import logging
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor
import os

In [2]:
# Simple loger for pipeline execution
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Filterign HTTP logging
class HttpStatusFilter(logging.Filter):
    def filter(self, record):
        message = record.getMessage()
        if 'HTTP/1.1 200' not in message:
            record.levelname = "WARNING"
            record.levelno = logging.WARNING
        return 'HTTP/1.1 200' not in message
    
logging.getLogger("httpx").addFilter(HttpStatusFilter())

In [3]:
# Global variables
load_dotenv(dotenv_path="../../.env")
NUM_THREAD = int(os.environ.get("NUM_THREADS"))
logger.info(f"NUM_THREAD fixed to {NUM_THREAD}")

2025-11-25 18:44:33,230 - INFO - NUM_THREAD fixed to 8


In [4]:
# Categories
classes = ["handicap", "pet", "child"]

# Test data loading
logger.info("Loading test data...")
df = pd.read_csv("../../data/original/fine_tunning/data_test.csv")
y_true = df[classes].values.astype(int)

2025-11-25 18:44:36,309 - INFO - Loading test data...


### Evaluation of Keyword extraction

In [7]:
# Load datasets
df_truth = df 
df_kw = pd.read_csv("../../data/processed/data_categorized/key_words_data_test.csv")

# Combine keywords per review (each review now has a unique id)
df_kw = (
    df_kw.groupby("id", as_index=False)
         .agg({
             "review": "first",
             "category": lambda x: " ".join(x.astype(str))
         })
)

# Initialize predictions at 0
df_pred = pd.DataFrame(0, index=df_truth.index, columns=classes)
df_pred["id"] = df_truth["id"]

# Prediction using keywords
df_pred = df_pred.set_index("id")

for _, row in df_kw.iterrows():
    review_id = row["id"]
    cat_list = str(row["category"]).strip().lower().split()  # split in case multiple categories concatenated
    for cat in cat_list:
        if cat in classes and review_id in df_pred.index:
            df_pred.at[review_id, cat] = 1

df_pred = df_pred.reset_index()  # restore id column

# Align truth and prediction
df_truth = df_truth.sort_values("id").reset_index(drop=True)
df_pred  = df_pred.sort_values("id").reset_index(drop=True)

y_true = df_truth[classes].values
y_pred = df_pred[classes].values


# Compute metrics per class
for i, label in enumerate(classes):
    acc = accuracy_score(y_true[:, i], y_pred[:, i])
    prec = precision_score(y_true[:, i], y_pred[:, i], zero_division=0)
    rec = recall_score(y_true[:, i], y_pred[:, i], zero_division=0)
    f1 = f1_score(y_true[:, i], y_pred[:, i], zero_division=0)

    print(f"Label: {label}")
    print(f" Accuracy : {acc:.4f}")
    print(f" Precision: {prec:.4f}")
    print(f" Recall   : {rec:.4f}")
    print(f" F1-score : {f1:.4f}\n")

# Global metrics
prec_micro = precision_score(y_true, y_pred, average="micro", zero_division=0)
rec_micro  = recall_score(y_true, y_pred, average="micro", zero_division=0)
f1_micro   = f1_score(y_true, y_pred, average="micro", zero_division=0)

prec_macro = precision_score(y_true, y_pred, average="macro", zero_division=0)
rec_macro  = recall_score(y_true, y_pred, average="macro", zero_division=0)
f1_macro   = f1_score(y_true, y_pred, average="macro", zero_division=0)

print("Global metrics:")
print(f" Micro Precision: {prec_micro:.4f}, Recall: {rec_micro:.4f}, F1: {f1_micro:.4f}")
print(f" Macro Precision: {prec_macro:.4f}, Recall: {rec_macro:.4f}, F1: {f1_macro:.4f}")


Label: handicap
 Accuracy : 0.9600
 Precision: 1.0000
 Recall   : 0.8689
 F1-score : 0.9298

Label: pet
 Accuracy : 0.9550
 Precision: 0.8732
 Recall   : 1.0000
 F1-score : 0.9323

Label: child
 Accuracy : 0.9600
 Precision: 0.8769
 Recall   : 1.0000
 F1-score : 0.9344

Global metrics:
 Micro Precision: 0.9101, Recall: 0.9556, F1: 0.9322
 Macro Precision: 0.9167, Recall: 0.9563, F1: 0.9322


### Evaluation of BERT model

In [8]:
# Parameters
BERT_PATH = "../../models/bert-base-uncased"
TOKENIZER_PATH = "../bert/bert_tokenizer_pt"
MODEL_WEIGHTS = "../bert/best_weights_v3.pth"
MAX_SEQ_LEN = 256
threshold = 0.95
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model definition
class BertMultiLabelClassifier(nn.Module):
    def __init__(self, n_classes, dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(BERT_PATH)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = torch.mean(outputs.last_hidden_state, dim=1)
        x = self.dropout(pooled_output)
        x = self.classifier(x)
        return self.sigmoid(x)


# Load model and tokenizer
logger.info("Loading tokenizer...")
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_PATH)

logger.info("Loading model...")
model = BertMultiLabelClassifier(n_classes=len(classes))
model.load_state_dict(torch.load(MODEL_WEIGHTS, map_location=device))
model.to(device)
model.eval()

# Encodign function
def encode_batch(sentences):
    encoded = tokenizer(
        list(sentences),
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQ_LEN,
        return_tensors="pt"
    )
    return encoded["input_ids"], encoded["attention_mask"]


# Prediction
logger.info("Predicting...")
input_ids, attention_mask = encode_batch(df["review"])

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)

with torch.no_grad():
    pred = model(input_ids=input_ids, attention_mask=attention_mask).cpu().numpy()

y_pred_bin = (pred > threshold).astype(int)


# Metrics
logger.info("Metrics multilabel")

for i, label in enumerate(classes):
    acc = accuracy_score(y_true[:, i], y_pred_bin[:, i])
    prec = precision_score(y_true[:, i], y_pred_bin[:, i], zero_division=0)
    rec = recall_score(y_true[:, i], y_pred_bin[:, i], zero_division=0)
    f1 = f1_score(y_true[:, i], y_pred_bin[:, i], zero_division=0)

    print(f"Label: {label}")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1-score : {f1:.4f}\n")


# Global metrics
prec_micro = precision_score(y_true, y_pred_bin, average="micro", zero_division=0)
rec_micro = recall_score(y_true, y_pred_bin, average="micro", zero_division=0)
f1_micro = f1_score(y_true, y_pred_bin, average="micro", zero_division=0)

prec_macro = precision_score(y_true, y_pred_bin, average="macro", zero_division=0)
rec_macro = recall_score(y_true, y_pred_bin, average="macro", zero_division=0)
f1_macro = f1_score(y_true, y_pred_bin, average="macro", zero_division=0)

logger.info("Global metrics")
print(f"Micro Precision: {prec_micro:.4f}, Recall: {rec_micro:.4f}, F1: {f1_micro:.4f}")
print(f"Macro Precision: {prec_macro:.4f}, Recall: {rec_macro:.4f}, F1: {f1_macro:.4f}")


2025-11-25 18:29:45,635 - INFO - Loading tokenizer...
2025-11-25 18:29:45,654 - INFO - Loading model...
2025-11-25 18:29:46,283 - INFO - Predicting...
2025-11-25 18:29:47,770 - INFO - Metrics multilabel
2025-11-25 18:29:47,787 - INFO - Global metrics


Label: handicap
Accuracy : 0.9300
Precision: 0.9608
Recall   : 0.8033
F1-score : 0.8750

Label: pet
Accuracy : 0.9300
Precision: 0.8636
Recall   : 0.9194
F1-score : 0.8906

Label: child
Accuracy : 0.9550
Precision: 0.9138
Recall   : 0.9298
F1-score : 0.9217

Micro Precision: 0.9086, Recall: 0.8833, F1: 0.8958
Macro Precision: 0.9127, Recall: 0.8842, F1: 0.8958


### Evaluation of LLM Mistral Small 7B

In [9]:
def classify_review_ollama(review_text, category, model="mistral"):
    """Classification via Ollama"""
    messages = [
        {"role": "system",
         "content": (
             "You are a strict classifier. Your task is to analyze a review and determine whether the "
             f"traveler(s) mentioned in the review have a very specific need in the category: '{category}'. "
             f"Respond strictly with 'yes' if the review indicates they travel with {category}, "
             "or 'no' if not. Your response must be ONE word only, without any explanation or extra text."
         )},
        {"role": "assistant",
         "content": "Understood. I will respond only with 'yes' or 'no', one word."},
        {"role": "user",
         "content": f"Here is the review to analyze:\n\n\"{review_text}\""}
    ]
    
    response = ollama.chat(model=model, messages=messages,options={"temperature": 0})
    answer = response["message"]["content"].strip().lower()
    cleaned = re.sub(r'[^a-z]', '', answer)
    
    return 1 if cleaned == 'yes' else 0

def classify_all_categories(review):
    return [classify_review_ollama(review, category) for category in classes]

# Prediction
with ThreadPoolExecutor(max_workers=NUM_THREAD) as executor:
    y_pred = list(executor.map(classify_all_categories, df["review"]))

# Convertir en matrice numpy si besoin :
import numpy as np
y_pred = np.array(y_pred, dtype=int)

# Metrics
for i, label in enumerate(classes):
    acc = accuracy_score(y_true[:, i], y_pred[:, i])
    prec = precision_score(y_true[:, i], y_pred[:, i], zero_division=0)
    rec = recall_score(y_true[:, i], y_pred[:, i], zero_division=0)
    f1 = f1_score(y_true[:, i], y_pred[:, i], zero_division=0)

    print(f"Label: {label}")
    print(f" Accuracy : {acc:.4f}")
    print(f" Precision: {prec:.4f}")
    print(f" Recall   : {rec:.4f}")
    print(f" F1-score : {f1:.4f}\n")


# Global metrics
prec_micro = precision_score(y_true, y_pred, average="micro", zero_division=0)
rec_micro  = recall_score(y_true, y_pred, average="micro", zero_division=0)
f1_micro   = f1_score(y_true, y_pred, average="micro", zero_division=0)

prec_macro = precision_score(y_true, y_pred, average="macro", zero_division=0)
rec_macro  = recall_score(y_true, y_pred, average="macro", zero_division=0)
f1_macro   = f1_score(y_true, y_pred, average="macro", zero_division=0)

print("Global metrics:")
print(f" Micro Precision: {prec_micro:.4f}, Recall: {rec_micro:.4f}, F1: {f1_micro:.4f}")
print(f" Macro Precision: {prec_macro:.4f}, Recall: {rec_macro:.4f}, F1: {f1_macro:.4f}")


Label: handicap
 Accuracy : 0.9550
 Precision: 0.9815
 Recall   : 0.8689
 F1-score : 0.9217

Label: pet
 Accuracy : 0.9400
 Precision: 0.8906
 Recall   : 0.9194
 F1-score : 0.9048

Label: child
 Accuracy : 0.9300
 Precision: 0.8772
 Recall   : 0.8772
 F1-score : 0.8772

Global metrics:
 Micro Precision: 0.9143, Recall: 0.8889, F1: 0.9014
 Macro Precision: 0.9164, Recall: 0.8885, F1: 0.9012


# Evaluation prompt few shot mistral small 7B

In [24]:
def classify_review_ollama(review_text, category, model="mistral"):
    """Classification with Ollama """
    
    if category == "child":
        messages = [
            {"role": "system",
             "content": (
                  "You are a strict family-review classifier. Your task is to analyze a review and determine "
                 "whether the traveler(s) are traveling with children. Especially, you need to determine"
                 " if these children have a high chance to be under 18 years old."
                 "Respond strictly with 'yes' if the review indicates people travelling with children, or 'no' if not. "
                 "ONE word only, no explanations or extra text."
             )},
            {"role": "assistant",
             "content": "Understood. I will respond only with 'yes' or 'no', one word."},
            {"role": "user",
             "content": (
                 "Here are some examples:\n"
                 "Review: \"We traveled with our kids and loved the family-friendly pool.\" -> yes\n"
                 "Review: \"The hotel was great, but we went alone as a couple.\" -> no\n"
                 "Review: \"I got there at 6:30, and a kid that apparently worked there (no id/uniform) was scrambling to set everything up\" -> no\n"
                 "Review: \"My Grand kids loved the pool\" -> yes\n"
                 "Review: \"I travelled to Dakota to see my son graduatation\" -> no\n"
                 "Review: \"This family owned business has a welcoming staff which made us feel right at home\" -> no\n"
                 "Review: \"If I had to ask one thing of Best Western, please replace the mattresses or box springs every time our kids moved at night\" -> yes\n\n"
                 f"Now classify this review:\n\"{review_text}\""
             )}
        ]
        
    elif category == "pet":
        messages = [
            {"role": "system",
             "content": (
                 "You are a strict pet-friendly-review classifier. Your task is to analyze a review and determine "
                 "whether the traveler(s) are traveling with pets. "
                 "Respond strictly with 'yes' if the review indicates they travel with pets, or 'no' if not. "
                 "ONE word only, no explanations or extra text."
             )},
            {"role": "assistant",
             "content": "Understood. I will respond only with 'yes' or 'no', one word."},
            {"role": "user",
             "content": (
                 "Here are some examples:\n"
                 "Review: \"Thanks again Cat!\" -> no\n"
                 "Review: \"I only booked this hotel because it was dog friendly\" -> yes\n"
                 "Review: \"I wanted to see if I could bring my service dog with me but they told me it was impossible at the front desk.\" -> yes\n"
                 "Review: \"The bedsheets were smelling cat urine. Horrible !\" -> no\n"
                 "Review: \"Perfect for travelers with cats or dogs.\" -> yes\n\n"
                 f"Now classify this review:\n\"{review_text}\""
             )}
        ]
        
    elif category == "handicap":
        messages = [
            {"role": "system",
             "content": (
                 "You are a strict business-travel-review classifier. Your task is to analyze a review and determine "
                 "whether the traveler(s) have any type of handicap or if the reviews contains a specific needs"
                 "associated with a disability (transporations, amenities, etc.) "
                 "Respond strictly with 'yes' if the review indicates a handicaped traveler or a special need related to handicap travelling, or 'no' if not. "
                 "ONE word only, no explanations or extra text."
             )},
            {"role": "assistant",
             "content": "Understood. I will respond only with 'yes' or 'no', one word."},
            {"role": "user",
             "content": (
                 "Here are some examples:\n"
                 "Review: \"Plant to go to London in September Need information about Accessible Van in London airport\" -> yes\n"
                 "Review: \"The room was great, big enough to move around in my power chair in both the bedroom and bathroom\" -> yes\n"
                 "Review: \"I would like to sell my wheelchair.please contact me\" -> no\n"
                 "Review: \"It's new digital travel magazine targeted exclusively for travelers with disabilities.\" -> no\n"
                 "Review: \"Nice roll-in shower with a pull-down bench, but the amenities were again too high\" -> yes\n\n"
                 f"Now classify this review:\n\"{review_text}\""
             )}
        ]
        
    else:
        raise ValueError(f"Unknown category: {category}")
    
    # Ici tu peux directement envoyer `messages` √† Ollama
    response = ollama.chat(model=model, messages=messages, options={"temperature": 0})
    answer = response["message"]["content"].strip().lower()
    cleaned = re.sub(r'[^a-z]', '', answer)
    
    return 1 if cleaned == 'yes' else 0


def classify_all_categories(review):
    return [classify_review_ollama(review, category) for category in classes]

# Prediction
with ThreadPoolExecutor(max_workers=NUM_THREAD) as executor:
    y_pred = list(executor.map(classify_all_categories, df["review"]))

# Convertir en matrice numpy si besoin :
import numpy as np
y_pred = np.array(y_pred, dtype=int)

# Metrics
for i, label in enumerate(classes):
    acc = accuracy_score(y_true[:, i], y_pred[:, i])
    prec = precision_score(y_true[:, i], y_pred[:, i], zero_division=0)
    rec = recall_score(y_true[:, i], y_pred[:, i], zero_division=0)
    f1 = f1_score(y_true[:, i], y_pred[:, i], zero_division=0)

    print(f"Label: {label}")
    print(f" Accuracy : {acc:.4f}")
    print(f" Precision: {prec:.4f}")
    print(f" Recall   : {rec:.4f}")
    print(f" F1-score : {f1:.4f}\n")


# Global metrics
prec_micro = precision_score(y_true, y_pred, average="micro", zero_division=0)
rec_micro  = recall_score(y_true, y_pred, average="micro", zero_division=0)
f1_micro   = f1_score(y_true, y_pred, average="micro", zero_division=0)

prec_macro = precision_score(y_true, y_pred, average="macro", zero_division=0)
rec_macro  = recall_score(y_true, y_pred, average="macro", zero_division=0)
f1_macro   = f1_score(y_true, y_pred, average="macro", zero_division=0)

print("Global metrics:")
print(f" Micro Precision: {prec_micro:.4f}, Recall: {rec_micro:.4f}, F1: {f1_micro:.4f}")
print(f" Macro Precision: {prec_macro:.4f}, Recall: {rec_macro:.4f}, F1: {f1_macro:.4f}")


Label: handicap
 Accuracy : 0.9750
 Precision: 0.9667
 Recall   : 0.9508
 F1-score : 0.9587

Label: pet
 Accuracy : 0.9650
 Precision: 0.9365
 Recall   : 0.9516
 F1-score : 0.9440

Label: child
 Accuracy : 0.9200
 Precision: 0.8475
 Recall   : 0.8772
 F1-score : 0.8621

Global metrics:
 Micro Precision: 0.9176, Recall: 0.9278, F1: 0.9227
 Macro Precision: 0.9169, Recall: 0.9265, F1: 0.9216


## Evluation using Human validation

In [None]:
import os
import pandas as pd
from pathlib import Path
from collections import defaultdict

BASE_DIR = Path("../../data/processed/final")
ORIGINAL_DIR = Path("../../data/processed/data_validated")

CATEGORIES = ["child", "handicap", "pet"]


def load_original_dataset(category):
    """Charge le CSV original de la cat√©gorie."""
    original_path = ORIGINAL_DIR / category
    # on cherche le ou les fichiers du dataset original
    files = list(original_path.glob("*.csv"))
    if not files:
        raise FileNotFoundError(f"Aucun fichier trouv√© pour la cat√©gorie : {category}")
    
    # S'il y a plusieurs fichiers, on les concat√®ne
    dfs = [pd.read_csv(f) for f in files]
    df = pd.concat(dfs, ignore_index=True)
    return df


def load_validated_files(category):
    """Charge les fichiers good/rejected d'une cat√©gorie."""
    folder = BASE_DIR / category
    files = list(folder.glob("*.csv"))
    return files


def process_category(category):
    original_df = load_original_dataset(category)
    validated_files = load_validated_files(category)

    all_records = []

    for file in validated_files:
        df = pd.read_csv(file)

        # D√©terminer s'il s'agit d'un fichier good ou rejected
        status_type = "good" if "good" in file.stem.lower() else "rejected"

        for _, row in df.iterrows():
            orig_idx = int(row["original_index"]) + 2  # d√©calage demand√©
            
            if orig_idx >= len(original_df):
                print(f"‚ö†Ô∏è Index hors limite dans {file.name} ({orig_idx})")
                continue
            
            orig_row = original_df.iloc[orig_idx]

            all_records.append({
                "dataset_file": file.name,
                "category": category,
                "subset": status_type,  # good / rejected
                "validation_status": orig_row.get("validation_status", None),
                "llm_child": orig_row.get("llm_child", None),
                "llm_pet": orig_row.get("llm_pet", None),
                "llm_handicap": orig_row.get("llm_handicap", None)
            })

    return pd.DataFrame(all_records)


def compute_statistics(df):
    stats = {}

    # Statistiques globales
    stats["global_counts"] = df["validation_status"].value_counts().to_dict()
    stats["global_by_subset"] = df.groupby("subset")["validation_status"].value_counts().unstack(fill_value=0)

    # Statistiques par cat√©gorie
    stats["by_category"] = df.groupby("category")["validation_status"].value_counts().unstack(fill_value=0)

    # Statistiques cat√©gorie √ó subset
    stats["by_category_and_subset"] = df.groupby(["category", "subset"])["validation_status"].value_counts().unstack(fill_value=0)

    # Statistiques par dataset (fichier)
    stats["by_dataset"] = df.groupby("dataset_file")["validation_status"].value_counts().unstack(fill_value=0)

    return stats


# MAIN WORKFLOW
all_categories_df = []

for category in CATEGORIES:
    print(f"Traitement cat√©gorie : {category}")
    df_cat = process_category(category)
    all_categories_df.append(df_cat)

final_df = pd.concat(all_categories_df, ignore_index=True)

stats = compute_statistics(final_df)

# Affichage final
print("\n========== STATISTIQUES GLOBALES ==========")
print(stats["global_counts"])
print("\n--- Par subset (good/rejected) ---")
print(stats["global_by_subset"])

print("\n========== STATISTIQUES PAR CAT√âGORIE ==========")
print(stats["by_category"])

print("\n========== STATISTIQUES PAR CAT√âGORIE ET SUBSET ==========")
print(stats["by_category_and_subset"])

print("\n========== STATISTIQUES PAR DATASET ==========")
print(stats["by_dataset"])


üìÇ Traitement cat√©gorie : child
üìÇ Traitement cat√©gorie : handicap
üìÇ Traitement cat√©gorie : pet

{'agreed': 1609, 'llm_validated': 1261, 'disputed': 50}

--- Par subset (good/rejected) ---
validation_status  agreed  disputed  llm_validated
subset                                            
good                 1016        31            819
rejected              593        19            442

validation_status  agreed  disputed  llm_validated
category                                          
child                 346        26           1025
handicap              664         0             91
pet                   599        24            145

validation_status  agreed  disputed  llm_validated
category subset                                   
child    good         228        17            661
         rejected     118         9            364
handicap good         465         0             70
         rejected     199         0             21
pet      good         323        1

In [11]:
import os
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix

BASE_DIR = Path("../../data/processed/final")
ORIGINAL_DIR = Path("../../data/processed/data_validated")

CATEGORIES = ["child", "handicap", "pet"]


# ------------------------------------------------------------
# 1. Chargement des fichiers originaux et des fichiers valid√©s
# ------------------------------------------------------------

def load_original_dataset(category):
    """Charge le CSV original de la cat√©gorie."""
    original_path = ORIGINAL_DIR / category
    files = list(original_path.glob("*.csv"))
    if not files:
        raise FileNotFoundError(f"Aucun fichier trouv√© pour la cat√©gorie : {category}")
    
    dfs = [pd.read_csv(f) for f in files]
    return pd.concat(dfs, ignore_index=True)


def load_validated_files(category):
    folder = BASE_DIR / category
    return list(folder.glob("*.csv"))


def process_category(category):
    original_df = load_original_dataset(category)
    validated_files = load_validated_files(category)

    all_records = []

    for file in validated_files:
        df = pd.read_csv(file)
        subset = "good" if "good" in file.stem.lower() else "rejected"

        for _, row in df.iterrows():
            orig_idx = int(row["original_index"]) + 2
            if orig_idx >= len(original_df):
                continue

            orig_row = original_df.iloc[orig_idx]

            all_records.append({
                "dataset_file": file.name,
                "category": category,
                "subset": subset,

                # Pipeline outputs
                "validation_status": orig_row.get("validation_status", None),

                # Predictions
                "keywords_child": orig_row.get("keywords_child", None),
                "bert_child": orig_row.get("bert_child", None),
                "llm_child": orig_row.get("llm_child", None),

                "keywords_pet": orig_row.get("keywords_pet", None),
                "bert_pet": orig_row.get("bert_pet", None),
                "llm_pet": orig_row.get("llm_pet", None),

                "keywords_handicap": orig_row.get("keywords_handicap", None),
                "bert_handicap": orig_row.get("bert_handicap", None),
                "llm_handicap": orig_row.get("llm_handicap", None),
            })

    return pd.DataFrame(all_records)


# ------------------------------------------------------------
# 2. Fusion globale des cat√©gories
# ------------------------------------------------------------

all_categories_df = []
for category in CATEGORIES:
    df_cat = process_category(category)
    all_categories_df.append(df_cat)

df = pd.concat(all_categories_df, ignore_index=True)


# ------------------------------------------------------------
# 3. Fonctions de m√©triques
# ------------------------------------------------------------

def agreement_rate(df):
    return (df.validation_status == "agreed").mean()


def disputed_rate(df):
    return (df.validation_status == "disputed").mean()


def accuracy_agreed(df):
    """On v√©rifie si agreed signifie r√©ellement coh√©rence keywords=BERT=LLM."""
    agreed_df = df[df.validation_status == "agreed"]

    acc_child = (agreed_df["keywords_child"] == agreed_df["llm_child"]).mean()
    acc_pet = (agreed_df["keywords_pet"] == agreed_df["llm_pet"]).mean()
    acc_handicap = (agreed_df["keywords_handicap"] == agreed_df["llm_handicap"]).mean()

    return {
        "child": acc_child,
        "pet": acc_pet,
        "handicap": acc_handicap,
        "global_mean": (acc_child + acc_pet + acc_handicap) / 3
    }


def bert_added_value(df):
    """Cas o√π le LLM valide BERT mais pas keywords."""
    count = 0
    total = 0

    for cat in CATEGORIES:
        kw = df[f"keywords_{cat}"]
        bert = df[f"bert_{cat}"]
        llm = df[f"llm_{cat}"]

        mask = (bert == llm) & (kw != llm)
        count += mask.sum()
        total += len(df)

    return count / total


def llm_validating_keywords(df):
    """LLM d'accord avec keywords quand keywords != BERT."""
    count = 0
    total = 0

    for cat in CATEGORIES:
        kw = df[f"keywords_{cat}"]
        bert = df[f"bert_{cat}"]
        llm = df[f"llm_{cat}"]

        mask = (kw == llm) & (kw != bert)
        count += mask.sum()
        total += len(df)

    return count / total


def llm_validating_bert(df):
    """LLM d'accord avec BERT quand keywords != BERT."""
    count = 0
    total = 0

    for cat in CATEGORIES:
        kw = df[f"keywords_{cat}"]
        bert = df[f"bert_{cat}"]
        llm = df[f"llm_{cat}"]

        mask = (bert == llm) & (kw != bert)
        count += mask.sum()
        total += len(df)

    return count / total


def clean_labels(series):
    """Nettoie et normalise les labels."""
    return (
        series
        .fillna("unknown")
        .replace("", "unknown")
        .replace("none", "unknown")
        .replace("None", "unknown")
        .astype(str)
    )

def confusion(df, model_col, cat):
    """Matrix de confusion model vs LLM (avec nettoyage robuste)."""
    gold = clean_labels(df[f"llm_{cat}"])
    pred = clean_labels(df[f"{model_col}_{cat}"])

    # Filtrer les cas o√π au moins une valeur est 'unknown'
    valid_mask = (gold != "unknown") & (pred != "unknown")
    gold_clean = gold[valid_mask]
    pred_clean = pred[valid_mask]

    if len(gold_clean) == 0:
        return "‚ö†Ô∏è No valid data for confusion matrix"

    labels = sorted(set(gold_clean) | set(pred_clean))
    return confusion_matrix(gold_clean, pred_clean, labels=labels), labels



# ------------------------------------------------------------
# 4. Calcul global des m√©triques
# ------------------------------------------------------------

results = {
    "agreement_rate": agreement_rate(df),
    "disputed_rate": disputed_rate(df),
    "accuracy_agreed": accuracy_agreed(df),
    "bert_added_value": bert_added_value(df),
    "llm_validates_keywords": llm_validating_keywords(df),
    "llm_validates_bert": llm_validating_bert(df),
    "confusion_keywords_vs_llm": {cat: confusion(df, "keywords", cat) for cat in CATEGORIES},
    "confusion_bert_vs_llm": {cat: confusion(df, "bert", cat) for cat in CATEGORIES},
}


# ------------------------------------------------------------
# 5. Affichage synth√©tique
# ------------------------------------------------------------

print("\n====== PIPELINE VALIDATION METRICS ======\n")
for key, value in results.items():
    print(f"{key}: {value}\n")




agreement_rate: 0.5510273972602739

disputed_rate: 0.017123287671232876

accuracy_agreed: {'child': np.float64(0.0), 'pet': np.float64(0.0), 'handicap': np.float64(0.0), 'global_mean': np.float64(0.0)}

bert_added_value: 0.8424657534246576

llm_validates_keywords: 0.0

llm_validates_bert: 0.8424657534246576

confusion_keywords_vs_llm: {'child': '‚ö†Ô∏è No valid data for confusion matrix', 'handicap': '‚ö†Ô∏è No valid data for confusion matrix', 'pet': '‚ö†Ô∏è No valid data for confusion matrix'}

confusion_bert_vs_llm: {'child': (array([[1420,    0],
       [1032,  468]]), ['0', '1']), 'handicap': (array([[1001,    0],
       [ 245, 1674]]), ['0', '1']), 'pet': (array([[2122,    0],
       [ 103,  695]]), ['0', '1'])}

