# Evaluation classification methods

This notebook aims to evaluate on the same 200 labeeled reviews the 3 following methods of classification used ithin our pipelines :

- Keywords extractions
- BERT model finetunned
- LLM Mistral Small 7B

In [1]:
import torch
import re 
import ollama
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score,accuracy_score
import pandas as pd
from transformers import BertTokenizer, BertModel
import logging
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor
import os

In [2]:
# Simple loger for pipeline execution
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Filterign HTTP logging
class HttpStatusFilter(logging.Filter):
    def filter(self, record):
        message = record.getMessage()
        if 'HTTP/1.1 200' not in message:
            record.levelname = "WARNING"
            record.levelno = logging.WARNING
        return 'HTTP/1.1 200' not in message
    
logging.getLogger("httpx").addFilter(HttpStatusFilter())

In [3]:
# Global variables
load_dotenv(dotenv_path="../../.env")
NUM_THREAD = int(os.environ.get("NUM_THREADS"))
logger.info(f"NUM_THREAD fixed to {NUM_THREAD}")

2025-12-02 11:53:11,164 - INFO - NUM_THREAD fixed to 10


In [4]:
# Categories
classes = ["handicap", "pet", "child"]

# Test data loading
logger.info("Loading test data...")
df = pd.read_csv("../../data/original/fine_tunning/data_test.csv")
y_true = df[classes].values.astype(int)

2025-12-02 11:53:17,603 - INFO - Loading test data...


### Evaluation of Keyword extraction

In [5]:
# Load datasets
df_truth = df 
df_kw = pd.read_csv("../../data/processed/data_categorized/key_words_data_test.csv")

# Combine keywords per review (each review now has a unique id)
df_kw = (
    df_kw.groupby("id", as_index=False)
         .agg({
             "review": "first",
             "category": lambda x: " ".join(x.astype(str))
         })
)

# Initialize predictions at 0
df_pred = pd.DataFrame(0, index=df_truth.index, columns=classes)
df_pred["id"] = df_truth["id"]

# Prediction using keywords
df_pred = df_pred.set_index("id")

for _, row in df_kw.iterrows():
    review_id = row["id"]
    cat_list = str(row["category"]).strip().lower().split()  # split in case multiple categories concatenated
    for cat in cat_list:
        if cat in classes and review_id in df_pred.index:
            df_pred.at[review_id, cat] = 1

df_pred = df_pred.reset_index()  # restore id column

# Align truth and prediction
df_truth = df_truth.sort_values("id").reset_index(drop=True)
df_pred  = df_pred.sort_values("id").reset_index(drop=True)

y_true = df_truth[classes].values
y_pred = df_pred[classes].values


# Compute metrics per class
for i, label in enumerate(classes):
    acc = accuracy_score(y_true[:, i], y_pred[:, i])
    prec = precision_score(y_true[:, i], y_pred[:, i], zero_division=0)
    rec = recall_score(y_true[:, i], y_pred[:, i], zero_division=0)
    f1 = f1_score(y_true[:, i], y_pred[:, i], zero_division=0)

    print(f"Label: {label}")
    print(f" Accuracy : {acc:.4f}")
    print(f" Precision: {prec:.4f}")
    print(f" Recall   : {rec:.4f}")
    print(f" F1-score : {f1:.4f}\n")

# Global metrics
prec_micro = precision_score(y_true, y_pred, average="micro", zero_division=0)
rec_micro  = recall_score(y_true, y_pred, average="micro", zero_division=0)
f1_micro   = f1_score(y_true, y_pred, average="micro", zero_division=0)

prec_macro = precision_score(y_true, y_pred, average="macro", zero_division=0)
rec_macro  = recall_score(y_true, y_pred, average="macro", zero_division=0)
f1_macro   = f1_score(y_true, y_pred, average="macro", zero_division=0)

print("Global metrics:")
print(f" Micro Precision: {prec_micro:.4f}, Recall: {rec_micro:.4f}, F1: {f1_micro:.4f}")
print(f" Macro Precision: {prec_macro:.4f}, Recall: {rec_macro:.4f}, F1: {f1_macro:.4f}")


Label: handicap
 Accuracy : 0.9600
 Precision: 1.0000
 Recall   : 0.8689
 F1-score : 0.9298

Label: pet
 Accuracy : 0.9550
 Precision: 0.8732
 Recall   : 1.0000
 F1-score : 0.9323

Label: child
 Accuracy : 0.9600
 Precision: 0.8769
 Recall   : 1.0000
 F1-score : 0.9344

Global metrics:
 Micro Precision: 0.9101, Recall: 0.9556, F1: 0.9322
 Macro Precision: 0.9167, Recall: 0.9563, F1: 0.9322


### Evaluation of BERT model

In [8]:
# Parameters
BERT_PATH = "../../models/bert-base-uncased"
TOKENIZER_PATH = "../bert/bert_tokenizer_pt"
MODEL_WEIGHTS = "../bert/best_weights_v3.pth"
MAX_SEQ_LEN = 256
threshold = 0.97
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model definition
class BertMultiLabelClassifier(nn.Module):
    def __init__(self, n_classes, dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(BERT_PATH)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = torch.mean(outputs.last_hidden_state, dim=1)
        x = self.dropout(pooled_output)
        x = self.classifier(x)
        return self.sigmoid(x)


# Load model and tokenizer
logger.info("Loading tokenizer...")
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_PATH)

logger.info("Loading model...")
model = BertMultiLabelClassifier(n_classes=len(classes))
model.load_state_dict(torch.load(MODEL_WEIGHTS, map_location=device))
model.to(device)
model.eval()

# Encodign function
def encode_batch(sentences):
    encoded = tokenizer(
        list(sentences),
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQ_LEN,
        return_tensors="pt"
    )
    return encoded["input_ids"], encoded["attention_mask"]


# Prediction
logger.info("Predicting...")
input_ids, attention_mask = encode_batch(df["review"])

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)

with torch.no_grad():
    pred = model(input_ids=input_ids, attention_mask=attention_mask).cpu().numpy()

y_pred_bin = (pred > threshold).astype(int)


# Metrics
logger.info("Metrics multilabel")

for i, label in enumerate(classes):
    acc = accuracy_score(y_true[:, i], y_pred_bin[:, i])
    prec = precision_score(y_true[:, i], y_pred_bin[:, i], zero_division=0)
    rec = recall_score(y_true[:, i], y_pred_bin[:, i], zero_division=0)
    f1 = f1_score(y_true[:, i], y_pred_bin[:, i], zero_division=0)

    print(f"Label: {label}")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1-score : {f1:.4f}\n")


# Global metrics
prec_micro = precision_score(y_true, y_pred_bin, average="micro", zero_division=0)
rec_micro = recall_score(y_true, y_pred_bin, average="micro", zero_division=0)
f1_micro = f1_score(y_true, y_pred_bin, average="micro", zero_division=0)

prec_macro = precision_score(y_true, y_pred_bin, average="macro", zero_division=0)
rec_macro = recall_score(y_true, y_pred_bin, average="macro", zero_division=0)
f1_macro = f1_score(y_true, y_pred_bin, average="macro", zero_division=0)

logger.info("Global metrics")
print(f"Micro Precision: {prec_micro:.4f}, Recall: {rec_micro:.4f}, F1: {f1_micro:.4f}")
print(f"Macro Precision: {prec_macro:.4f}, Recall: {rec_macro:.4f}, F1: {f1_macro:.4f}")


2025-12-02 11:55:14,304 - INFO - Loading tokenizer...
2025-12-02 11:55:14,363 - INFO - Loading model...
2025-12-02 11:55:16,530 - INFO - Predicting...
2025-12-02 11:55:19,185 - INFO - Metrics multilabel
2025-12-02 11:55:19,273 - INFO - Global metrics


Label: handicap
Accuracy : 0.9300
Precision: 0.9796
Recall   : 0.7869
F1-score : 0.8727

Label: pet
Accuracy : 0.9350
Precision: 0.8769
Recall   : 0.9194
F1-score : 0.8976

Label: child
Accuracy : 0.9550
Precision: 0.9138
Recall   : 0.9298
F1-score : 0.9217

Micro Precision: 0.9186, Recall: 0.8778, F1: 0.8977
Macro Precision: 0.9234, Recall: 0.8787, F1: 0.8974


### Evaluation of LLM Mistral Small 7B

In [8]:
def classify_review_ollama(review_text, category, model="mistral"):
    """Classification via Ollama"""
    messages = [
        {"role": "system",
         "content": (
             "You are a strict classifier. Your task is to analyze a review and determine whether the "
             f"traveler(s) mentioned in the review have a very specific need in the category: '{category}'. "
             f"Respond strictly with 'yes' if the review indicates they travel with {category}, "
             "or 'no' if not. Your response must be ONE word only, without any explanation or extra text."
         )},
        {"role": "assistant",
         "content": "Understood. I will respond only with 'yes' or 'no', one word."},
        {"role": "user",
         "content": f"Here is the review to analyze:\n\n\"{review_text}\""}
    ]
    
    response = ollama.chat(model=model, messages=messages,options={"temperature": 0})
    answer = response["message"]["content"].strip().lower()
    cleaned = re.sub(r'[^a-z]', '', answer)
    
    return 1 if cleaned == 'yes' else 0

def classify_all_categories(review):
    return [classify_review_ollama(review, category) for category in classes]

# Prediction
with ThreadPoolExecutor(max_workers=NUM_THREAD) as executor:
    y_pred = list(executor.map(classify_all_categories, df["review"]))

# Convertir en matrice numpy si besoin :
import numpy as np
y_pred = np.array(y_pred, dtype=int)

# Metrics
for i, label in enumerate(classes):
    acc = accuracy_score(y_true[:, i], y_pred[:, i])
    prec = precision_score(y_true[:, i], y_pred[:, i], zero_division=0)
    rec = recall_score(y_true[:, i], y_pred[:, i], zero_division=0)
    f1 = f1_score(y_true[:, i], y_pred[:, i], zero_division=0)

    print(f"Label: {label}")
    print(f" Accuracy : {acc:.4f}")
    print(f" Precision: {prec:.4f}")
    print(f" Recall   : {rec:.4f}")
    print(f" F1-score : {f1:.4f}\n")


# Global metrics
prec_micro = precision_score(y_true, y_pred, average="micro", zero_division=0)
rec_micro  = recall_score(y_true, y_pred, average="micro", zero_division=0)
f1_micro   = f1_score(y_true, y_pred, average="micro", zero_division=0)

prec_macro = precision_score(y_true, y_pred, average="macro", zero_division=0)
rec_macro  = recall_score(y_true, y_pred, average="macro", zero_division=0)
f1_macro   = f1_score(y_true, y_pred, average="macro", zero_division=0)

print("Global metrics:")
print(f" Micro Precision: {prec_micro:.4f}, Recall: {rec_micro:.4f}, F1: {f1_micro:.4f}")
print(f" Macro Precision: {prec_macro:.4f}, Recall: {rec_macro:.4f}, F1: {f1_macro:.4f}")


Label: handicap
 Accuracy : 0.9550
 Precision: 0.9815
 Recall   : 0.8689
 F1-score : 0.9217

Label: pet
 Accuracy : 0.9400
 Precision: 0.9032
 Recall   : 0.9032
 F1-score : 0.9032

Label: child
 Accuracy : 0.9300
 Precision: 0.8772
 Recall   : 0.8772
 F1-score : 0.8772

Global metrics:
 Micro Precision: 0.9191, Recall: 0.8833, F1: 0.9008
 Macro Precision: 0.9206, Recall: 0.8831, F1: 0.9007


# Evaluation prompt few shot mistral small 7B

In [7]:
def classify_review_ollama(review_text, category, model="mistral"):
    """Classification with Ollama """
    
    if category == "child":
        messages = [
            {"role": "system",
             "content": (
                  "You are a strict family-review classifier. Your task is to analyze a review and determine "
                 "whether the traveler(s) are traveling with children. Especially, you need to determine"
                 " if these children have a high chance to be under 18 years old."
                 "Respond strictly with 'yes' if the review indicates people travelling with children, or 'no' if not. "
                 "ONE word only, no explanations or extra text."
             )},
            {"role": "assistant",
             "content": "Understood. I will respond only with 'yes' or 'no', one word."},
            {"role": "user",
             "content": (
                 "Here are some examples:\n"
                 "Review: \"We traveled with our kids and loved the family-friendly pool.\" -> yes\n"
                 "Review: \"The hotel was great, but we went alone as a couple.\" -> no\n"
                 "Review: \"I got there at 6:30, and a kid that apparently worked there (no id/uniform) was scrambling to set everything up\" -> no\n"
                 "Review: \"My Grand kids loved the pool\" -> yes\n"
                 "Review: \"I travelled to Dakota to see my son graduatation\" -> no\n"
                 "Review: \"This family owned business has a welcoming staff which made us feel right at home\" -> no\n"
                 "Review: \"If I had to ask one thing of Best Western, please replace the mattresses or box springs every time our kids moved at night\" -> yes\n\n"
                 f"Now classify this review:\n\"{review_text}\""
             )}
        ]
        
    elif category == "pet":
        messages = [
            {"role": "system",
             "content": (
                 "You are a strict pet-friendly-review classifier. Your task is to analyze a review and determine "
                 "whether the traveler(s) are traveling with pets. "
                 "Respond strictly with 'yes' if the review indicates they travel with pets, or 'no' if not. "
                 "ONE word only, no explanations or extra text."
             )},
            {"role": "assistant",
             "content": "Understood. I will respond only with 'yes' or 'no', one word."},
            {"role": "user",
             "content": (
                 "Here are some examples:\n"
                 "Review: \"Thanks again Cat!\" -> no\n"
                 "Review: \"I only booked this hotel because it was dog friendly\" -> yes\n"
                 "Review: \"I wanted to see if I could bring my service dog with me but they told me it was impossible at the front desk.\" -> yes\n"
                 "Review: \"The bedsheets were smelling cat urine. Horrible !\" -> no\n"
                 "Review: \"Perfect for travelers with cats or dogs.\" -> yes\n\n"
                 f"Now classify this review:\n\"{review_text}\""
             )}
        ]
        
    elif category == "handicap":
        messages = [
            {"role": "system",
             "content": (
                 "You are a strict business-travel-review classifier. Your task is to analyze a review and determine "
                 "whether the traveler(s) have any type of handicap or if the reviews contains a specific needs"
                 "associated with a disability (transporations, amenities, etc.) "
                 "Respond strictly with 'yes' if the review indicates a handicaped traveler or a special need related to handicap travelling, or 'no' if not. "
                 "ONE word only, no explanations or extra text."
             )},
            {"role": "assistant",
             "content": "Understood. I will respond only with 'yes' or 'no', one word."},
            {"role": "user",
             "content": (
                 "Here are some examples:\n"
                 "Review: \"Plant to go to London in September Need information about Accessible Van in London airport\" -> yes\n"
                 "Review: \"The room was great, big enough to move around in my power chair in both the bedroom and bathroom\" -> yes\n"
                 "Review: \"I would like to sell my wheelchair.please contact me\" -> no\n"
                 "Review: \"It's new digital travel magazine targeted exclusively for travelers with disabilities.\" -> no\n"
                 "Review: \"Nice roll-in shower with a pull-down bench, but the amenities were again too high\" -> yes\n\n"
                 f"Now classify this review:\n\"{review_text}\""
             )}
        ]
        
    else:
        raise ValueError(f"Unknown category: {category}")
    
    # Ici tu peux directement envoyer `messages` à Ollama
    response = ollama.chat(model=model, messages=messages, options={"temperature": 0})
    answer = response["message"]["content"].strip().lower()
    cleaned = re.sub(r'[^a-z]', '', answer)
    
    return 1 if cleaned == 'yes' else 0


def classify_all_categories(review):
    return [classify_review_ollama(review, category) for category in classes]

# Prediction
with ThreadPoolExecutor(max_workers=NUM_THREAD) as executor:
    y_pred = list(executor.map(classify_all_categories, df["review"]))

# Convertir en matrice numpy si besoin :
import numpy as np
y_pred = np.array(y_pred, dtype=int)

# Metrics
for i, label in enumerate(classes):
    acc = accuracy_score(y_true[:, i], y_pred[:, i])
    prec = precision_score(y_true[:, i], y_pred[:, i], zero_division=0)
    rec = recall_score(y_true[:, i], y_pred[:, i], zero_division=0)
    f1 = f1_score(y_true[:, i], y_pred[:, i], zero_division=0)

    print(f"Label: {label}")
    print(f" Accuracy : {acc:.4f}")
    print(f" Precision: {prec:.4f}")
    print(f" Recall   : {rec:.4f}")
    print(f" F1-score : {f1:.4f}\n")


# Global metrics
prec_micro = precision_score(y_true, y_pred, average="micro", zero_division=0)
rec_micro  = recall_score(y_true, y_pred, average="micro", zero_division=0)
f1_micro   = f1_score(y_true, y_pred, average="micro", zero_division=0)

prec_macro = precision_score(y_true, y_pred, average="macro", zero_division=0)
rec_macro  = recall_score(y_true, y_pred, average="macro", zero_division=0)
f1_macro   = f1_score(y_true, y_pred, average="macro", zero_division=0)

print("Global metrics:")
print(f" Micro Precision: {prec_micro:.4f}, Recall: {rec_micro:.4f}, F1: {f1_micro:.4f}")
print(f" Macro Precision: {prec_macro:.4f}, Recall: {rec_macro:.4f}, F1: {f1_macro:.4f}")


Label: handicap
 Accuracy : 0.9800
 Precision: 0.9831
 Recall   : 0.9508
 F1-score : 0.9667

Label: pet
 Accuracy : 0.9650
 Precision: 0.9365
 Recall   : 0.9516
 F1-score : 0.9440

Label: child
 Accuracy : 0.9200
 Precision: 0.8475
 Recall   : 0.8772
 F1-score : 0.8621

Global metrics:
 Micro Precision: 0.9227, Recall: 0.9278, F1: 0.9252
 Macro Precision: 0.9223, Recall: 0.9265, F1: 0.9242


## Evluation using Human validation

In [None]:
import pandas as pd
from pathlib import Path

FINAL_PATH = Path("../../data/processed/final")
VALIDATED_PATH = Path("../../data/processed/data_validated")
CATEGORIES = ["child", "pet", "handicap"]

def load_human_labels():
    rows = []
    for cat in CATEGORIES:
        cat_dir = FINAL_PATH / cat
        for file in cat_dir.glob("*.csv"):
            df = pd.read_csv(file)
            if "review" not in df.columns.tolist():
                print(file.name)
            truth = 1 if "good" in file.stem else 0
            df["human_truth"] = truth
            df["category"] = cat
            rows.append(df[["id","review","human_truth","category"]])
    return pd.concat(rows, ignore_index=True)

def load_validated_data():
    dfs = []
    for cat in CATEGORIES:
        for file in (VALIDATED_PATH / cat).glob("*.csv"):
            df = pd.read_csv(file)
            df["category"] = cat
            dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

human_df = load_human_labels()
validated_df = load_validated_data()

merged = human_df.merge(validated_df, on=["id", "category"], how="left")
merged_unique = merged.drop_duplicates(subset=["id", "review_x","category"])
merged_unique = merged_unique.rename(columns={"review_x": "review"})
merged_unique = merged_unique.drop(columns=["review_y"])

final_cols = [
    "id", "review", "category", "human_truth", "validation_status",
    "llm_child", "llm_pet", "llm_handicap",
    "bert_child", "bert_pet", "bert_handicap",
    "kw_child", "kw_pet", "kw_handicap"
]

final_dataset = merged_unique[final_cols]
final_dataset["review"] = final_dataset["review"].str.replace("\n", " ", regex=False)
final_dataset["review"] = final_dataset["review"].str.replace("\r", " ", regex=False)

print(f"NB REVIEWS : {len(final_dataset)}")

#final_dataset.to_csv("merged_final_dataset.csv", index=False)



validated_data_accessiblego_child_good.csv -> ['original_index', 'review', 'id']

validated_data_accessiblego_child_rejected.csv -> ['original_index', 'review', 'id']

validated_data_activities_reviews_child_good.csv -> ['original_index', 'review', 'id']

validated_data_activities_reviews_child_rejected.csv -> ['original_index', 'review', 'id']

validated_data_airline_reviews_1_child_good.csv -> ['original_index', 'review', 'id']

validated_data_airline_reviews_1_child_rejected.csv -> ['original_index', 'review', 'id']

validated_data_airline_reviews_2_child_good.csv -> ['original_index', 'review', 'id']

validated_data_airline_reviews_2_child_rejected.csv -> ['original_index', 'review', 'id']

validated_data_booking_child_good.csv -> ['original_index', 'review', 'id']

validated_data_booking_child_rejected.csv -> ['original_index', 'review', 'id']

validated_data_european_hotel_reviews_child_good.csv -> ['original_index', 'review', 'id']

validated_data_european_hotel_reviews_child_r

In [4]:
def compute_statistics(df: pd.DataFrame):
    
    stats = {}

    # 1️⃣ Nombre total de reviews
    stats["total_reviews"] = len(df)

    # 2️⃣ Nombre de reviews par catégorie + proportion
    cat_counts = df["category"].value_counts()
    cat_props = df["category"].value_counts(normalize=True)
    stats["reviews_per_category"] = pd.DataFrame({
        "count": cat_counts,
        "proportion": cat_props
    })

    # 3️⃣ Nombre de reviews par validation_status + proportion
    val_counts = df["validation_status"].value_counts()
    val_props = df["validation_status"].value_counts(normalize=True)
    stats["reviews_per_validation_status"] = pd.DataFrame({
        "count": val_counts,
        "proportion": val_props
    })

    # 4️⃣ Nombre de reviews par validation_status pour chaque catégorie + proportion par catégorie
    stats["reviews_per_status_per_category"] = (
        df.groupby(["category", "validation_status"])
          .size()
          .to_frame("count")
          .groupby(level=0)
          .apply(lambda x: x.assign(proportion=x["count"]/x["count"].sum()))
    )

    return stats

In [5]:
def display_statistics(stats: dict):
    print("=== STATISTICS ===\n")
    
    # Nombre total de reviews
    print(f"Total Reviews: {stats['total_reviews']}\n")
    
    # Reviews par catégorie
    print("Reviews per Category:")
    display(stats['reviews_per_category'].sort_index())
    print("\n")
    
    # Reviews par validation_status
    print("Reviews per Validation Status:")
    display(stats['reviews_per_validation_status'].sort_index())
    print("\n")
    
    # Reviews par validation_status par catégorie
    print("Reviews per Validation Status per Category:")
    display(stats['reviews_per_status_per_category'])
    print("\n")

In [16]:
statistics = compute_statistics(final_dataset)
display_statistics(statistics)

=== STATISTICS ===

Total Reviews: 3300

Reviews per Category:


Unnamed: 0_level_0,count,proportion
category,Unnamed: 1_level_1,Unnamed: 2_level_1
child,1575,0.477273
handicap,877,0.265758
pet,848,0.25697




Reviews per Validation Status:


Unnamed: 0_level_0,count,proportion
validation_status,Unnamed: 1_level_1,Unnamed: 2_level_1
agreed,2437,0.738485
disputed,92,0.027879
llm_validated,771,0.233636




Reviews per Validation Status per Category:


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,proportion
category,category,validation_status,Unnamed: 3_level_1,Unnamed: 4_level_1
child,child,agreed,1196,0.759365
child,child,disputed,23,0.014603
child,child,llm_validated,356,0.226032
handicap,handicap,agreed,573,0.653364
handicap,handicap,disputed,34,0.038769
handicap,handicap,llm_validated,270,0.307868
pet,pet,agreed,668,0.787736
pet,pet,disputed,35,0.041274
pet,pet,llm_validated,145,0.170991






In [None]:
def compute_pipeline_metrics(df: pd.DataFrame):
    metrics = {}

    total_reviews = len(df)

    # -------------------------
    # 1️⃣ Premier layer : Keywords
    # -------------------------
    kw_correct = (
        ((df["kw_child"] == df["llm_child"]) & (df["human_truth"] == 1)) &
        ((df["kw_pet"] == df["llm_pet"]) & (df["human_truth"] == 1)) &
        ((df["kw_handicap"] == df["llm_handicap"]) & (df["human_truth"] == 1))
    )
    metrics["keywords_precision"] = kw_correct.sum() / total_reviews
    metrics["keywords_coverage"] = total_reviews / total_reviews

    # -------------------------
    # 2️⃣ Second layer : BERT validation
    # -------------------------
    bert_correct = (
        ((df["bert_child"] == df["llm_child"]) & (df["human_truth"] == 1)) &
        ((df["bert_pet"] == df["llm_pet"]) & (df["human_truth"] == 1)) &
        ((df["bert_handicap"] == df["llm_handicap"]) & (df["human_truth"] == 1))
    )
    bert_correct_kw_incorrect = (
        ((df["kw_child"] != df["llm_child"]) & (df["bert_child"] == df["llm_child"]) & (df["human_truth"] == 1)) &
        ((df["kw_pet"] != df["llm_pet"]) & (df["bert_pet"] == df["llm_pet"]) & (df["human_truth"] == 1)) &
        ((df["kw_handicap"] != df["llm_handicap"]) & (df["bert_handicap"] == df["llm_handicap"]) & (df["human_truth"] == 1))
    )
    metrics["bert_layer_precision"] = bert_correct.sum() / total_reviews
    metrics["bert_layer_gain"] = bert_correct_kw_incorrect.sum() / total_reviews
    metrics["bert_layer_coverage"] = total_reviews / total_reviews

    # -------------------------
    # 3️⃣ Third layer : LLM validation
    # -------------------------
    correct = (df["human_truth"] == 1)
    llm_correct = (df["validation_status"].isin(["llm_validated", "disputed"]) & (df["human_truth"] == 1))
    metrics["llm_layer_precision"] = correct.sum() / total_reviews
    metrics["llm_layer_coverage"] = len(df[df["validation_status"].isin(["llm_validated", "disputed"])]) / total_reviews
    metrics['llm_layer_gain'] = llm_correct.sum() / total_reviews

    # LLM préférences quand ce n'est pas "agreed"
    mask = df["validation_status"] != "agreed"
    llm_pref = {}

    llm_pref["agree_kw_percentage"] = (
        ((df["kw_child"] == df["llm_child"]) & mask) &
        ((df["kw_pet"] == df["llm_pet"]) & mask) &
        ((df["kw_handicap"] == df["llm_handicap"]) & mask)
    ).sum() / mask.sum()
    
    llm_pref["agree_bert_percentage"] = (
        ((df["bert_child"] == df["llm_child"]) & mask) &
        ((df["bert_pet"] == df["llm_pet"]) & mask) &
        ((df["bert_handicap"] == df["llm_handicap"]) & mask)
    ).sum() / mask.sum()
    
    llm_pref["disagree_both_percentage"] = (
        mask.sum() - llm_pref["agree_kw_percentage"] * mask.sum() - llm_pref["agree_bert_percentage"] * mask.sum()
    ) / mask.sum()

    metrics["llm_preferences_counts"] = llm_pref

    # -------------------------
    # 4️⃣ Gains pipeline
    # -------------------------
    metrics["pipeline_added_value"] = (bert_correct_kw_incorrect.sum() + llm_correct.sum()) / total_reviews
    metrics["final_pipeline_precision"] = (df["human_truth"] == 1).sum() / total_reviews

    return metrics


def display_pipeline_metrics(metrics: dict):
    print("=== PIPELINE METRICS ===\n")
    print(f"Keywords layer: Layer Precision = {metrics['keywords_precision']:.3%}, Coverage = {metrics['keywords_coverage']:.3%}")
    print(f"BERT layer: Layer Precision = {metrics['bert_layer_precision']:.3%}, Coverage = {metrics['bert_layer_coverage']:.3%}, Layer Gain = {metrics['bert_layer_gain']:.3%}")
    print(f"LLM layer: Layer Precision = {metrics['llm_layer_precision']:.3%}, Coverage = {metrics['llm_layer_coverage']:.3%}, Layer Gain = {metrics['llm_layer_gain']:.3%}")

    print("\nLLM preferences when BERT and Keywords disagree:")
    for k, v in metrics['llm_preferences_counts'].items():
        print(f"  - {k}: {v:.3%}" if isinstance(v, float) else f"  - {k}: {v}")
    
    print(f"\nFinal pipeline: Overall Precision = {metrics['final_pipeline_precision']:.3%}")
    print(f"Final Gain of the pipeline: Gain = {metrics['pipeline_added_value']:.3%}")

In [94]:
display_pipeline_metrics(compute_pipeline_metrics(final_dataset))

=== PIPELINE METRICS ===

Keywords layer: Layer Precision = 63.219%, Coverage = 100.000%
BERT layer: Layer Precision = 51.404%, Coverage = 100.000%, Layer Gain = 0.000%
LLM layer: Layer Precision = 63.904%, Coverage = 23.904%, Layer Gain = 12.500%

LLM preferences when BERT and Keywords disagree:
  - agree_kw_percentage: 90.544%
  - agree_bert_percentage: 6.734%
  - disagree_both_percentage: 2.722%

Final pipeline: Overall Precision = 63.904%
Final Gain of the pipeline: Gain = 12.500%


## Evaluation test

In [7]:
import pandas as pd
from pathlib import Path

# -------------------------
# Paths et catégories
# -------------------------
FINAL_PATH = Path("../../data/processed/final")
VALIDATED_PATH = Path("../../data/processed/data_validated")
CATEGORIES = ["child", "pet", "handicap"]

# -------------------------
# 1️⃣ Charger les labels humains
# -------------------------
def load_human_labels():
    rows = []
    for cat in CATEGORIES:
        cat_dir = FINAL_PATH / cat
        for file in cat_dir.glob("*.csv"):
            df = pd.read_csv(file)

            if "review" not in df.columns:
                print(f"Attention, fichier sans review : {file.name}")

            # human_truth = 1 si "good" dans le nom du fichier
            truth = 1 if "good" in file.stem else 0

            # Colonnes human_truth par catégorie
            for c in CATEGORIES:
                df[f"human_truth_{c}"] = truth if c == cat else 0

            df["category"] = cat

            cols_to_keep = ["id", "review", "category"] + [f"human_truth_{c}" for c in CATEGORIES]
            rows.append(df[cols_to_keep])

    human_df = pd.concat(rows, ignore_index=True)

    # Fusionner les lignes par id pour avoir une seule ligne par review
    human_df = human_df.groupby("id").agg(
        review=("review", "first"),
        human_truth_child=("human_truth_child", "max"),
        human_truth_pet=("human_truth_pet", "max"),
        human_truth_handicap=("human_truth_handicap", "max")
    ).reset_index()

    return human_df

# -------------------------
# 2️⃣ Charger les données validées
# -------------------------
def load_validated_data():
    dfs = []
    for cat in CATEGORIES:
        for file in (VALIDATED_PATH / cat).glob("*.csv"):
            df = pd.read_csv(file)
            df["category"] = cat
            dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

# -------------------------
# 3️⃣ Charger les données
# -------------------------
human_df = load_human_labels()
validated_df = load_validated_data()

# -------------------------
# 4️⃣ Merge avec validated_df
# -------------------------
merged = human_df.merge(validated_df, on="id", how="left")  # merge sur id seulement
# Prendre la colonne review de human_df
if "review_x" in merged.columns:
    merged["review"] = merged["review_x"]
    if "review_y" in merged.columns:
        merged = merged.drop(columns=["review_y"])
    merged = merged.drop(columns=["review_x"])
else:
    raise ValueError("Aucune colonne 'review_x' trouvée dans le merge")

# Nettoyage du texte
merged["review"] = merged["review"].str.replace("\n", " ", regex=False)
merged["review"] = merged["review"].str.replace("\r", " ", regex=False)


# -------------------------
# 5️⃣ Préparer dataset final
# -------------------------
final_cols = [
    "id", "review",
    "human_truth_child", "human_truth_pet", "human_truth_handicap",
    "validation_status",
    "llm_child", "llm_pet", "llm_handicap",
    "bert_child", "bert_pet", "bert_handicap",
    "kw_child", "kw_pet", "kw_handicap"
]

final_dataset = merged[final_cols]

print(f"NB REVIEWS : {len(final_dataset)}")

# -------------------------
# 6️⃣ Fonctions métriques TP/FP/FN/TN
# -------------------------
def compute_confusion(df, pred_cols, truth_col):
    y_true = df[truth_col].astype(int)
    y_pred = df[pred_cols].astype(int).any(axis=1).astype(int)
    
    TP = ((y_pred == 1) & (y_true == 1)).sum()
    FP = ((y_pred == 1) & (y_true == 0)).sum()
    FN = ((y_pred == 0) & (y_true == 1)).sum()
    TN = ((y_pred == 0) & (y_true == 0)).sum()
    total = len(df)
    
    precision = TP / (TP + FP) if TP + FP > 0 else 0
    recall = TP / (TP + FN) if TP + FN > 0 else 0
    accuracy = (TP + TN) / total
    f1 = (2 * precision * recall / (precision + recall)) if precision + recall > 0 else 0
    
    return {
        "TP": TP, "FP": FP, "FN": FN, "TN": TN,
        "accuracy": accuracy, "precision": precision,
        "recall": recall, "f1": f1,
        "total": total
    }

def compute_pipeline_metrics(df):
    results = {}
    for cat in CATEGORIES:
        results[cat] = {}
        for layer in ["kw", "bert", "llm"]:
            pred_cols = [f"{layer}_{cat}"]
            truth_col = f"human_truth_{cat}"
            results[cat][layer] = compute_confusion(df, pred_cols, truth_col)
    return results

# -------------------------
# 7️⃣ Calculer et afficher métriques
# -------------------------
metrics = compute_pipeline_metrics(final_dataset)

for cat in CATEGORIES:
    print(f"\n=== Metrics pour catégorie : {cat} ===")
    for layer in ["kw", "bert", "llm"]:
        m = metrics[cat][layer]
        print(f"{layer.upper()}: TP={m['TP']} FP={m['FP']} FN={m['FN']} TN={m['TN']} | "
              f"Acc={m['accuracy']:.3%} Prec={m['precision']:.3%} Recall={m['recall']:.3%} F1={m['f1']:.3%}")


NB REVIEWS : 4217

=== Metrics pour catégorie : child ===
KW: TP=1261 FP=1234 FN=119 TN=1603 | Acc=67.916% Prec=50.541% Recall=91.377% F1=65.084%
BERT: TP=1033 FP=744 FN=347 TN=2093 | Acc=74.129% Prec=58.132% Recall=74.855% F1=65.442%
LLM: TP=1251 FP=1167 FN=129 TN=1670 | Acc=69.267% Prec=51.737% Recall=90.652% F1=65.877%

=== Metrics pour catégorie : pet ===
KW: TP=560 FP=458 FN=106 TN=3093 | Acc=86.626% Prec=55.010% Recall=84.084% F1=66.508%
BERT: TP=461 FP=353 FN=205 TN=3198 | Acc=86.768% Prec=56.634% Recall=69.219% F1=62.297%
LLM: TP=560 FP=432 FN=106 TN=3119 | Acc=87.242% Prec=56.452% Recall=84.084% F1=67.551%

=== Metrics pour catégorie : handicap ===
KW: TP=742 FP=554 FN=192 TN=2729 | Acc=82.310% Prec=57.253% Recall=79.443% F1=66.547%
BERT: TP=543 FP=385 FN=391 TN=2898 | Acc=81.598% Prec=58.513% Recall=58.137% F1=58.324%
LLM: TP=745 FP=550 FN=189 TN=2733 | Acc=82.476% Prec=57.529% Recall=79.764% F1=66.846%


In [8]:
def compute_pipeline_metrics_v2(df: pd.DataFrame):
    """
    Calcul des métriques du pipeline pour chaque layer : keywords, BERT, LLM
    en prenant en compte human_truth_* par catégorie.
    """
    metrics = {}
    total_reviews = len(df)

    # -------------------------
    # 1️⃣ Layer Keywords
    # -------------------------
    kw_correct = (
        ((df["kw_child"] == df["llm_child"]) & (df["human_truth_child"] == 1)) &
        ((df["kw_pet"] == df["llm_pet"]) & (df["human_truth_pet"] == 1)) &
        ((df["kw_handicap"] == df["llm_handicap"]) & (df["human_truth_handicap"] == 1))
    )
    metrics["keywords_precision"] = kw_correct.sum() / total_reviews
    metrics["keywords_coverage"] = total_reviews / total_reviews  # toutes les reviews passent par ce layer
    metrics["keywords_gain"] = kw_correct.sum() / total_reviews  # pseudo gain = nombre correct / total

    # -------------------------
    # 2️⃣ Layer BERT
    # -------------------------
    bert_correct = (
        ((df["bert_child"] == df["llm_child"]) & (df["human_truth_child"] == 1)) &
        ((df["bert_pet"] == df["llm_pet"]) & (df["human_truth_pet"] == 1)) &
        ((df["bert_handicap"] == df["llm_handicap"]) & (df["human_truth_handicap"] == 1))
    )

    bert_correct_kw_incorrect = (
        ((df["kw_child"] != df["llm_child"]) & (df["bert_child"] == df["llm_child"]) & (df["human_truth_child"] == 1)) &
        ((df["kw_pet"] != df["llm_pet"]) & (df["bert_pet"] == df["llm_pet"]) & (df["human_truth_pet"] == 1)) &
        ((df["kw_handicap"] != df["llm_handicap"]) & (df["bert_handicap"] == df["llm_handicap"]) & (df["human_truth_handicap"] == 1))
    )

    metrics["bert_precision"] = bert_correct.sum() / total_reviews
    metrics["bert_coverage"] = total_reviews / total_reviews
    metrics["bert_gain"] = bert_correct_kw_incorrect.sum() / total_reviews

    # -------------------------
    # 3️⃣ Layer LLM
    # -------------------------
    llm_mask = df["validation_status"].isin(["llm_validated", "disputed"])
    llm_correct = (
        ((df["human_truth_child"] == 1) & llm_mask) &
        ((df["human_truth_pet"] == 1) & llm_mask) &
        ((df["human_truth_handicap"] == 1) & llm_mask)
    )

    metrics["llm_precision"] = llm_correct.sum() / total_reviews
    metrics["llm_coverage"] = llm_mask.sum() / total_reviews
    metrics["llm_gain"] = llm_correct.sum() / total_reviews

    # -------------------------
    # 4️⃣ Gains pipeline global
    # -------------------------
    metrics["pipeline_added_value"] = (bert_correct_kw_incorrect.sum() + llm_correct.sum()) / total_reviews
    metrics["final_precision"] = (
        ((df["human_truth_child"] == 1) | (df["human_truth_pet"] == 1) | (df["human_truth_handicap"] == 1)).sum()
        / total_reviews
    )

    return metrics


def display_pipeline_metrics_v2(metrics: dict):
    print("=== PIPELINE METRICS V2 ===\n")
    print(f"Keywords layer: Precision = {metrics['keywords_precision']:.3%}, Coverage = {metrics['keywords_coverage']:.3%}, Gain = {metrics['keywords_gain']:.3%}")
    print(f"BERT layer: Precision = {metrics['bert_precision']:.3%}, Coverage = {metrics['bert_coverage']:.3%}, Gain = {metrics['bert_gain']:.3%}")
    print(f"LLM layer: Precision = {metrics['llm_precision']:.3%}, Coverage = {metrics['llm_coverage']:.3%}, Gain = {metrics['llm_gain']:.3%}")
    print(f"\nPipeline added value (gain total) = {metrics['pipeline_added_value']:.3%}")
    print(f"Final pipeline pseudo-precision = {metrics['final_precision']:.3%}")


In [10]:
metrics = compute_pipeline_metrics_v2(final_dataset)
display_pipeline_metrics_v2(metrics)


=== PIPELINE METRICS V2 ===

Keywords layer: Precision = 0.166%, Coverage = 100.000%, Gain = 0.166%
BERT layer: Precision = 0.119%, Coverage = 100.000%, Gain = 0.000%
LLM layer: Precision = 0.047%, Coverage = 28.338%, Gain = 0.047%

Pipeline added value (gain total) = 0.047%
Final pipeline pseudo-precision = 65.900%


In [14]:
import pandas as pd
from pathlib import Path

# -------------------------
# Paths et catégories
# -------------------------
FINAL_PATH = Path("../../data/processed/final")
VALIDATED_PATH = Path("../../data/processed/data_validated")
CATEGORIES = ["child", "pet", "handicap"]

# -------------------------
# 1️⃣ Charger les labels humains
# -------------------------
def load_human_labels():
    rows = []
    for cat in CATEGORIES:
        cat_dir = FINAL_PATH / cat
        for file in cat_dir.glob("*.csv"):
            df = pd.read_csv(file)

            if "review" not in df.columns:
                print(f"Attention, fichier sans review : {file.name}")
                continue

            truth = 1 if "good" in file.stem else 0

            # human_truth par catégorie
            for c in CATEGORIES:
                df[f"human_truth_{c}"] = truth if c == cat else 0

            df["category"] = cat
            cols_to_keep = ["id", "review", "category"] + [f"human_truth_{c}" for c in CATEGORIES]
            rows.append(df[cols_to_keep])

    human_df = pd.concat(rows, ignore_index=True)

    # Fusion par id pour avoir une seule ligne par review
    human_df = human_df.groupby("id").agg(
        review=("review", "first"),
        human_truth_child=("human_truth_child", "max"),
        human_truth_pet=("human_truth_pet", "max"),
        human_truth_handicap=("human_truth_handicap", "max")
    ).reset_index()

    return human_df

# -------------------------
# 2️⃣ Charger les données validées
# -------------------------
def load_validated_data():
    dfs = []
    for cat in CATEGORIES:
        for file in (VALIDATED_PATH / cat).glob("*.csv"):
            df = pd.read_csv(file)
            df["category"] = cat
            dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

# -------------------------
# 3️⃣ Charger les datasets
# -------------------------
human_df = load_human_labels()
validated_df = load_validated_data()

# Merge sur id
merged = human_df.merge(validated_df, on="id", how="left")

# Gestion des colonnes review
if "review_x" in merged.columns:
    merged["review"] = merged["review_x"]
    merged = merged.drop(columns=["review_x"], errors="ignore")
    if "review_y" in merged.columns:
        merged = merged.drop(columns=["review_y"], errors="ignore")
elif "review" not in merged.columns:
    raise ValueError("La colonne 'review' est absente après merge")

# Nettoyage des retours à la ligne
merged["review"] = merged["review"].str.replace("\n", " ", regex=False)
merged["review"] = merged["review"].str.replace("\r", " ", regex=False)

# Colonnes finales
final_cols = [
    "id", "review",
    "human_truth_child", "human_truth_pet", "human_truth_handicap",
    "validation_status",
    "llm_child", "llm_pet", "llm_handicap",
    "bert_child", "bert_pet", "bert_handicap",
    "kw_child", "kw_pet", "kw_handicap"
]

final_dataset = merged[final_cols]

print(f"NB REVIEWS : {len(final_dataset)}")

# -------------------------
# 4️⃣ Fonctions métriques par catégorie
# -------------------------
def compute_layer_metrics(df, layer, previous_layer=None):
    """
    Calcul Precision, Coverage, Gain par catégorie pour un layer.
    previous_layer: dict avec booléens TP/FP du layer précédent pour calcul du gain
    """
    results = {}
    total_reviews = len(df)

    for cat in CATEGORIES:
        pred = df[f"{layer}_{cat}"]
        truth = df[f"human_truth_{cat}"]

        TP = ((pred == 1) & (truth == 1)).sum()
        FP = ((pred == 1) & (truth == 0)).sum()
        FN = ((pred == 0) & (truth == 1)).sum()
        TN = ((pred == 0) & (truth == 0)).sum()

        precision = TP / (TP + FP) if TP + FP > 0 else 0
        coverage = total_reviews / total_reviews  # toutes les reviews observées
        if previous_layer is None:
            gain = 0  # pas de gain pour le premier layer
        else:
            # gain = corrections apportées par ce layer par rapport au précédent
            prev_pred = previous_layer[cat]["pred"]
            gain = ((prev_pred == 0) & (pred == 1) & (truth == 1)).sum() / total_reviews

        results[cat] = {
            "TP": TP, "FP": FP, "FN": FN, "TN": TN,
            "precision": precision,
            "coverage": coverage,
            "gain": gain,
            "pred": pred  # pour calcul du gain au layer suivant
        }
    return results

# -------------------------
# 5️⃣ Calcul pipeline metrics
# -------------------------
def compute_pipeline_metrics(df):
    metrics = {}

    # Keywords
    metrics["keywords"] = compute_layer_metrics(df, "kw")
    # BERT
    metrics["bert"] = compute_layer_metrics(df, "bert", previous_layer=metrics["keywords"])
    # LLM (on ne prend que les reviews validées ou disputed)
    df_llm = df[df["validation_status"].isin(["llm_validated", "disputed"])]
    metrics["llm"] = compute_layer_metrics(df_llm, "llm", previous_layer=metrics["bert"])

    return metrics

# -------------------------
# 6️⃣ Affichage
# -------------------------
def display_pipeline_metrics(metrics):
    print("=== PIPELINE METRICS ===")
    for layer in ["keywords", "bert", "llm"]:
        print(f"\nLayer: {layer.upper()}")
        for cat in CATEGORIES:
            m = metrics[layer][cat]
            print(f"  {cat}: Precision={m['precision']:.2%}, Coverage={m['coverage']:.2%}, Gain={m['gain']:.2%} | TP={m['TP']} FP={m['FP']} FN={m['FN']} TN={m['TN']}")

# -------------------------
# 7️⃣ Exécution
# -------------------------
metrics = compute_pipeline_metrics(final_dataset)
display_pipeline_metrics(metrics)


NB REVIEWS : 4217
=== PIPELINE METRICS ===

Layer: KEYWORDS
  child: Precision=50.54%, Coverage=100.00%, Gain=0.00% | TP=1261 FP=1234 FN=119 TN=1603
  pet: Precision=55.01%, Coverage=100.00%, Gain=0.00% | TP=560 FP=458 FN=106 TN=3093
  handicap: Precision=57.25%, Coverage=100.00%, Gain=0.00% | TP=742 FP=554 FN=192 TN=2729

Layer: BERT
  child: Precision=58.13%, Coverage=100.00%, Gain=0.02% | TP=1033 FP=744 FN=347 TN=2093
  pet: Precision=56.63%, Coverage=100.00%, Gain=0.00% | TP=461 FP=353 FN=205 TN=3198
  handicap: Precision=58.51%, Coverage=100.00%, Gain=0.14% | TP=543 FP=385 FN=391 TN=2898

Layer: LLM
  child: Precision=33.03%, Coverage=100.00%, Gain=18.33% | TP=257 FP=521 FN=33 TN=384
  pet: Precision=53.79%, Coverage=100.00%, Gain=8.28% | TP=149 FP=128 FN=38 TN=880
  handicap: Precision=53.60%, Coverage=100.00%, Gain=16.90% | TP=298 FP=258 FN=44 TN=595


In [18]:
import pandas as pd
from pathlib import Path

# -------------------------
# Paths et catégories
# -------------------------
FINAL_PATH = Path("../../data/processed/final")
VALIDATED_PATH = Path("../../data/processed/data_validated")
CATEGORIES = ["child", "pet", "handicap"]

# -------------------------
# 1️⃣ Charger les labels humains
# -------------------------
def load_human_labels():
    rows = []
    for cat in CATEGORIES:
        cat_dir = FINAL_PATH / cat
        for file in cat_dir.glob("*.csv"):
            df = pd.read_csv(file)

            if "review" not in df.columns:
                print(f"Attention, fichier sans review : {file.name}")
                continue

            truth = 1 if "good" in file.stem else 0

            # human_truth par catégorie
            for c in CATEGORIES:
                df[f"human_truth_{c}"] = truth if c == cat else 0

            df["category"] = cat
            cols_to_keep = ["id", "review", "category"] + [f"human_truth_{c}" for c in CATEGORIES]
            rows.append(df[cols_to_keep])

    human_df = pd.concat(rows, ignore_index=True)

    # Fusion par id pour avoir une seule ligne par review
    human_df = human_df.groupby("id").agg(
        review=("review", "first"),
        human_truth_child=("human_truth_child", "max"),
        human_truth_pet=("human_truth_pet", "max"),
        human_truth_handicap=("human_truth_handicap", "max")
    ).reset_index()

    return human_df

# -------------------------
# 2️⃣ Charger les données validées
# -------------------------
def load_validated_data():
    dfs = []
    for cat in CATEGORIES:
        for file in (VALIDATED_PATH / cat).glob("*.csv"):
            df = pd.read_csv(file)
            df["category"] = cat
            dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

# -------------------------
# 3️⃣ Charger et fusionner les datasets
# -------------------------
human_df = load_human_labels()
validated_df = load_validated_data()

# Merge sur id
merged = human_df.merge(validated_df, on="id", how="left")

# Gestion des colonnes review
if "review_x" in merged.columns:
    merged["review"] = merged["review_x"]
    merged = merged.drop(columns=["review_x"], errors="ignore")
    if "review_y" in merged.columns:
        merged = merged.drop(columns=["review_y"], errors="ignore")
elif "review" not in merged.columns:
    raise ValueError("La colonne 'review' est absente après merge")

# Nettoyage des retours à la ligne
merged["review"] = merged["review"].str.replace("\n", " ", regex=False)
merged["review"] = merged["review"].str.replace("\r", " ", regex=False)

# Colonnes finales
final_cols = [
    "id", "review",
    "human_truth_child", "human_truth_pet", "human_truth_handicap",
    "validation_status",
    "llm_child", "llm_pet", "llm_handicap",
    "bert_child", "bert_pet", "bert_handicap",
    "kw_child", "kw_pet", "kw_handicap"
]

final_dataset = merged[final_cols]

print(f"NB REVIEWS : {len(final_dataset)}")

# -------------------------
# 4️⃣ Calcul des métriques par layer
# -------------------------
def compute_pipeline_metrics(df: pd.DataFrame):
    metrics = {}
    total_reviews = len(df)

    # Keywords layer
    kw_correct = (
        ((df["kw_child"] == df["llm_child"]) & (df["human_truth_child"] == 1)) &
        ((df["kw_pet"] == df["llm_pet"]) & (df["human_truth_pet"] == 1)) &
        ((df["kw_handicap"] == df["llm_handicap"]) & (df["human_truth_handicap"] == 1))
    )
    metrics["keywords_precision"] = kw_correct.sum() / total_reviews
    metrics["keywords_coverage"] = total_reviews / total_reviews
    metrics["keywords_gain"] = 0  # pas de gain pour le premier layer

    # BERT layer
    bert_correct = (
        ((df["bert_child"] == df["llm_child"]) & (df["human_truth_child"] == 1)) &
        ((df["bert_pet"] == df["llm_pet"]) & (df["human_truth_pet"] == 1)) &
        ((df["bert_handicap"] == df["llm_handicap"]) & (df["human_truth_handicap"] == 1))
    )
    bert_gain = (
        ((df["kw_child"] != df["llm_child"]) & (df["bert_child"] == df["llm_child"]) & (df["human_truth_child"] == 1)) &
        ((df["kw_pet"] != df["llm_pet"]) & (df["bert_pet"] == df["llm_pet"]) & (df["human_truth_pet"] == 1)) &
        ((df["kw_handicap"] != df["llm_handicap"]) & (df["bert_handicap"] == df["llm_handicap"]) & (df["human_truth_handicap"] == 1))
    )
    metrics["bert_layer_precision"] = bert_correct.sum() / total_reviews
    metrics["bert_layer_coverage"] = total_reviews / total_reviews
    metrics["bert_layer_gain"] = bert_gain.sum() / total_reviews

    # LLM layer
    mask_llm = df["validation_status"].isin(["llm_validated", "disputed"])
    llm_correct = mask_llm & (
        ((df["llm_child"] == 1) & (df["human_truth_child"] == 1)) |
        ((df["llm_pet"] == 1) & (df["human_truth_pet"] == 1)) |
        ((df["llm_handicap"] == 1) & (df["human_truth_handicap"] == 1))
    )
    metrics["llm_layer_precision"] = llm_correct.sum() / mask_llm.sum() if mask_llm.sum() > 0 else 0
    metrics["llm_layer_coverage"] = mask_llm.sum() / total_reviews
    metrics['llm_layer_gain'] = llm_correct.sum() / total_reviews

    # LLM préférences quand ce n'est pas "agreed"
    mask_disagree = df["validation_status"] != "agreed"
    llm_pref = {}

    llm_pref["agree_kw_percentage"] = (
        ((df["kw_child"] == df["llm_child"]) & mask_disagree) &
        ((df["kw_pet"] == df["llm_pet"]) & mask_disagree) &
        ((df["kw_handicap"] == df["llm_handicap"]) & mask_disagree)
    ).sum() / mask_disagree.sum() if mask_disagree.sum() > 0 else 0

    llm_pref["agree_bert_percentage"] = (
        ((df["bert_child"] == df["llm_child"]) & mask_disagree) &
        ((df["bert_pet"] == df["llm_pet"]) & mask_disagree) &
        ((df["bert_handicap"] == df["llm_handicap"]) & mask_disagree)
    ).sum() / mask_disagree.sum() if mask_disagree.sum() > 0 else 0

    llm_pref["disagree_both_percentage"] = (
        1 - llm_pref["agree_kw_percentage"] - llm_pref["agree_bert_percentage"]
    )

    metrics["llm_preferences_counts"] = llm_pref

    # Gains pipeline
    metrics["pipeline_added_value"] = metrics["bert_layer_gain"] + metrics['llm_layer_gain']
    metrics["final_pipeline_precision"] = (
        ((df["human_truth_child"] == 1) | (df["human_truth_pet"] == 1) | (df["human_truth_handicap"] == 1)).sum()
        / total_reviews
    )

    return metrics

# -------------------------
# 5️⃣ Affichage des métriques
# -------------------------
def display_pipeline_metrics(metrics: dict):
    print("=== PIPELINE METRICS ===\n")
    print(f"Keywords layer: Precision = {metrics['keywords_precision']:.3%}, "
          f"Coverage = {metrics['keywords_coverage']:.3%}, Gain = {metrics['keywords_gain']:.3%}")
    print(f"BERT layer: Precision = {metrics['bert_layer_precision']:.3%}, "
          f"Coverage = {metrics['bert_layer_coverage']:.3%}, Gain = {metrics['bert_layer_gain']:.3%}")
    print(f"LLM layer: Precision = {metrics['llm_layer_precision']:.3%}, "
          f"Coverage = {metrics['llm_layer_coverage']:.3%}, Gain = {metrics['llm_layer_gain']:.3%}")

    print("\nLLM preferences when BERT and Keywords disagree:")
    for k, v in metrics['llm_preferences_counts'].items():
        print(f"  - {k}: {v:.3%}" if isinstance(v, float) else f"  - {k}: {v}")

    print(f"\nFinal pipeline pseudo-precision = {metrics['final_pipeline_precision']:.3%}")
    print(f"Total pipeline added value (gain) = {metrics['pipeline_added_value']:.3%}")

# -------------------------
# 6️⃣ Exécution
# -------------------------
metrics = compute_pipeline_metrics(final_dataset)
display_pipeline_metrics(metrics)


NB REVIEWS : 4217
=== PIPELINE METRICS ===

Keywords layer: Precision = 0.166%, Coverage = 100.000%, Gain = 0.000%
BERT layer: Precision = 0.119%, Coverage = 100.000%, Gain = 0.000%
LLM layer: Precision = 54.393%, Coverage = 28.338%, Gain = 15.414%

LLM preferences when BERT and Keywords disagree:
  - agree_kw_percentage: 89.456%
  - agree_bert_percentage: 7.029%
  - disagree_both_percentage: 3.515%

Final pipeline pseudo-precision = 65.900%
Total pipeline added value (gain) = 15.414%
