# Evaluation of classification methods

This notebook evaluates, on the same 200 labeled reviews, the three following classification methods used within our pipelines:

- Keyword extraction
- Fine-tuned BERT model
- LLM Mistral Small 7B

In [29]:
import logging
import os
import re
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import ollama
import pandas as pd
import torch
import torch.nn as nn
from dotenv import load_dotenv
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score,accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel

In [30]:
# Simple logger for pipeline execution: timestamp, level and message at INFO and above
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# Filtering HTTP logging
class HttpStatusFilter(logging.Filter):
    """Drop log records mentioning an HTTP 200 response and escalate the others.

    Records whose formatted message contains 'HTTP/1.1 200' are discarded.
    Every other record is kept and relabelled as a WARNING.
    """

    def filter(self, record):
        """Return False for 'HTTP/1.1 200' records, True (after relabelling) otherwise."""
        is_success = 'HTTP/1.1 200' in record.getMessage()
        if is_success:
            return False
        # Anything that is not a plain 200 response is surfaced as a warning
        record.levelname = "WARNING"
        record.levelno = logging.WARNING
        return True


# Attach the filter to the "httpx" logger
logging.getLogger("httpx").addFilter(HttpStatusFilter())

In [31]:
# Global variables
load_dotenv(dotenv_path="../../.env")
# NUM_THREADS is expected from the environment (or the .env file loaded above).
# Fall back to a single worker when it is missing or empty, instead of failing
# on int(None) / int("") with an unhelpful error.
NUM_THREAD = int(os.environ.get("NUM_THREADS") or "1")
logger.info(f"NUM_THREAD fixed to {NUM_THREAD}")

2025-11-24 20:30:10,452 - INFO - NUM_THREAD fixed to 8


In [20]:
# Label columns scored by every evaluation in this notebook
classes = ["handicap", "pet", "child"]

# Read the test set and build the ground-truth matrix (one column per entry of `classes`)
logger.info("Loading test data...")
df = pd.read_csv("../../data/original/fine_tunning/data_test.csv")
y_true = df.loc[:, classes].to_numpy().astype(int)

2025-11-24 18:43:43,574 - INFO - Loading test data...


### Evaluation of Keyword extraction

In [33]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# --- Load datasets ---
df_truth = df
df_kw = pd.read_csv("../../data/processed/data_categorized/key_words_data_test.csv")

classes = ["handicap", "pet", "child"]

# Initialize predictions at 0: same index as the ground truth, one column per class
df_pred = pd.DataFrame(0, index=df_truth.index, columns=classes)
df_pred["id"] = df_truth["id"]

# Prediction
# Each row in df_kw corresponds to one detected keyword for a review, so the same
# review can appear on several rows (multiple keywords/categories).
# The category is normalised once for the whole column, then each class is flagged
# with a single membership test instead of rescanning df_pred for every keyword row.
kw_category = df_kw["category"].astype(str).str.strip().str.lower()
for label in classes:
    # ids of the reviews with at least one detected keyword of this class
    flagged_ids = df_kw.loc[kw_category == label, "id"]
    # Every row carrying a flagged id is marked (including repeated ids, if any)
    df_pred.loc[df_pred["id"].isin(flagged_ids), label] = 1

# Extract numpy arrays
# Same integer cast as in the data loading cell, since this overwrites the global y_true
y_true = df_truth[classes].values.astype(int)
y_pred = df_pred[classes].values

# Per-label metrics: each class column is scored as an independent binary problem
for col, label in enumerate(classes):
    truth_col = y_true[:, col]
    pred_col = y_pred[:, col]

    acc = accuracy_score(truth_col, pred_col)
    prec = precision_score(truth_col, pred_col, zero_division=0)
    rec = recall_score(truth_col, pred_col, zero_division=0)
    f1 = f1_score(truth_col, pred_col, zero_division=0)

    print(f"Label: {label}")
    print(f" Accuracy : {acc:.4f}")
    print(f" Precision: {prec:.4f}")
    print(f" Recall   : {rec:.4f}")
    print(f" F1-score : {f1:.4f}\n")

# Global metrics: precision, recall and F1 over all labels, micro- then macro-averaged
prec_micro, rec_micro, f1_micro = (
    score_fn(y_true, y_pred, average="micro", zero_division=0)
    for score_fn in (precision_score, recall_score, f1_score)
)
prec_macro, rec_macro, f1_macro = (
    score_fn(y_true, y_pred, average="macro", zero_division=0)
    for score_fn in (precision_score, recall_score, f1_score)
)

print("Global metrics:")
print(f" Micro Precision: {prec_micro:.4f}, Recall: {rec_micro:.4f}, F1: {f1_micro:.4f}")
print(f" Macro Precision: {prec_macro:.4f}, Recall: {rec_macro:.4f}, F1: {f1_macro:.4f}")


Label: handicap
 Accuracy : 0.9600
 Precision: 1.0000
 Recall   : 0.8689
 F1-score : 0.9298

Label: pet
 Accuracy : 0.9500
 Precision: 0.8732
 Recall   : 0.9841
 F1-score : 0.9254

Label: child
 Accuracy : 0.9550
 Precision: 0.8615
 Recall   : 1.0000
 F1-score : 0.9256

Global metrics:
 Micro Precision: 0.9048, Recall: 0.9500, F1: 0.9268
 Macro Precision: 0.9116, Recall: 0.9510, F1: 0.9269


### Evaluation of BERT model

In [13]:
# Parameters
BERT_PATH = "../../models/bert-base-uncased"   # checkpoint passed to BertModel.from_pretrained
TOKENIZER_PATH = "../bert/bert_tokenizer_pt"   # tokenizer passed to BertTokenizer.from_pretrained
MODEL_WEIGHTS = "../bert/best_weights.pth"     # state_dict loaded into the classifier
MAX_SEQ_LEN = 128                              # every review is padded/truncated to this length
threshold = 0.95                               # a score strictly above this counts as a positive
# Use the GPU when torch can see one, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Model definition
class BertMultiLabelClassifier(nn.Module):
    """BERT encoder followed by a linear head that outputs one sigmoid score per class.

    Args:
        n_classes: number of scores produced by the linear head.
        dropout: dropout probability applied to the pooled representation.
    """

    def __init__(self, n_classes, dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(BERT_PATH)
        hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_size, n_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return the per-class sigmoid scores for a batch of tokenized inputs."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Plain average of the last hidden states along dim 1. The attention mask is
        # only given to the encoder; it does not weight this average.
        sentence_repr = encoder_out.last_hidden_state.mean(dim=1)
        return self.sigmoid(self.classifier(self.dropout(sentence_repr)))


# Load model and tokenizer
logger.info("Loading tokenizer...")
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_PATH)

logger.info("Loading model...")
model = BertMultiLabelClassifier(n_classes=len(classes))
# weights_only=True restricts unpickling to tensors and plain containers, which is
# all a state_dict needs, so the checkpoint cannot execute arbitrary pickled code.
model.load_state_dict(torch.load(MODEL_WEIGHTS, map_location=device, weights_only=True))
model.to(device)
# Switch to evaluation mode before running inference
model.eval()

# Encoding function
def encode_batch(sentences):
    """Tokenize an iterable of sentences with the notebook-level tokenizer.

    Every sentence is padded to, and truncated at, MAX_SEQ_LEN tokens, and the
    tokenizer is asked for PyTorch tensors.

    Returns:
        The (input_ids, attention_mask) pair taken from the tokenizer output.
    """
    texts = list(sentences)
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_SEQ_LEN,
        "return_tensors": "pt",
    }
    batch = tokenizer(texts, **tokenizer_options)
    ids = batch["input_ids"]
    mask = batch["attention_mask"]
    return ids, mask


# Prediction
logger.info("Predicting...")
# Tokenize every review in one go and move both tensors to the inference device
input_ids, attention_mask = (tensor.to(device) for tensor in encode_batch(df["review"]))

# Single forward pass over the whole test set, without tracking gradients
with torch.no_grad():
    scores = model(input_ids=input_ids, attention_mask=attention_mask)
pred = scores.cpu().numpy()

# A label is predicted when its score is strictly above the threshold
y_pred_bin = (pred > threshold).astype(int)


# Metrics
logger.info("Metrics multilabel")

# Per-label metrics: each class column is scored as an independent binary problem
for col, label in enumerate(classes):
    truth_col = y_true[:, col]
    pred_col = y_pred_bin[:, col]

    acc = accuracy_score(truth_col, pred_col)
    prec = precision_score(truth_col, pred_col, zero_division=0)
    rec = recall_score(truth_col, pred_col, zero_division=0)
    f1 = f1_score(truth_col, pred_col, zero_division=0)

    print(f"Label: {label}")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1-score : {f1:.4f}\n")


# Global metrics: precision, recall and F1 over all labels, micro- then macro-averaged
prec_micro, rec_micro, f1_micro = (
    score_fn(y_true, y_pred_bin, average="micro", zero_division=0)
    for score_fn in (precision_score, recall_score, f1_score)
)
prec_macro, rec_macro, f1_macro = (
    score_fn(y_true, y_pred_bin, average="macro", zero_division=0)
    for score_fn in (precision_score, recall_score, f1_score)
)

logger.info("Global metrics")
print(f"Micro Precision: {prec_micro:.4f}, Recall: {rec_micro:.4f}, F1: {f1_micro:.4f}")
print(f"Macro Precision: {prec_macro:.4f}, Recall: {rec_macro:.4f}, F1: {f1_macro:.4f}")


2025-11-24 18:05:10,474 - INFO - Loading tokenizer...
2025-11-24 18:05:10,512 - INFO - Loading model...
2025-11-24 18:05:12,088 - INFO - Predicting...
2025-11-24 18:05:14,066 - INFO - Metrics multilabel
2025-11-24 18:05:14,095 - INFO - Global metrics


Label: handicap
Accuracy : 0.8600
Precision: 0.9714
Recall   : 0.5574
F1-score : 0.7083

Label: pet
Accuracy : 0.9300
Precision: 0.8551
Recall   : 0.9365
F1-score : 0.8939

Label: child
Accuracy : 0.9300
Precision: 0.8500
Recall   : 0.9107
F1-score : 0.8793

Micro Precision: 0.8780, Recall: 0.8000, F1: 0.8372
Macro Precision: 0.8922, Recall: 0.8015, F1: 0.8272


### Evaluation of LLM Mistral Small 7B

In [16]:
def classify_review_ollama(review_text, category, model="mistral"):
    """Ask an Ollama chat model whether a review matches one category.

    Args:
        review_text: text of the review, embedded in the user message.
        category: category name interpolated into the system prompt.
        model: name of the Ollama model to query.

    Returns:
        1 when the reply, lower-cased and stripped of every character outside
        a-z, is exactly "yes"; 0 for any other reply.
    """
    system_prompt = (
        "You are a strict classifier. Your task is to analyze a review and determine whether the "
        f"traveler(s) mentioned in the review have a very specific need in the category: '{category}'. "
        f"Respond strictly with 'yes' if the review indicates they travel with {category}, "
        "or 'no' if not. Your response must be ONE word only, without any explanation or extra text."
    )
    assistant_ack = "Understood. I will respond only with 'yes' or 'no', one word."
    user_prompt = f"Here is the review to analyze:\n\n\"{review_text}\""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "assistant", "content": assistant_ack},
        {"role": "user", "content": user_prompt},
    ]

    # Temperature 0 is requested for every call
    reply = ollama.chat(model=model, messages=messages, options={"temperature": 0})
    raw_answer = reply["message"]["content"].strip().lower()
    # Punctuation, spaces and digits are removed before the comparison, so "Yes." still counts
    letters_only = re.sub(r'[^a-z]', '', raw_answer)

    return int(letters_only == 'yes')

def classify_all_categories(review):
    """Return one 0/1 prediction per entry of `classes`, in that order, for a single review."""
    predictions = []
    for category in classes:
        predictions.append(classify_review_ollama(review, category))
    return predictions

# Prediction
# Reviews are classified concurrently. executor.map yields results in input order,
# so row i of y_pred corresponds to row i of df.
with ThreadPoolExecutor(max_workers=NUM_THREAD) as executor:
    y_pred = list(executor.map(classify_all_categories, df["review"]))

# Convert to a numpy matrix so the metrics can slice it by column
# (numpy is imported with the other dependencies in the notebook's import cell)
y_pred = np.array(y_pred, dtype=int)

# Metrics
# Per-label metrics: each class column is scored as an independent binary problem
for col, label in enumerate(classes):
    truth_col = y_true[:, col]
    pred_col = y_pred[:, col]

    acc = accuracy_score(truth_col, pred_col)
    prec = precision_score(truth_col, pred_col, zero_division=0)
    rec = recall_score(truth_col, pred_col, zero_division=0)
    f1 = f1_score(truth_col, pred_col, zero_division=0)

    print(f"Label: {label}")
    print(f" Accuracy : {acc:.4f}")
    print(f" Precision: {prec:.4f}")
    print(f" Recall   : {rec:.4f}")
    print(f" F1-score : {f1:.4f}\n")


# Global metrics: precision, recall and F1 over all labels, micro- then macro-averaged
prec_micro, rec_micro, f1_micro = (
    score_fn(y_true, y_pred, average="micro", zero_division=0)
    for score_fn in (precision_score, recall_score, f1_score)
)
prec_macro, rec_macro, f1_macro = (
    score_fn(y_true, y_pred, average="macro", zero_division=0)
    for score_fn in (precision_score, recall_score, f1_score)
)

print("Global metrics:")
print(f" Micro Precision: {prec_micro:.4f}, Recall: {rec_micro:.4f}, F1: {f1_micro:.4f}")
print(f" Macro Precision: {prec_macro:.4f}, Recall: {rec_macro:.4f}, F1: {f1_macro:.4f}")


Label: handicap
 Accuracy : 0.9550
 Precision: 0.9815
 Recall   : 0.8689
 F1-score : 0.9217

Label: pet
 Accuracy : 0.9350
 Precision: 0.8906
 Recall   : 0.9048
 F1-score : 0.8976

Label: child
 Accuracy : 0.9250
 Precision: 0.8596
 Recall   : 0.8750
 F1-score : 0.8673

Global metrics:
 Micro Precision: 0.9086, Recall: 0.8833, F1: 0.8958
 Macro Precision: 0.9106, Recall: 0.8829, F1: 0.8955
