# Pipeline Validation

We created a pipeline that combines the results of three methods to "cross-validate" our review classifications:

- the result of the keyword extraction, which assigns a review to the theme related to the matched keyword
- the result of a fine-tuned BERT model on the review, enabling better classification
- the result of a small LLM on the review enabling better context understanding

If a review gets classified the same way by each of these 3 methods it is considered valid. 
Either way, it has to be submitted to human validation

In [178]:
import polars as pl
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import ollama
from concurrent.futures import ThreadPoolExecutor, as_completed
import re
from pathlib import Path
from tqdm import tqdm
import os
import logging
from dotenv import load_dotenv
import torch.nn as nn
import numpy as np


In [None]:
# Simple logger for pipeline execution: INFO level, "timestamp - level - message" lines
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger used by the following cells
logger = logging.getLogger(__name__)

In [180]:
# Global variables
load_dotenv(dotenv_path="../.env")
# Thread count read from the environment (.env). Fall back to 4 when NUM_THREADS is
# unset or empty instead of crashing on int(None).
NUM_THREAD = int(os.environ.get("NUM_THREADS") or 4)
logger.info(f"NUM_THREAD fixed to {NUM_THREAD}")

2025-11-19 21:18:49,323 - INFO - NUM_THREAD fixed to 12


In [191]:
# Configuration
# Label order: index i of every prediction vector refers to CATEGORIES[i]
CATEGORIES = ['handicap', 'pet', 'child']
# Tokenizer max_length in predict_bert_chunks; also passed to split_review_chunks
MAX_LENGTH = 256
# Number of chunks sent through BERT per forward pass
BATCH_SIZE = 32
# max_workers of the ThreadPoolExecutor that issues the Ollama calls.
# NOTE(review): this is a separate variable from NUM_THREAD (singular) loaded from .env.
NUM_THREADS = 4
# Base BERT checkpoint loaded by BertMultiLabelClassifier
BERT_PATH = "../models/bert-base-uncased"
# Tokenizer directory and fine-tuned weights loaded in main()
TOKENIZER_PATH = "bert/bert_tokenizer_pt"
MODEL_WEIGHTS = "bert/best_weights.pth"
# A sigmoid score must be strictly greater than this to count as a positive label
THRESHOLD = 0.95


# Use the GPU when available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Utilisation du device: {device}")

# Create the output folders (no error if they already exist)
Path("../data/processed/data_validated/good").mkdir(parents=True, exist_ok=True)
Path("../data/processed/data_validated/rejected").mkdir(parents=True, exist_ok=True)

Utilisation du device: cuda


In [192]:
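# Still useful? TODO: ReviewDataset is not referenced by the pipeline cells visible in this notebook - confirm before removing.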

class ReviewDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        texts: indexable collection of reviews (each item is passed through ``str``).
        tokenizer: callable invoked as
            ``tokenizer(text, max_length=..., padding='max_length', truncation=True, return_tensors='pt')``
            whose result exposes ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: sequence length forwarded to the tokenizer.
    """

    def __init__(self, texts, tokenizer, max_length=256):
        self.texts, self.tokenizer, self.max_length = texts, tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized review at ``idx`` together with its index."""
        encoded = self.tokenizer(
            str(self.texts[idx]),
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # The tokenizer returns a leading batch axis of size 1; drop it per item.
        item = {key: encoded[key].squeeze(0) for key in ('input_ids', 'attention_mask')}
        item['idx'] = idx
        return item

In [193]:
class BertMultiLabelClassifier(nn.Module):
    """BERT encoder followed by dropout, a linear head and a sigmoid (multi-label).

    Args:
        n_classes: number of independent labels to score.
        dropout: dropout probability applied to the pooled representation.
    """

    def __init__(self, n_classes, dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(BERT_PATH)
        hidden_dim = self.bert.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_dim, n_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return one sigmoid score per label for each input sequence."""
        hidden_states = self.bert(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        # Plain mean over the sequence axis (dim=1); attention_mask is not used for pooling.
        pooled = hidden_states.mean(dim=1)
        return self.sigmoid(self.classifier(self.dropout(pooled)))

In [194]:
def split_review_chunks(review_text, max_length=128):
    """Split a review into chunks of whitespace-separated words.

    Chunks are measured in words, not tokenizer tokens: each chunk holds at most
    ``max_length - 20`` words (and never fewer than one word per chunk).
    NOTE(review): the 20-word margin presumably leaves headroom for special and
    sub-word tokens - confirm against the tokenizer actually used.

    Args:
        review_text: the review as a string.
        max_length: nominal length budget; the effective chunk size is ``max_length - 20``.

    Returns:
        A list of chunk strings with words re-joined by single spaces. If the review
        contains no words (empty or whitespace only), ``[review_text]`` is returned.
    """
    words = review_text.split()
    if not words:
        return [review_text]

    # Slice directly instead of re-joining and re-splitting the growing chunk per word.
    chunk_size = max(1, max_length - 20)
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]


def predict_bert_chunks(reviews, model, tokenizer, threshold=0.95, batch_size=32):
    """Score a list of texts with the classifier and return thresholded labels.

    No chunking happens here: callers pass already-split texts. The model is put
    in eval mode and run under ``torch.no_grad()``.

    Args:
        reviews: list of strings to classify.
        model: callable taking ``input_ids`` / ``attention_mask`` and returning scores.
        tokenizer: tokenizer called on each batch of strings.
        threshold: a score must be strictly greater than this to yield 1.
        batch_size: number of texts per forward pass.

    Returns:
        One list of 0/1 ints per input text, in input order.
    """
    model.eval()
    binary_rows = []

    batch_starts = range(0, len(reviews), batch_size)
    with torch.no_grad():
        for start in batch_starts:
            # Every text is padded/truncated to the module-level MAX_LENGTH.
            # NOTE(review): if the classifier pools over all positions, the padded length
            # can influence the scores - confirm before switching to dynamic padding.
            tokens = tokenizer(
                reviews[start:start + batch_size],
                padding="max_length",
                truncation=True,
                max_length=MAX_LENGTH,
                return_tensors="pt"
            )

            scores = model(
                input_ids=tokens["input_ids"].to(device),
                attention_mask=tokens["attention_mask"].to(device)
            )

            above_threshold = (scores.cpu().numpy() > threshold).astype(int)
            binary_rows += above_threshold.tolist()

    return binary_rows


def classify_review_ollama(review_text, category, model="mistral"):
    """Ask an Ollama chat model whether a review belongs to ``category``.

    The model is instructed to answer with a single word, 'yes' or 'no'. The first
    alphabetic word of the reply decides the result, so replies that add punctuation
    or an explanation after the verdict ("Yes, they travel with a dog.") are still
    read correctly.

    Args:
        review_text: the review to analyse.
        category: category name inserted into the system prompt.
        model: name of the Ollama model to query.

    Returns:
        1 if the reply starts with the word 'yes' (case-insensitive), otherwise 0.
        Errors raised by ``ollama.chat`` are not caught here.
    """
    messages = [
        {"role": "system",
         "content": (
             "You are a strict classifier. Your task is to analyze a review and determine whether the "
             f"traveler(s) mentioned in the review have a very specific need in the category: '{category}'. "
             f"Respond strictly with 'yes' if the review indicates they travel with {category}, "
             "or 'no' if not. Your response must be ONE word only, without any explanation or extra text."
         )},
        {"role": "assistant",
         "content": "Understood. I will respond only with 'yes' or 'no', one word."},
        {"role": "user",
         "content": f"Here is the review to analyze:\n\n\"{review_text}\""}
    ]

    response = ollama.chat(model=model, messages=messages)
    answer = response["message"]["content"].strip().lower()

    # Judge on the first word only: stripping every non-letter and comparing the whole
    # reply to 'yes' would turn "yes, because ..." into one long string and score it 0.
    first_word = re.search(r'[a-z]+', answer)
    return 1 if first_word is not None and first_word.group() == 'yes' else 0

In [195]:
def _merge_chunk_predictions(chunk_preds, n_labels):
    """OR-combine per-chunk binary vectors: a label is 1 if any chunk predicted it."""
    merged = [0] * n_labels
    for chunk_pred in chunk_preds:
        for i in range(n_labels):
            if chunk_pred[i]:
                merged[i] = 1
    return merged


def _keyword_vector(category_label):
    """Binary vector (CATEGORIES order) from a review's space-joined keyword categories."""
    label = str(category_label).lower()
    return [1 if cat in label else 0 for cat in CATEGORIES]


def _llm_backs_one_method(bert_preds, kw_preds, llm_preds):
    """True when, on every category the LLM judged, it sides with one and the same method."""
    judged = [i for i, verdict in enumerate(llm_preds) if verdict is not None]
    if not judged:
        return False
    return (
        all(llm_preds[i] == bert_preds[i] for i in judged)
        or all(llm_preds[i] == kw_preds[i] for i in judged)
    )


def main():
    """Run the cross-validation pipeline and write validated / disputed reviews to CSV.

    Steps:
        1. Read the keyword-categorised reviews and the original dataset.
        2. Join them on ``id`` and collapse to one row per review (original text plus
           the space-joined keyword categories).
        3. Load the tokenizer and the fine-tuned BERT classifier.
        4. Predict labels per chunk with BERT and OR-combine them per review.
        5. Convert the keyword categories to a binary vector in CATEGORIES order.
        6. Attach ``bert_<category>`` and ``kw_<category>`` columns.
        7. Compare both vectors. Identical vectors are validated as 'agreed'. Otherwise
           the LLM is asked about each category where they differ; the review is
           'llm_validated' if the LLM sides with the same method on all of those
           categories, else 'disputed'.
        8. Write both groups to CSV and print the counts.

    Reads CATEGORIES, MAX_LENGTH, BATCH_SIZE, NUM_THREADS, THRESHOLD, TOKENIZER_PATH,
    MODEL_WEIGHTS and ``device`` from module scope. Returns nothing.
    """
    n_labels = len(CATEGORIES)

    # 1. Load the data
    print("Chargement des données...")
    keywords_df = pl.read_csv("../data/processed/data_categorized/key_words_data_accessiblego.csv")
    original_df = pl.read_csv("../data/original/dataset/data_accessiblego.csv")

    # 2. Join to recover the original review text, one row per id.
    #    maintain_order=True keeps the row order reproducible from run to run.
    keywords_df = keywords_df.rename({"review": "kw_review"})
    original_reviews = original_df.select(["id", "review"])
    df = keywords_df.join(original_reviews, on="id", how="left")
    df = df.group_by('id', maintain_order=True).agg(
        pl.col('review').first(),
        pl.col('category').cast(pl.Utf8).str.join(delimiter=' ')
    )

    # 3. Load the tokenizer and the fine-tuned BERT model
    print("Chargement du tokenizer...")
    tokenizer = BertTokenizer.from_pretrained(TOKENIZER_PATH)

    print("Chargement du modèle BERT...")
    model = BertMultiLabelClassifier(n_classes=n_labels)
    model.load_state_dict(torch.load(MODEL_WEIGHTS, map_location=device))
    model.to(device)
    model.eval()

    # 4. BERT prediction per chunk; a review is positive for a label if any chunk is
    print("Prédiction BERT avec chunking...")
    bert_predictions = []
    for review in tqdm(df['review'], desc="Traitement des reviews"):
        chunks = split_review_chunks(str(review), MAX_LENGTH)
        chunk_preds = predict_bert_chunks(chunks, model, tokenizer, THRESHOLD, BATCH_SIZE)
        bert_predictions.append(_merge_chunk_predictions(chunk_preds, n_labels))

    # 5. Keyword categories as binary vectors
    keyword_preds = [_keyword_vector(category) for category in df['category']]

    # 6. Add the predictions to the DataFrame (bert_* columns first, then kw_*)
    df = df.with_columns(
        [pl.Series(f"bert_{cat}", [p[i] for p in bert_predictions]) for i, cat in enumerate(CATEGORIES)]
        + [pl.Series(f"kw_{cat}", [p[i] for p in keyword_preds]) for i, cat in enumerate(CATEGORIES)]
    )

    # 7. Compare and filter
    print("Comparaison des prédictions...")
    validated_rows = []
    disputed_rows = []

    # One executor for the whole loop instead of creating and tearing one down per row.
    with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        for row in tqdm(df.iter_rows(named=True), total=len(df), desc="Validation"):
            bert_preds = [row[f"bert_{cat}"] for cat in CATEGORIES]
            kw_preds = [row[f"kw_{cat}"] for cat in CATEGORIES]

            if bert_preds == kw_preds:
                # Perfect agreement: no LLM call needed
                validated_rows.append({
                    **row,
                    'validation_status': 'agreed',
                    **{f"llm_{cat}": None for cat in CATEGORIES}
                })
                continue

            # Disagreement: ask the LLM only about the categories where the methods differ
            futures = {
                executor.submit(classify_review_ollama, row['review'], cat): i
                for i, cat in enumerate(CATEGORIES)
                if bert_preds[i] != kw_preds[i]
            }
            llm_preds = [None] * n_labels
            for future in as_completed(futures):
                llm_preds[futures[future]] = future.result()

            llm_columns = {f"llm_{cat}": verdict for cat, verdict in zip(CATEGORIES, llm_preds)}

            # On a disputed category BERT and keywords differ, so a binary LLM verdict
            # always equals one of them; an any()-based check can never fail. Require the
            # LLM to back the same method on every disputed category instead.
            if _llm_backs_one_method(bert_preds, kw_preds, llm_preds):
                validated_rows.append({**row, 'validation_status': 'llm_validated', **llm_columns})
            else:
                disputed_rows.append({**row, 'validation_status': 'disputed', **llm_columns})

    # 8. Save the results.
    #    infer_schema_length=None types each column from all rows, so an llm_* column that
    #    is None in the first rows and an int later is still typed correctly.
    print("Sauvegarde des résultats...")
    validated_df = pl.DataFrame(validated_rows, infer_schema_length=None)
    disputed_df = pl.DataFrame(disputed_rows, infer_schema_length=None)

    validated_df.write_csv("../data/processed/data_validated/good/validated_reviews.csv")
    disputed_df.write_csv("../data/processed/data_validated/rejected/disputed_reviews.csv")

    print(f"\n✓ Pipeline terminée:")
    print(f"  - Reviews validées: {len(validated_rows)}")
    print(f"  - Reviews à vérifier: {len(disputed_rows)}")


In [196]:
# Entry point: run the validation pipeline when this cell/script is executed directly
if __name__ == "__main__":
    main()

Chargement des données...
Chargement du tokenizer...
Chargement du modèle BERT...
Prédiction BERT avec chunking...


Traitement des reviews: 100%|██████████| 1487/1487 [00:15<00:00, 97.52it/s] 


Comparaison des prédictions...


Validation:   0%|          | 0/1487 [00:00<?, ?it/s]2025-11-19 21:44:38,603 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Validation:   0%|          | 2/1487 [00:00<06:58,  3.55it/s]2025-11-19 21:44:39,086 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Validation:   0%|          | 4/1487 [00:01<06:22,  3.88it/s]2025-11-19 21:44:39,599 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Validation:   0%|          | 5/1487 [00:01<08:14,  3.00it/s]2025-11-19 21:44:40,126 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Validation:   0%|          | 7/1487 [00:02<07:26,  3.31it/s]2025-11-19 21:44:41,953 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Validation:   0%|          | 7/1487 [00:03<13:47,  1.79it/s]


KeyboardInterrupt: 

In [None]:
# NOTE(review): unfinished scratch cell - the `for` statement below has no body, so this
# cell cannot run as written. `df` is only assigned inside main() or by the later
# test.csv cell, so it may also be undefined here on a fresh kernel.
rows = iter(df.iter_rows(named=True))
for row in tqdm(rows, total=len(df), desc="Validation"):

# Test

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
import polars as pl
from tqdm import tqdm

# Configuration
BERT_PATH = "../models/bert-base-uncased"
TOKENIZER_PATH = "bert/bert_tokenizer_pt"
MODEL_WEIGHTS = "bert/best_weights.pth"
MAX_SEQ_LEN = 128
THRESHOLD = 0.95
BATCH_SIZE = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
classes = ["handicap", "pet", "child"]

# The functions in this cell read MAX_LENGTH and CATEGORIES, not MAX_SEQ_LEN / classes.
# Define them here so the cell does not depend on leftover kernel state. The values
# match the pipeline configuration cell, so results are unchanged under Run All.
MAX_LENGTH = 256
CATEGORIES = list(classes)


# ===============================
# 1) Model definition
# ===============================
class BertMultiLabelClassifier(nn.Module):
    """BERT encoder followed by dropout, a linear head and a sigmoid (multi-label).

    NOTE(review): this repeats the class defined in the pipeline section of the
    notebook; running this cell rebinds the name to this definition.

    Args:
        n_classes: number of independent labels to score.
        dropout: dropout probability applied to the pooled representation.
    """

    def __init__(self, n_classes, dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(BERT_PATH)
        hidden_dim = self.bert.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_dim, n_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return one sigmoid score per label for each input sequence."""
        hidden_states = self.bert(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        # Plain mean over the sequence axis (dim=1); attention_mask is not used for pooling.
        pooled = hidden_states.mean(dim=1)
        return self.sigmoid(self.classifier(self.dropout(pooled)))


def split_review_chunks(review_text, max_length=128):
    """Split a review into chunks of whitespace-separated words.

    Chunks are measured in words, not tokenizer tokens: each chunk holds at most
    ``max_length - 20`` words (and never fewer than one word per chunk).
    NOTE(review): the 20-word margin presumably leaves headroom for special and
    sub-word tokens - confirm against the tokenizer actually used.
    NOTE(review): a function with this name is also defined in the pipeline section
    of the notebook; running this cell rebinds the name to this definition.

    Args:
        review_text: the review as a string.
        max_length: nominal length budget; the effective chunk size is ``max_length - 20``.

    Returns:
        A list of chunk strings with words re-joined by single spaces. If the review
        contains no words (empty or whitespace only), ``[review_text]`` is returned.
    """
    words = review_text.split()
    if not words:
        return [review_text]

    # Slice directly instead of re-joining and re-splitting the growing chunk per word.
    chunk_size = max(1, max_length - 20)
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]


def predict_bert_chunks(reviews, model, tokenizer, threshold=0.95, batch_size=32):
    """Score a list of texts with the classifier and return thresholded labels.

    No chunking happens here: callers pass already-split texts. The model is put
    in eval mode and run under ``torch.no_grad()``.
    NOTE(review): a function with this name is also defined in the pipeline section
    of the notebook; running this cell rebinds the name to this definition.

    Args:
        reviews: list of strings to classify.
        model: callable taking ``input_ids`` / ``attention_mask`` and returning scores.
        tokenizer: tokenizer called on each batch of strings.
        threshold: a score must be strictly greater than this to yield 1.
        batch_size: number of texts per forward pass.

    Returns:
        One list of 0/1 ints per input text, in input order.
    """
    model.eval()
    binary_rows = []

    batch_starts = range(0, len(reviews), batch_size)
    with torch.no_grad():
        for start in batch_starts:
            # Every text is padded/truncated to the module-level MAX_LENGTH.
            # NOTE(review): if the classifier pools over all positions, the padded length
            # can influence the scores - confirm before switching to dynamic padding.
            tokens = tokenizer(
                reviews[start:start + batch_size],
                padding="max_length",
                truncation=True,
                max_length=MAX_LENGTH,
                return_tensors="pt"
            )

            scores = model(
                input_ids=tokens["input_ids"].to(device),
                attention_mask=tokens["attention_mask"].to(device)
            )

            above_threshold = (scores.cpu().numpy() > threshold).astype(int)
            binary_rows += above_threshold.tolist()

    return binary_rows


def main():
    """Test run of the BERT and keyword stages only; writes the result to ``test.csv``.

    Loads the keyword-categorised reviews and the original dataset, joins them on
    ``id``, predicts labels with the fine-tuned BERT model (per chunk, OR-combined per
    review), converts the keyword categories to binary vectors, and saves one row per
    review with ``bert_<category>`` and ``kw_<category>`` columns. No LLM step.

    Reads CATEGORIES, MAX_LENGTH, THRESHOLD, BATCH_SIZE, TOKENIZER_PATH, MODEL_WEIGHTS
    and ``device`` from module scope; CATEGORIES and MAX_LENGTH must be defined before
    this runs (MAX_SEQ_LEN and ``classes`` are not read here). Returns nothing.
    NOTE(review): a ``main`` is also defined in the pipeline section of the notebook;
    running this cell rebinds the name to this definition.
    """
    n_labels = len(CATEGORIES)

    # 1. Load the data
    print("Chargement des données...")
    keywords_df = pl.read_csv("../data/processed/data_categorized/key_words_data_accessiblego.csv")
    original_df = pl.read_csv("../data/original/dataset/data_accessiblego.csv")

    # 2. Join to recover the original review text, one row per id.
    #    maintain_order=True keeps the row order reproducible from run to run.
    keywords_df = keywords_df.rename({"review": "kw_review"})
    original_reviews = original_df.select(["id", "review"])
    df = keywords_df.join(original_reviews, on="id", how="left")
    df = df.group_by('id', maintain_order=True).agg(
        pl.col('review').first(),
        pl.col('category').cast(pl.Utf8).str.join(delimiter=' ')
    )

    # 3. Load the tokenizer and the fine-tuned BERT model
    print("Chargement du tokenizer...")
    tokenizer = BertTokenizer.from_pretrained(TOKENIZER_PATH)

    print("Chargement du modèle BERT...")
    model = BertMultiLabelClassifier(n_classes=n_labels)
    model.load_state_dict(torch.load(MODEL_WEIGHTS, map_location=device))
    model.to(device)
    model.eval()

    # 4. BERT prediction per chunk
    print("Prédiction BERT avec chunking...")
    bert_predictions = []

    for review in tqdm(df['review'], desc="Traitement des reviews"):
        chunks = split_review_chunks(str(review), MAX_LENGTH)
        chunk_preds = predict_bert_chunks(chunks, model, tokenizer, THRESHOLD, BATCH_SIZE)

        # Aggregation: a label is positive for the review if any chunk is positive.
        # Sized from CATEGORIES so it stays consistent with the model's n_classes.
        final_pred = [
            1 if any(pred[i] for pred in chunk_preds) else 0
            for i in range(n_labels)
        ]
        bert_predictions.append(final_pred)

    # 5. Keyword categories as binary vectors, in CATEGORIES order
    keyword_preds = []
    for category in df['category']:
        label = str(category).lower()
        keyword_preds.append([1 if cat in label else 0 for cat in CATEGORIES])

    # 6. Add the predictions to the DataFrame (bert_* columns first, then kw_*)
    df = df.with_columns(
        [pl.Series(f"bert_{cat}", [p[i] for p in bert_predictions]) for i, cat in enumerate(CATEGORIES)]
        + [pl.Series(f"kw_{cat}", [p[i] for p in keyword_preds]) for i, cat in enumerate(CATEGORIES)]
    )

    print("Sauvegarde des résultats...")
    df.write_csv("test.csv")
    print("Terminé!")


# Entry point: run the test main() defined in this cell when executed directly
if __name__ == "__main__":
    main()

Chargement des données...
Chargement du tokenizer...
Chargement du modèle BERT...
Prédiction BERT avec chunking...


Traitement des reviews: 100%|██████████| 1487/1487 [00:15<00:00, 98.93it/s] 

Sauvegarde des résultats...
Terminé!





In [176]:
# Sanity check on the test run: load the CSV written by the test main()
df = pl.read_csv("test.csv")
# bert_pet holds 0/1 predictions, so its sum is the number of reviews BERT flagged as 'pet'
somme_bert_pet = df.select(pl.col("bert_pet").sum()).item()

print(somme_bert_pet)

40
