# Pipeline Validation

We created a pipeline that combines the results of three methods to have a "cross validation" on our reveiws classifications:

- the result of the keywords extractionn that classified a review in the theme related to the keyword
- the result of a finetunned BERT model on the review enabling better classification
- the result of a small LLM on the review enabling better context understanding

If a review gets classified the same way by each of these 3 methods it is considered valid. 
Either way, it has to be submitted to human validation

In [4]:
import polars as pl
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import ollama
from concurrent.futures import ThreadPoolExecutor, as_completed
import re
from pathlib import Path
from tqdm import tqdm
import os
import logging
from dotenv import load_dotenv
import torch.nn as nn
import numpy as np


In [5]:
# Simple loger for pipeline execution
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Filterign HTTP logging
class HttpStatusFilter(logging.Filter):
    def filter(self, record):
        message = record.getMessage()
        if 'HTTP/1.1 200' not in message:
            record.levelname = "WARNING"
            record.levelno = logging.WARNING
        return 'HTTP/1.1 200' not in message
    
logging.getLogger("httpx").addFilter(HttpStatusFilter())

In [None]:
# Global variables
load_dotenv(dotenv_path="../../.env")
NUM_THREAD = int(os.environ.get("NUM_THREADS"))
logger.info(f"NUM_THREAD fixed to {NUM_THREAD}")

2025-11-21 11:09:38,352 - INFO - NUM_THREAD fixed to 8


In [None]:
# Configuration
CATEGORIES = ['handicap', 'pet', 'child']
MAX_LENGTH = 256
BATCH_SIZE = 32
NUM_THREADS = 4
BERT_PATH = "../../models/bert-base-uncased"
TOKENIZER_PATH = "../bert/bert_tokenizer_pt"
MODEL_WEIGHTS = "../bert/best_weights.pth"
THRESHOLD = 0.95


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device utilisation: {device}")

# Création des dossiers de sortie
Path("../../data/processed/data_validated/good").mkdir(parents=True, exist_ok=True)
Path("../../data/processed/data_validated/rejected").mkdir(parents=True, exist_ok=True)

Device utilisation: cuda


In [8]:
class BertMultiLabelClassifier(nn.Module):
    def __init__(self, n_classes, dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(BERT_PATH)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = torch.mean(outputs.last_hidden_state, dim=1)
        x = self.dropout(pooled_output)
        x = self.classifier(x)
        return self.sigmoid(x)

In [None]:
def split_review_chunks(review_text, max_length=128):
    """Divise une review en chunks de max_length tokens"""
    words = review_text.split()
    chunks = []
    current_chunk = []
    
    for word in words:
        current_chunk.append(word)
        if len(' '.join(current_chunk).split()) >= max_length - 20:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
    
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    
    return chunks if chunks else [review_text]


def predict_bert_chunks(reviews, model, tokenizer, threshold=0.95, batch_size=32):
    """Prédit les catégories pour une liste de reviews avec chunking"""
    model.eval()
    all_predictions = []
    
    with torch.no_grad():
        for i in range(0, len(reviews), batch_size):
            batch_reviews = reviews[i:i + batch_size]
            
            # Encoder le batch
            encoded = tokenizer(
                batch_reviews,
                padding="max_length",
                truncation=True,
                max_length=MAX_LENGTH,
                return_tensors="pt"
            )
            
            input_ids = encoded["input_ids"].to(device)
            attention_mask = encoded["attention_mask"].to(device)
            
            # Prédiction
            pred = model(input_ids=input_ids, attention_mask=attention_mask).cpu().numpy()
            pred_bin = (pred > threshold).astype(int)
            
            all_predictions.extend(pred_bin.tolist())
    
    return all_predictions


def classify_review_ollama(review_text, category, model="mistral"):
    """Classification via Ollama"""
    messages = [
        {"role": "system",
         "content": (
             "You are a strict classifier. Your task is to analyze a review and determine whether the "
             f"traveler(s) mentioned in the review have a very specific need in the category: '{category}'. "
             f"Respond strictly with 'yes' if the review indicates they travel with {category}, "
             "or 'no' if not. Your response must be ONE word only, without any explanation or extra text."
         )},
        {"role": "assistant",
         "content": "Understood. I will respond only with 'yes' or 'no', one word."},
        {"role": "user",
         "content": f"Here is the review to analyze:\n\n\"{review_text}\""}
    ]
    
    response = ollama.chat(model=model, messages=messages,messages=messages,options={"temperature": 0})
    answer = response["message"]["content"].strip().lower()
    cleaned = re.sub(r'[^a-z]', '', answer)
    
    return 1 if cleaned == 'yes' else 0

In [10]:
# Loading BERT model
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_PATH)
logger.info("BERT tokenizer loaded")

print("Chargement du modèle BERT...")
model = BertMultiLabelClassifier(n_classes=len(CATEGORIES))
model.load_state_dict(torch.load(MODEL_WEIGHTS, map_location=device))
model.to(device)
model.eval()
logger.info("BERT finetunned model loaded")

2025-11-21 11:09:55,934 - INFO - BERT tokenizer loaded


Chargement du modèle BERT...


2025-11-21 11:09:57,176 - INFO - BERT finetunned model loaded


In [None]:
def main():
    
    # Loading data
    logger.info("Begining of the pipeline")
    keywords_df = pl.read_csv("../../data/processed/data_categorized/key_words_data_accessiblego.csv")
    original_df = pl.read_csv("../../data/original/dataset/data_accessiblego.csv")
    logger.info("Data loaded")
    
    # Fusion to get the original reviews
    keywords_df = keywords_df.rename({"review": "kw_review"})
    original_reviews = original_df.select(["id", "review"])
    df = keywords_df.join(original_reviews, on="id", how="left")
    df = df.group_by('id').agg(
        pl.col('review').first(), 
        pl.col('category').cast(pl.Utf8).str.join(delimiter=' ')
    )

    # BERT prediction on chunks
    logger.info("BERT prediction with chunking...")
    bert_predictions = []
    
    for review in tqdm(df['review'], desc="Review Processing"):
        chunks = split_review_chunks(str(review), MAX_LENGTH)
        chunk_preds = predict_bert_chunks(chunks, model, tokenizer, THRESHOLD, BATCH_SIZE)
        
        # Agregation: if a chunk is positive the while review is
        final_pred = [0, 0, 0]
        for pred in chunk_preds:
            for i in range(3):
                if pred[i]:
                    final_pred[i] = 1
        
        bert_predictions.append(final_pred)
    
    # Conversion of keywords extraction category to one hot format
    keyword_preds = []
    for category in df['category']:
        pred = [0, 0, 0]
        if 'handicap' in str(category).lower():
            pred[0] = 1
        if 'pet' in str(category).lower():
            pred[1] = 1
        if 'child' in str(category).lower():
            pred[2] = 1
        keyword_preds.append(pred)
    
    # Adding predicitons to the dataframe
    df = df.with_columns([
        pl.Series("kw_handicap", [p[0] for p in keyword_preds]),
        pl.Series("kw_pet", [p[1] for p in keyword_preds]),
        pl.Series("kw_child", [p[2] for p in keyword_preds]),
        pl.Series("bert_handicap", [p[0] for p in bert_predictions]),
        pl.Series("bert_pet", [p[1] for p in bert_predictions]),
        pl.Series("bert_child", [p[2] for p in bert_predictions])  
    ])
    
    # Comparaison and filtering
    logger.info("Prediction comparaison")
    validated_rows = []
    disputed_rows = []
    
    rows = iter(df.iter_rows(named=True))
    for row in tqdm(rows, total=len(df), desc="Validation"):
        bert_preds = [row['bert_handicap'], row['bert_pet'], row['bert_child']]
        kw_preds = [row['kw_handicap'], row['kw_pet'], row['kw_child']]
        
        if bert_preds == kw_preds:
            # Agreement between kw and BERT
            validated_rows.append({
                **row,
                'validation_status': 'agreed',
                'llm_handicap':row['kw_handicap'],
                'llm_pet': row['kw_pet'],
                'llm_child': row['kw_child']
            })
        else:
            # Desagreement: ask LLM
            llm_preds = [None, None, None]
            
            with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
                futures = {}
                for i, cat in enumerate(CATEGORIES):
                    if bert_preds[i] != kw_preds[i]:
                        futures[executor.submit(classify_review_ollama, row['review'], cat)] = i
                    else:
                        llm_preds[i]= kw_preds[i]
                
                for future in as_completed(futures):
                    cat_idx = futures[future]
                    llm_preds[cat_idx] = future.result()
            
            # Verify if a the LLM agrees with keywords predictions
            agrees_with_kw = any(llm_preds[i] == kw_preds[i] for i in range(3) if llm_preds[i] is not None)
            
            #if agrees_with_kw:
            if agrees_with_kw:
                validated_rows.append({
                    **row,
                    'validation_status': 'llm_validated',
                    'llm_handicap': llm_preds[0],
                    'llm_pet': llm_preds[1],
                    'llm_child': llm_preds[2]
                })
            else:
                disputed_rows.append({
                    **row,
                    'validation_status': 'disputed',
                    'llm_handicap': llm_preds[0],
                    'llm_pet': llm_preds[1],
                    'llm_child': llm_preds[2]
                })
    
    # Saving results
    logger.info("Saving results...")
    validated_df = pl.DataFrame(validated_rows)
    disputed_df = pl.DataFrame(disputed_rows)
    
    validated_df.write_csv("../../data/processed/data_validated/good/validated_reviews.csv")
    disputed_df.write_csv("../../data/processed/data_validated/rejected/disputed_reviews.csv")
    
    logger.info("Pipeline ended")
    logger.info(f"  - Reviews validated: {len(validated_rows)}")
    logger.info(f"  - Reviews to validate: {len(disputed_rows)}")


In [None]:
if __name__ == "__main__":
    main()

2025-11-20 18:28:00,634 - INFO - Begining of the pipeline
2025-11-20 18:28:00,651 - INFO - Data loaded
2025-11-20 18:28:00,673 - INFO - BERT prediction with chunking...
Review Processing: 100%|██████████| 1487/1487 [00:52<00:00, 28.33it/s]
2025-11-20 18:28:53,174 - INFO - Prediction comparaison
Validation: 100%|██████████| 1487/1487 [08:03<00:00,  3.07it/s]
2025-11-20 18:36:56,855 - INFO - Saving results...
2025-11-20 18:36:56,872 - INFO - Pipeline ended
2025-11-20 18:36:56,872 - INFO -   - Reviews validated: 1486
2025-11-20 18:36:56,872 - INFO -   - Reviews to validate: 1


## Bout de code uniquement pour faire tourner BERt sans le LLM et améliorer le fine tuning du modèle (à supprimer à la fin)

In [None]:
def main_amelioration_finetuning():
    
    # Loading data
    logger.info("Begining of the pipeline")
    keywords_df = pl.read_csv("../../data/processed/data_categorized/key_words_data_hotel_reviews_2_cleaned.csv")
    original_df = pl.read_csv("../../data/original/dataset/data_hotel_reviews_2.csv")
    logger.info("Data loaded")
    
    # Fusion to get the original reviews
    keywords_df = keywords_df.rename({"review": "kw_review"})
    original_reviews = original_df.select(["id", "review"])
    df = keywords_df.join(original_reviews, on="id", how="left")
    df = df.group_by('id').agg(
        pl.col('review').first(), 
        pl.col('category').cast(pl.Utf8).str.join(delimiter=' ')
    )

    # BERT prediction on chunks
    logger.info("BERT prediction with chunking...")
    bert_predictions = []
    
    for review in tqdm(df['review'], desc="Review Processing"):
        chunks = split_review_chunks(str(review), MAX_LENGTH)
        chunk_preds = predict_bert_chunks(chunks, model, tokenizer, THRESHOLD, BATCH_SIZE)
        
        # Agregation: if a chunk is positive the while review is
        final_pred = [0, 0, 0]
        for pred in chunk_preds:
            for i in range(3):
                if pred[i]:
                    final_pred[i] = 1
        
        bert_predictions.append(final_pred)
    
    # Conversion of keywords extraction category to one hot format
    keyword_preds = []
    for category in df['category']:
        pred = [0, 0, 0]
        if 'handicap' in str(category).lower():
            pred[0] = 1
        if 'pet' in str(category).lower():
            pred[1] = 1
        if 'child' in str(category).lower():
            pred[2] = 1
        keyword_preds.append(pred)
    
    # Adding predicitons to the dataframe
    df = df.with_columns([
        pl.Series("kw_handicap", [p[0] for p in keyword_preds]),
        pl.Series("kw_pet", [p[1] for p in keyword_preds]),
        pl.Series("kw_child", [p[2] for p in keyword_preds]),
        pl.Series("bert_handicap", [p[0] for p in bert_predictions]),
        pl.Series("bert_pet", [p[1] for p in bert_predictions]),
        pl.Series("bert_child", [p[2] for p in bert_predictions])  
    ])
    
    # Comparaison and filtering
    logger.info("Prediction comparaison")
    validated_rows = []
    disputed_rows = []
    
    rows = iter(df.iter_rows(named=True))
    for row in tqdm(rows, total=len(df), desc="Validation"):
        bert_preds = [row['bert_handicap'], row['bert_pet'], row['bert_child']]
        kw_preds = [row['kw_handicap'], row['kw_pet'], row['kw_child']]
        
        if bert_preds == kw_preds:
            # Agreement between kw and BERT
            validated_rows.append({
                **row,
                'validation_status': 'agreed',
            })

        else:
            disputed_rows.append({
                **row,
                'validation_status': 'disputed',
            })
    
    # Saving results
    logger.info("Saving results...")
    disputed_df = pl.DataFrame(disputed_rows)

    
    disputed_df.write_csv("reviews_à_regarder_hotel_reviews_2.csv")
    
    logger.info("Pipeline ended")
    logger.info(f"  - Reviews validated: {len(disputed_df)}")


In [14]:
if __name__ == "__main__":
    main_amelioration_finetuning()

2025-11-21 11:11:50,871 - INFO - Begining of the pipeline
2025-11-21 11:11:50,960 - INFO - Data loaded
2025-11-21 11:11:50,979 - INFO - BERT prediction with chunking...
Review Processing: 100%|██████████| 1292/1292 [00:11<00:00, 109.77it/s]
2025-11-21 11:12:02,751 - INFO - Prediction comparaison
Validation: 100%|██████████| 1292/1292 [00:00<00:00, 732500.78it/s]
2025-11-21 11:12:02,755 - INFO - Saving results...
2025-11-21 11:12:02,755 - INFO - Pipeline ended
2025-11-21 11:12:02,755 - INFO -   - Reviews validated: 383
