# Evaluation of classification methods

This notebook evaluates, on the same 200 labelled reviews, the following 3 classification methods used within our pipelines:

- Keyword extraction
- Fine-tuned BERT model
- LLM Mistral Small 7B

In [None]:
import logging
import os
import re
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import ollama
import pandas as pd
import torch
import torch.nn as nn
from dotenv import load_dotenv
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score,accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel

In [None]:
# Simple logger for pipeline execution: INFO level, "<time> - <level> - <message>" lines
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Filtering HTTP logging
class HttpStatusFilter(logging.Filter):
    """Drop successful-request log lines and escalate every other record.

    A record whose rendered message contains ``'HTTP/1.1 200'`` is discarded.
    Any other record is relabelled as WARNING and kept.
    """

    def filter(self, record):
        if 'HTTP/1.1 200' in record.getMessage():
            return False
        # Not a 200 response: surface the record as a warning.
        record.levelno = logging.WARNING
        record.levelname = "WARNING"
        return True


# Attach the filter to the "httpx" logger
logging.getLogger("httpx").addFilter(HttpStatusFilter())

In [None]:
# Global variables
load_dotenv(dotenv_path="../../.env")

# NUM_THREADS sizes the thread pool used for the LLM calls. When it is missing
# (or empty) fall back to a single worker instead of failing on int(None) with
# an unhelpful TypeError.
_num_threads_raw = os.environ.get("NUM_THREADS")
if not _num_threads_raw:
    logger.warning("NUM_THREADS is not set; defaulting to 1 worker thread")
    NUM_THREAD = 1
else:
    NUM_THREAD = int(_num_threads_raw)
logger.info(f"NUM_THREAD fixed to {NUM_THREAD}")

In [None]:
# Categories (also the names of the label columns read from the test CSV)
classes = ["handicap", "pet", "child"]

# Test data loading
logger.info("Loading test data...")
df = pd.read_csv("../../data/original/fine_tunning/data_test.csv")
# Ground-truth label matrix: one row per review, one column per entry of `classes`
y_true = df[classes].values.astype(int)

### Evaluation of Keyword extraction

In [None]:
# Load datasets
df_truth = df  # ground truth, kept in the row order of `df`
df_kw = pd.read_csv("../../data/processed/data_categorized/key_words_data_test.csv")

# Combine keywords per review (each review now has a unique id)
df_kw = (
    df_kw.groupby("id", as_index=False)
         .agg({
             "review": "first",
             "category": lambda x: " ".join(x.astype(str))
         })
)

# Initialize predictions at 0: one row per ground-truth review, indexed by id
df_pred = pd.DataFrame(0, index=df_truth.index, columns=classes)
df_pred["id"] = df_truth["id"]
df_pred = df_pred.set_index("id")

# Prediction using keywords
for _, row in df_kw.iterrows():
    review_id = row["id"]
    if review_id not in df_pred.index:
        continue
    # split in case multiple categories were concatenated for the same review
    for cat in str(row["category"]).strip().lower().split():
        if cat in classes:
            df_pred.at[review_id, cat] = 1

df_pred = df_pred.reset_index()  # restore id column

# df_pred was built row-for-row from df_truth, so both are already aligned.
# Do NOT sort by id or rebind the shared `y_true` here: the later cells predict
# over df["review"] and score against `y_true`, so it must keep the row order
# of `df`.
y_true_kw = df_truth[classes].values.astype(int)
y_pred = df_pred[classes].values


# Compute metrics per class
for i, label in enumerate(classes):
    acc = accuracy_score(y_true_kw[:, i], y_pred[:, i])
    prec = precision_score(y_true_kw[:, i], y_pred[:, i], zero_division=0)
    rec = recall_score(y_true_kw[:, i], y_pred[:, i], zero_division=0)
    f1 = f1_score(y_true_kw[:, i], y_pred[:, i], zero_division=0)

    print(f"Label: {label}")
    print(f" Accuracy : {acc:.4f}")
    print(f" Precision: {prec:.4f}")
    print(f" Recall   : {rec:.4f}")
    print(f" F1-score : {f1:.4f}\n")

# Global metrics
prec_micro = precision_score(y_true_kw, y_pred, average="micro", zero_division=0)
rec_micro  = recall_score(y_true_kw, y_pred, average="micro", zero_division=0)
f1_micro   = f1_score(y_true_kw, y_pred, average="micro", zero_division=0)

prec_macro = precision_score(y_true_kw, y_pred, average="macro", zero_division=0)
rec_macro  = recall_score(y_true_kw, y_pred, average="macro", zero_division=0)
f1_macro   = f1_score(y_true_kw, y_pred, average="macro", zero_division=0)

print("Global metrics:")
print(f" Micro Precision: {prec_micro:.4f}, Recall: {rec_micro:.4f}, F1: {f1_micro:.4f}")
print(f" Macro Precision: {prec_macro:.4f}, Recall: {rec_macro:.4f}, F1: {f1_macro:.4f}")


### Evaluation of BERT model

In [None]:
# Parameters
BERT_PATH = "../../models/bert-base-uncased"  # passed to BertModel.from_pretrained
TOKENIZER_PATH = "../bert/bert_tokenizer_pt"  # passed to BertTokenizer.from_pretrained
MODEL_WEIGHTS = "../bert/best_weights_v3.pth"  # state dict loaded into the classifier
MAX_SEQ_LEN = 256  # the tokenizer pads / truncates every review to this length
threshold = 0.95  # a label is predicted when its sigmoid score is strictly above this
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model definition
class BertMultiLabelClassifier(nn.Module):
    """BERT encoder followed by a linear head giving one sigmoid score per class."""

    def __init__(self, n_classes, dropout=0.3):
        super().__init__()
        # Attribute names are part of the checkpoint format (state_dict keys).
        self.bert = BertModel.from_pretrained(BERT_PATH)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Average the token embeddings along dim 1. The attention mask is not
        # applied to this average, so every position contributes equally.
        sentence_vec = encoder_out.last_hidden_state.mean(dim=1)
        scores = self.classifier(self.dropout(sentence_vec))
        return self.sigmoid(scores)


# Load model and tokenizer
logger.info("Loading tokenizer...")
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_PATH)

logger.info("Loading model...")
model = BertMultiLabelClassifier(n_classes=len(classes))
# map_location remaps the saved tensors onto `device` while loading
model.load_state_dict(torch.load(MODEL_WEIGHTS, map_location=device))
model.to(device)
model.eval()  # evaluation mode for inference

# Encoding function
def encode_batch(sentences):
    """Tokenize `sentences` and return the `(input_ids, attention_mask)` pair.

    Every sequence is padded / truncated to MAX_SEQ_LEN and the tokenizer is
    asked for PyTorch tensors.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_SEQ_LEN,
        "return_tensors": "pt",
    }
    batch = tokenizer(list(sentences), **tokenizer_options)
    return batch["input_ids"], batch["attention_mask"]


# Prediction
logger.info("Predicting...")
input_ids, attention_mask = encode_batch(df["review"])

# Run inference in mini-batches: one forward pass over the whole test set
# holds every activation at once and can exhaust (GPU) memory.
INFERENCE_BATCH_SIZE = 32
batch_scores = []
with torch.no_grad():
    for start in range(0, input_ids.size(0), INFERENCE_BATCH_SIZE):
        stop = start + INFERENCE_BATCH_SIZE
        scores = model(
            input_ids=input_ids[start:stop].to(device),
            attention_mask=attention_mask[start:stop].to(device),
        )
        batch_scores.append(scores.cpu())

pred = torch.cat(batch_scores).numpy()
y_pred_bin = (pred > threshold).astype(int)

# Ground truth re-derived from `df`, i.e. in the same row order as the
# predictions above. The shared `y_true` may have been re-sorted by an earlier
# cell, which would silently misalign labels and predictions.
y_true = df[classes].values.astype(int)


# Metrics
logger.info("Metrics multilabel")

for i, label in enumerate(classes):
    acc = accuracy_score(y_true[:, i], y_pred_bin[:, i])
    prec = precision_score(y_true[:, i], y_pred_bin[:, i], zero_division=0)
    rec = recall_score(y_true[:, i], y_pred_bin[:, i], zero_division=0)
    f1 = f1_score(y_true[:, i], y_pred_bin[:, i], zero_division=0)

    print(f"Label: {label}")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1-score : {f1:.4f}\n")


# Global metrics
prec_micro = precision_score(y_true, y_pred_bin, average="micro", zero_division=0)
rec_micro = recall_score(y_true, y_pred_bin, average="micro", zero_division=0)
f1_micro = f1_score(y_true, y_pred_bin, average="micro", zero_division=0)

prec_macro = precision_score(y_true, y_pred_bin, average="macro", zero_division=0)
rec_macro = recall_score(y_true, y_pred_bin, average="macro", zero_division=0)
f1_macro = f1_score(y_true, y_pred_bin, average="macro", zero_division=0)

logger.info("Global metrics")
print(f"Micro Precision: {prec_micro:.4f}, Recall: {rec_micro:.4f}, F1: {f1_micro:.4f}")
print(f"Macro Precision: {prec_macro:.4f}, Recall: {rec_macro:.4f}, F1: {f1_macro:.4f}")


### Evaluation of LLM Mistral Small 7B

In [None]:
def classify_review_ollama(review_text, category, model="mistral"):
    """Ask an Ollama chat model whether `review_text` matches `category`.

    The category name is interpolated into a generic system prompt (zero-shot).
    Returns 1 when the reply, stripped, lower-cased and reduced to the letters
    a-z, is exactly 'yes'; returns 0 for anything else.
    """
    system_prompt = (
        "You are a strict classifier. Your task is to analyze a review and determine whether the "
        f"traveler(s) mentioned in the review have a very specific need in the category: '{category}'. "
        f"Respond strictly with 'yes' if the review indicates they travel with {category}, "
        "or 'no' if not. Your response must be ONE word only, without any explanation or extra text."
    )
    acknowledgement = "Understood. I will respond only with 'yes' or 'no', one word."
    user_prompt = f"Here is the review to analyze:\n\n\"{review_text}\""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "assistant", "content": acknowledgement},
        {"role": "user", "content": user_prompt},
    ]

    reply = ollama.chat(model=model, messages=messages, options={"temperature": 0})
    # Keep letters only so that replies such as "Yes." still count as 'yes'.
    letters_only = re.sub(r'[^a-z]', '', reply["message"]["content"].strip().lower())
    return int(letters_only == 'yes')

def classify_all_categories(review):
    """Return the prediction of `classify_review_ollama` for each entry of `classes`, in order."""
    predictions = []
    for category in classes:
        predictions.append(classify_review_ollama(review, category))
    return predictions

# Prediction: one task per review; executor.map returns results in input order
with ThreadPoolExecutor(max_workers=NUM_THREAD) as executor:
    y_pred = list(executor.map(classify_all_categories, df["review"]))

# Convert to a numpy matrix (rows = reviews, columns = classes)
y_pred = np.array(y_pred, dtype=int)

# Ground truth re-derived from `df`, i.e. in the same row order as the
# predictions above. The shared `y_true` may have been re-sorted by an earlier
# cell, which would silently misalign labels and predictions.
y_true = df[classes].values.astype(int)

# Metrics
for i, label in enumerate(classes):
    acc = accuracy_score(y_true[:, i], y_pred[:, i])
    prec = precision_score(y_true[:, i], y_pred[:, i], zero_division=0)
    rec = recall_score(y_true[:, i], y_pred[:, i], zero_division=0)
    f1 = f1_score(y_true[:, i], y_pred[:, i], zero_division=0)

    print(f"Label: {label}")
    print(f" Accuracy : {acc:.4f}")
    print(f" Precision: {prec:.4f}")
    print(f" Recall   : {rec:.4f}")
    print(f" F1-score : {f1:.4f}\n")


# Global metrics
prec_micro = precision_score(y_true, y_pred, average="micro", zero_division=0)
rec_micro  = recall_score(y_true, y_pred, average="micro", zero_division=0)
f1_micro   = f1_score(y_true, y_pred, average="micro", zero_division=0)

prec_macro = precision_score(y_true, y_pred, average="macro", zero_division=0)
rec_macro  = recall_score(y_true, y_pred, average="macro", zero_division=0)
f1_macro   = f1_score(y_true, y_pred, average="macro", zero_division=0)

print("Global metrics:")
print(f" Micro Precision: {prec_micro:.4f}, Recall: {rec_micro:.4f}, F1: {f1_micro:.4f}")
print(f" Macro Precision: {prec_macro:.4f}, Recall: {rec_macro:.4f}, F1: {f1_macro:.4f}")


# Evaluation of the few-shot prompt with Mistral Small 7B

In [None]:
def classify_review_ollama(review_text, category, model="mistral"):
    """Classify `review_text` for `category` with a few-shot Ollama chat prompt.

    Each supported category ("child", "pet", "handicap") has its own system
    prompt and its own labelled examples, shown to the model before the review.

    Returns:
        1 when the reply, stripped, lower-cased and reduced to the letters a-z,
        is exactly 'yes'; 0 otherwise.

    Raises:
        ValueError: if `category` is not one of the supported categories.
    """
    # category -> (system prompt, few-shot examples placed before the review)
    few_shot_prompts = {
        "child": (
            "You are a strict family-review classifier. Your task is to analyze a review and determine "
            "whether the traveler(s) are traveling with children. Especially, you need to determine"
            " if these children have a high chance to be under 18 years old."
            "Respond strictly with 'yes' if the review indicates people travelling with children, or 'no' if not. "
            "ONE word only, no explanations or extra text.",
            "Here are some examples:\n"
            "Review: \"We traveled with our kids and loved the family-friendly pool.\" -> yes\n"
            "Review: \"The hotel was great, but we went alone as a couple.\" -> no\n"
            "Review: \"I got there at 6:30, and a kid that apparently worked there (no id/uniform) was scrambling to set everything up\" -> no\n"
            "Review: \"My Grand kids loved the pool\" -> yes\n"
            "Review: \"I travelled to Dakota to see my son graduatation\" -> no\n"
            "Review: \"This family owned business has a welcoming staff which made us feel right at home\" -> no\n"
            "Review: \"If I had to ask one thing of Best Western, please replace the mattresses or box springs every time our kids moved at night\" -> yes\n\n",
        ),
        "pet": (
            "You are a strict pet-friendly-review classifier. Your task is to analyze a review and determine "
            "whether the traveler(s) are traveling with pets. "
            "Respond strictly with 'yes' if the review indicates they travel with pets, or 'no' if not. "
            "ONE word only, no explanations or extra text.",
            "Here are some examples:\n"
            "Review: \"Thanks again Cat!\" -> no\n"
            "Review: \"I only booked this hotel because it was dog friendly\" -> yes\n"
            "Review: \"I wanted to see if I could bring my service dog with me but they told me it was impossible at the front desk.\" -> yes\n"
            "Review: \"The bedsheets were smelling cat urine. Horrible !\" -> no\n"
            "Review: \"Perfect for travelers with cats or dogs.\" -> yes\n\n",
        ),
        "handicap": (
            "You are a strict business-travel-review classifier. Your task is to analyze a review and determine "
            "whether the traveler(s) have any type of handicap or if the reviews contains a specific needs"
            "associated with a disability (transporations, amenities, etc.) "
            "Respond strictly with 'yes' if the review indicates a handicaped traveler or a special need related to handicap travelling, or 'no' if not. "
            "ONE word only, no explanations or extra text.",
            "Here are some examples:\n"
            "Review: \"Plant to go to London in September Need information about Accessible Van in London airport\" -> yes\n"
            "Review: \"The room was great, big enough to move around in my power chair in both the bedroom and bathroom\" -> yes\n"
            "Review: \"I would like to sell my wheelchair.please contact me\" -> no\n"
            "Review: \"It's new digital travel magazine targeted exclusively for travelers with disabilities.\" -> no\n"
            "Review: \"Nice roll-in shower with a pull-down bench, but the amenities were again too high\" -> yes\n\n",
        ),
    }

    if category not in few_shot_prompts:
        raise ValueError(f"Unknown category: {category}")
    system_prompt, examples = few_shot_prompts[category]

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "assistant",
         "content": "Understood. I will respond only with 'yes' or 'no', one word."},
        {"role": "user",
         "content": examples + f"Now classify this review:\n\"{review_text}\""},
    ]

    # Send the assembled conversation to Ollama
    reply = ollama.chat(model=model, messages=messages, options={"temperature": 0})
    letters_only = re.sub(r'[^a-z]', '', reply["message"]["content"].strip().lower())
    return int(letters_only == 'yes')


def classify_all_categories(review):
    """Classify `review` once per entry of `classes`; results follow the order of `classes`."""
    return list(map(lambda category: classify_review_ollama(review, category), classes))

# Prediction: one task per review; executor.map returns results in input order
with ThreadPoolExecutor(max_workers=NUM_THREAD) as executor:
    y_pred = list(executor.map(classify_all_categories, df["review"]))

# Convert to a numpy matrix (rows = reviews, columns = classes)
y_pred = np.array(y_pred, dtype=int)

# Ground truth re-derived from `df`, i.e. in the same row order as the
# predictions above. The shared `y_true` may have been re-sorted by an earlier
# cell, which would silently misalign labels and predictions.
y_true = df[classes].values.astype(int)

# Metrics
for i, label in enumerate(classes):
    acc = accuracy_score(y_true[:, i], y_pred[:, i])
    prec = precision_score(y_true[:, i], y_pred[:, i], zero_division=0)
    rec = recall_score(y_true[:, i], y_pred[:, i], zero_division=0)
    f1 = f1_score(y_true[:, i], y_pred[:, i], zero_division=0)

    print(f"Label: {label}")
    print(f" Accuracy : {acc:.4f}")
    print(f" Precision: {prec:.4f}")
    print(f" Recall   : {rec:.4f}")
    print(f" F1-score : {f1:.4f}\n")


# Global metrics
prec_micro = precision_score(y_true, y_pred, average="micro", zero_division=0)
rec_micro  = recall_score(y_true, y_pred, average="micro", zero_division=0)
f1_micro   = f1_score(y_true, y_pred, average="micro", zero_division=0)

prec_macro = precision_score(y_true, y_pred, average="macro", zero_division=0)
rec_macro  = recall_score(y_true, y_pred, average="macro", zero_division=0)
f1_macro   = f1_score(y_true, y_pred, average="macro", zero_division=0)

print("Global metrics:")
print(f" Micro Precision: {prec_micro:.4f}, Recall: {rec_micro:.4f}, F1: {f1_micro:.4f}")
print(f" Macro Precision: {prec_macro:.4f}, Recall: {rec_macro:.4f}, F1: {f1_macro:.4f}")


## Evaluation using human validation

In [75]:
import pandas as pd
from pathlib import Path

# Root folder of the human-reviewed CSV files, one sub-folder per category
FINAL_PATH = Path("../../data/processed/final")
# Root folder of the validated pipeline outputs, one sub-folder per category
VALIDATED_PATH = Path("../../data/processed/data_validated")
# Sub-folder names iterated by the two loaders below
CATEGORIES = ["child", "pet", "handicap"]

def load_human_labels():
    """Load the human-reviewed CSV files of every category into one DataFrame.

    Reads every ``*.csv`` under ``FINAL_PATH / <category>``. Rows from a file
    whose stem contains "good" get ``human_truth = 1``, rows from any other
    file get ``0``.

    Returns:
        DataFrame with the columns ``id``, ``review``, ``human_truth`` and
        ``category``, files concatenated in sorted path order per category.

    Raises:
        KeyError: if a file lacks the ``id`` or ``review`` column.
        ValueError: (from ``pd.concat``) if no CSV file is found at all.
    """
    required = ["id", "review"]
    frames = []
    for cat in CATEGORIES:
        # sorted(): glob order depends on the filesystem, and the row order
        # matters downstream (drop_duplicates keeps the first occurrence).
        for file in sorted((FINAL_PATH / cat).glob("*.csv")):
            labels = pd.read_csv(file)
            missing = [col for col in required if col not in labels.columns]
            if missing:
                # Fail with the offending file instead of an anonymous KeyError
                raise KeyError(f"{file}: missing required column(s) {missing}")
            labels["human_truth"] = 1 if "good" in file.stem else 0
            labels["category"] = cat
            frames.append(labels[["id", "review", "human_truth", "category"]])
    return pd.concat(frames, ignore_index=True)

def load_validated_data():
    """Load every validated CSV file of every category into one DataFrame.

    Reads every ``*.csv`` under ``VALIDATED_PATH / <category>`` and sets the
    ``category`` column of each file to the name of its folder.

    Returns:
        The concatenation of all files, in sorted path order per category.

    Raises:
        ValueError: (from ``pd.concat``) if no CSV file is found at all.
    """
    frames = []
    for cat in CATEGORIES:
        # sorted(): glob order depends on the filesystem; keep runs reproducible
        for file in sorted((VALIDATED_PATH / cat).glob("*.csv")):
            frames.append(pd.read_csv(file).assign(category=cat))
    return pd.concat(frames, ignore_index=True)

human_df = load_human_labels()
validated_df = load_validated_data()

# Attach the pipeline outputs to each human-labelled review. Both frames carry
# a `review` column, hence the `_x` / `_y` suffixes after the merge.
merged = human_df.merge(validated_df, on=["id", "category"], how="left")
merged_unique = merged.drop_duplicates(subset=["id", "review_x","category"])
merged_unique = merged_unique.rename(columns={"review_x": "review"})
merged_unique = merged_unique.drop(columns=["review_y"])

final_cols = [
    "id", "review", "category", "human_truth", "validation_status",
    "llm_child", "llm_pet", "llm_handicap",
    "bert_child", "bert_pet", "bert_handicap",
    "kw_child", "kw_pet", "kw_handicap"
]

# .copy(): take ownership of the selected columns so the assignment below does
# not write into a slice of `merged_unique` (SettingWithCopyWarning).
final_dataset = merged_unique[final_cols].copy()
# Flatten multi-line reviews onto a single line
final_dataset["review"] = (
    final_dataset["review"]
    .str.replace("\n", " ", regex=False)
    .str.replace("\r", " ", regex=False)
)

print(f"NB REVIEWS : {len(final_dataset)}")

#final_dataset.to_csv("merged_final_dataset.csv", index=False)


NB REVIEWS : 2920


In [76]:
def compute_statistics(df: pd.DataFrame):
    """Summarise a review dataset by category and validation status.

    Args:
        df: DataFrame with at least the columns ``category`` and
            ``validation_status``.

    Returns:
        dict with the keys
        - ``total_reviews``: number of rows of `df`;
        - ``reviews_per_category``: count and proportion per category;
        - ``reviews_per_validation_status``: count and proportion per status;
        - ``reviews_per_status_per_category``: count per (category, status)
          pair and its proportion within the category.
    """
    stats = {}

    # 1. Total number of reviews
    stats["total_reviews"] = len(df)

    # 2. Number of reviews per category + proportion
    stats["reviews_per_category"] = pd.DataFrame({
        "count": df["category"].value_counts(),
        "proportion": df["category"].value_counts(normalize=True),
    })

    # 3. Number of reviews per validation_status + proportion
    stats["reviews_per_validation_status"] = pd.DataFrame({
        "count": df["validation_status"].value_counts(),
        "proportion": df["validation_status"].value_counts(normalize=True),
    })

    # 4. Number of reviews per validation_status within each category, with
    #    the proportion computed inside the category. `transform` keeps the
    #    (category, validation_status) index as is; `groupby(...).apply(...)`
    #    used to prepend a second, duplicated `category` level.
    per_status = (
        df.groupby(["category", "validation_status"])
          .size()
          .to_frame("count")
    )
    category_totals = per_status.groupby(level="category")["count"].transform("sum")
    per_status["proportion"] = per_status["count"] / category_totals
    stats["reviews_per_status_per_category"] = per_status

    return stats

In [77]:
def display_statistics(stats: dict):
    """Print the total and render each table of `stats` under its title.

    `stats` is expected to hold the keys produced by `compute_statistics`.
    The first two tables are shown sorted by index, the last one as stored.
    """
    print("=== STATISTICS ===\n")

    # Total number of reviews
    print(f"Total Reviews: {stats['total_reviews']}\n")

    # (title, key in `stats`, sort by index before display?)
    sections = (
        ("Reviews per Category:", 'reviews_per_category', True),
        ("Reviews per Validation Status:", 'reviews_per_validation_status', True),
        ("Reviews per Validation Status per Category:", 'reviews_per_status_per_category', False),
    )
    for title, key, sort_first in sections:
        print(title)
        table = stats[key]
        display(table.sort_index() if sort_first else table)
        print("\n")

In [78]:
# Compute the summary tables for the merged dataset, then render them
statistics = compute_statistics(final_dataset)
display_statistics(statistics)

=== STATISTICS ===

Total Reviews: 2920

Reviews per Category:


Unnamed: 0_level_0,count,proportion
category,Unnamed: 1_level_1,Unnamed: 2_level_1
child,1397,0.478425
handicap,755,0.258562
pet,768,0.263014




Reviews per Validation Status:


Unnamed: 0_level_0,count,proportion
validation_status,Unnamed: 1_level_1,Unnamed: 2_level_1
agreed,2222,0.760959
disputed,66,0.022603
llm_validated,632,0.216438




Reviews per Validation Status per Category:


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,proportion
category,category,validation_status,Unnamed: 3_level_1,Unnamed: 4_level_1
child,child,agreed,1070,0.765927
child,child,disputed,14,0.010021
child,child,llm_validated,313,0.224052
handicap,handicap,agreed,529,0.700662
handicap,handicap,disputed,22,0.029139
handicap,handicap,llm_validated,204,0.270199
pet,pet,agreed,623,0.811198
pet,pet,disputed,30,0.039062
pet,pet,llm_validated,115,0.14974






In [None]:
def _agrees_with_llm(df: pd.DataFrame, prefix: str) -> pd.Series:
    """True for rows where `<prefix>_<cat>` equals `llm_<cat>` for all three categories."""
    agrees = pd.Series(True, index=df.index)
    for cat in ("child", "pet", "handicap"):
        agrees &= df[f"{prefix}_{cat}"] == df[f"llm_{cat}"]
    return agrees


def _ratio(numerator, denominator) -> float:
    """Return numerator / denominator as a float, or NaN when the denominator is 0."""
    return float(numerator) / float(denominator) if denominator else float("nan")


def compute_pipeline_metrics(df: pd.DataFrame):
    """Compute per-layer metrics of the keywords -> BERT -> LLM pipeline.

    A layer (keywords or BERT) is counted as correct on a review when its three
    category flags equal the LLM flags and ``human_truth == 1``. Every
    "precision" and "gain" value is a fraction of ALL reviews in `df`.

    Args:
        df: DataFrame with the columns ``human_truth``, ``validation_status``
            and ``{kw,bert,llm}_{child,pet,handicap}``.

    Returns:
        dict of floats, plus the nested dict ``llm_preferences_counts``. Ratios
        whose denominator is 0 (empty `df`, or no review outside "agreed") are
        NaN.
    """
    metrics = {}

    total_reviews = len(df)
    human_ok = df["human_truth"] == 1
    kw_agrees = _agrees_with_llm(df, "kw")
    bert_agrees = _agrees_with_llm(df, "bert")

    # -------------------------
    # 1. First layer: Keywords
    # -------------------------
    kw_correct = kw_agrees & human_ok
    metrics["keywords_precision"] = _ratio(kw_correct.sum(), total_reviews)
    metrics["keywords_coverage"] = _ratio(total_reviews, total_reviews)

    # -------------------------
    # 2. Second layer: BERT validation
    # -------------------------
    bert_correct = bert_agrees & human_ok
    # Gain = reviews BERT gets right while keywords do not. "Keywords
    # incorrect" means a mismatch on at least one category; requiring a
    # mismatch on all three at once made this count always 0.
    bert_correct_kw_incorrect = bert_correct & ~kw_agrees
    metrics["bert_layer_precision"] = _ratio(bert_correct.sum(), total_reviews)
    metrics["bert_layer_gain"] = _ratio(bert_correct_kw_incorrect.sum(), total_reviews)
    metrics["bert_layer_coverage"] = _ratio(total_reviews, total_reviews)

    # -------------------------
    # 3. Third layer: LLM validation
    # -------------------------
    sent_to_llm = df["validation_status"].isin(["llm_validated", "disputed"])
    llm_correct = sent_to_llm & human_ok
    metrics["llm_layer_precision"] = _ratio(human_ok.sum(), total_reviews)
    metrics["llm_layer_coverage"] = _ratio(sent_to_llm.sum(), total_reviews)
    metrics['llm_layer_gain'] = _ratio(llm_correct.sum(), total_reviews)

    # LLM preferences among the reviews that are not "agreed"
    mask = df["validation_status"] != "agreed"
    n_not_agreed = mask.sum()
    n_agree_kw = (kw_agrees & mask).sum()
    n_agree_bert = (bert_agrees & mask).sum()
    metrics["llm_preferences_counts"] = {
        "agree_kw_percentage": _ratio(n_agree_kw, n_not_agreed),
        "agree_bert_percentage": _ratio(n_agree_bert, n_not_agreed),
        "disagree_both_percentage": _ratio(n_not_agreed - n_agree_kw - n_agree_bert, n_not_agreed),
    }

    # -------------------------
    # 4. Pipeline gains
    # -------------------------
    metrics["pipeline_added_value"] = _ratio(
        bert_correct_kw_incorrect.sum() + llm_correct.sum(), total_reviews
    )
    metrics["final_pipeline_precision"] = _ratio(human_ok.sum(), total_reviews)

    return metrics


def display_pipeline_metrics(metrics: dict):
    """Print the metrics returned by `compute_pipeline_metrics`, one layer per line.

    Ratios are rendered as percentages with three decimals. Entries of
    ``llm_preferences_counts`` that are not floats are printed as they are.
    """
    def pct(key):
        # Looked up lazily so values are read in the order they are printed.
        return f"{metrics[key]:.3%}"

    print("=== PIPELINE METRICS ===\n")
    print(f"Keywords layer: Layer Precision = {pct('keywords_precision')}, Coverage = {pct('keywords_coverage')}")
    print(f"BERT layer: Layer Precision = {pct('bert_layer_precision')}, Coverage = {pct('bert_layer_coverage')}, Layer Gain = {pct('bert_layer_gain')}")
    print(f"LLM layer: Layer Precision = {pct('llm_layer_precision')}, Coverage = {pct('llm_layer_coverage')}, Layer Gain = {pct('llm_layer_gain')}")

    print("\nLLM preferences when BERT and Keywords disagree:")
    for name, value in metrics['llm_preferences_counts'].items():
        if isinstance(value, float):
            print(f"  - {name}: {value:.3%}")
        else:
            print(f"  - {name}: {value}")

    print(f"\nFinal pipeline: Overall Precision = {pct('final_pipeline_precision')}")
    print(f"Final Gain of the pipeline: Gain = {pct('pipeline_added_value')}")

In [94]:
# Compute and print the per-layer pipeline metrics for the merged dataset
display_pipeline_metrics(compute_pipeline_metrics(final_dataset))

=== PIPELINE METRICS ===

Keywords layer: Layer Precision = 63.219%, Coverage = 100.000%
BERT layer: Layer Precision = 51.404%, Coverage = 100.000%, Layer Gain = 0.000%
LLM layer: Layer Precision = 63.904%, Coverage = 23.904%, Layer Gain = 12.500%

LLM preferences when BERT and Keywords disagree:
  - agree_kw_percentage: 90.544%
  - agree_bert_percentage: 6.734%
  - disagree_both_percentage: 2.722%

Final pipeline: Overall Precision = 63.904%
Final Gain of the pipeline: Gain = 12.500%
