# Benchmarking [GPT-o1-mini](https://openai.com/index/introducing-o3-and-o4-mini/)

**Note:** benchmarking the o1-mini, o3-mini and o4-mini models currently fails.

## Libraries

In [35]:
import openai
from time import sleep 
from pathlib import Path
import numpy as np
import json
import os
import pandas as pd
from tqdm.rich import tqdm
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    precision_score, recall_score, f1_score, accuracy_score
)
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.text import Text
import matplotlib.pyplot as plt
import warnings
from google.api_core.exceptions import ServiceUnavailable
from google.api_core import exceptions as genai_errors
import random
from concurrent.futures import ThreadPoolExecutor, as_completed

# Register tqdm's pandas integration with a custom progress-bar description.
tqdm.pandas(desc="Fetching moderation scores")
# Shared rich console used for all formatted output in this notebook
# (the globals cell assigns `console` again).
console = Console()
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas / sklearn — confirm this is intended.
warnings.filterwarnings("ignore")

## Global variables

In [36]:
# Project layout: the data directory sits one level above the notebook directory.
ROOT = Path("..")
DATA_DIR = ROOT / "data"
# Input benchmark and the CSV the predictions are exported to.
BENCHMARK_PATH = DATA_DIR / "benchmark" / "benchmark_balanced_subset.csv"
output_path = DATA_DIR / "benchmark" / "o1-mini.csv"
# The API key is read from a file rather than being hard-coded in the notebook.
API_KEY_PATH = DATA_DIR / "confidential" / "GPT_API.txt"
console = Console()
# Model name sent in every request and used in the batch/answers file names.
model = "o1-mini"
# Prompt templates live next to the notebook. They are decoded explicitly as
# UTF-8 (like the benchmark CSV and the batch file) instead of relying on the
# platform default encoding.
# NOTE(review): assumes both prompt files are saved as UTF-8 — confirm.
system_prompt = (Path(".") / "API_SYSTEM_PROMPT.txt").read_text(encoding="utf-8").strip()
prompt = (Path(".") / "API_PROMPT.txt").read_text(encoding="utf-8").strip()

In [37]:
# Point both HTTP and HTTPS traffic of this process at the proxy listening on
# 127.0.0.1:1080.
os.environ.update(
    HTTP_PROXY="socks5h://127.0.0.1:1080",
    HTTPS_PROXY="socks5h://127.0.0.1:1080",
)

In [38]:
# Load the API key from API_KEY_PATH (surrounding whitespace/newline stripped)
# and set it on the openai module.
openai.api_key = API_KEY_PATH.read_text().strip()

## Load dataset

In [39]:
# Read the benchmark, discard rows lacking a message or a label, and store
# labels as integers.
df = pd.read_csv(BENCHMARK_PATH, encoding="utf-8").dropna(subset=["content", "label"])
df["label"] = df["label"].astype(int)

# Per-label row counts as a two-column frame: label, count.
label_counts = df["label"].value_counts().rename_axis("label").reset_index(name="count")

# Render the total and the per-label counts as a rich table.
table = Table(title="Label Counts", show_lines=True)
for header, colour in (("Label", "cyan"), ("Count", "yellow")):
    table.add_column(header, justify="center", style=colour)
table.add_row("Total Rows", str(len(df)))
for label_value, n_rows in zip(label_counts["label"], label_counts["count"]):
    table.add_row(str(label_value), str(n_rows))
console.print(table, justify="left")

# Reproducible peek at five rows.
df.sample(5, random_state=42)

Unnamed: 0,msg_id,content,label
650,anon_msg_5ff0a5549f65,"des bovins mon pote, j'ai jamais vu ça sauf en...",0
208,anon_msg_2ea521264d58,Cet accent du sud dégueulasse,1
427,anon_msg_99e41015a911,"Joffrin, je serais capable de faire des trucs ...",1
677,anon_msg_31edc839b8b8,"D'accord le julf, mais vous n'êtes pas BLANC.",1
1272,anon_msg_15c39b3b7e9a,C'est Siemens Espagne hein... Pas le PDG du gr...,0


## Define prediction function

In [40]:
def create_batch_file(
        text: str,
        MODEL: str,
        max_tokens: int = 20,
    ):
    """Build the Chat Completions request body for a single message.

    The user message is the module-level ``prompt`` followed by the quoted
    ``text`` and the closing question; the module-level ``system_prompt``
    carries the instructions.

    Args:
        text: Message to classify.
        MODEL: Model name placed in the request body.
        max_tokens: Upper bound on generated tokens. For o-series models this
            is sent as ``max_completion_tokens``.

    Returns:
        dict: The request body (``model``, ``messages`` and the token limit).
    """
    import re  # local import so this cell does not depend on an extra top-level import

    input_text = prompt + f"« {text} »" + "\n Ce message est-il toxique ?\n"

    # o-series reasoning models (o1, o1-mini, o3-mini, o4-mini, ...) reject the
    # legacy `max_tokens` parameter and expect `max_completion_tokens`.
    # NOTE(review): per the API reference that budget also counts reasoning
    # tokens, so the default of 20 may leave no room for a visible answer.
    if re.match(r"o\d", MODEL):
        token_limit_key = "max_completion_tokens"
    else:
        token_limit_key = "max_tokens"

    # o1-mini / o1-preview do not accept the `system` role, so the
    # instructions are folded into the user message for those models.
    if MODEL.startswith(("o1-mini", "o1-preview")):
        messages = [
            {"role": "user", "content": f"{system_prompt}\n\n{input_text}"}
        ]
    else:
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": input_text}
        ]

    return {
        "model": MODEL,
        "messages": messages,
        token_limit_key: max_tokens
        }

In [41]:
def create_json_batch(df, MODEL, batch_path):
    """Write one batch request per distinct message to a JSONL file.

    Rows are visited in frame order; only the first occurrence of a given
    ``content`` value produces a request, and its ``custom_id`` is built from
    that row's index label.

    Args:
        df: Frame with a ``content`` column.
        MODEL: Model name forwarded to ``create_batch_file``.
        batch_path: Destination path; missing parent directories are created.
    """
    seen_texts = set()
    requests = []

    for idx, message in df["content"].items():
        # Skip messages that already have a request.
        if message in seen_texts:
            continue
        seen_texts.add(message)

        requests.append({
            "custom_id": f"id_{idx}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": create_batch_file(text=message, MODEL=MODEL),
        })

    batch_path.parent.mkdir(parents=True, exist_ok=True)
    with batch_path.open("w", encoding="utf-8") as handle:
        for request in requests:
            handle.write(json.dumps(request) + "\n")
    console.print(f"Batch file written to {batch_path}")

In [42]:
def upload_batch(batch_path) -> str:
    """Upload the batch file with purpose ``"batch"`` and return its file ID.

    Args:
        batch_path: Path of the JSONL batch file to upload.

    Returns:
        str: ID of the uploaded file.
    """
    # The context manager closes the handle even when the upload raises.
    with open(batch_path, "rb") as file:
        uploaded = openai.files.create(file=file, purpose="batch")
    uploaded_file_id = uploaded.id
    console.print(f"[green]✔ Uploaded batch file. ID: [bold]{uploaded_file_id}[/bold]")
    return uploaded_file_id

In [43]:
def submit_batch(uploaded_file_id) -> str:
    """Create a 24h-window chat-completions batch from an uploaded file.

    Args:
        uploaded_file_id: ID of the uploaded batch file.

    Returns:
        str: ID of the created batch.

    Raises:
        ValueError: If ``uploaded_file_id`` is empty or ``None``.
    """
    if uploaded_file_id:
        batch = openai.batches.create(
            completion_window="24h",
            endpoint="/v1/chat/completions",
            input_file_id=uploaded_file_id,
        )
        console.print(f"[green]✔ Batch submitted. ID: [bold]{batch.id}[/bold]")
        return batch.id
    raise ValueError("Batch file not uploaded. Call upload_batch() first.")

In [44]:
def wait_for_completion(batch_id, poll_interval: int = 60):
    """Poll a batch until it reaches a terminal status and return it.

    Args:
        batch_id: ID of the submitted batch; must not be ``None``.
        poll_interval: Seconds to sleep between two status checks.

    Returns:
        The batch object as last retrieved. Its ``status`` is one of
        ``completed``, ``failed``, ``expired`` or ``cancelled``.
    """
    assert batch_id is not None, "No batch submitted."
    # `cancelled` is terminal too: without it a cancelled batch would be
    # polled forever.
    terminal_statuses = ("completed", "failed", "expired", "cancelled")
    elapsed = 0
    while True:
        batch = openai.batches.retrieve(batch_id)
        console.print(f"[yellow] Batch status (after {elapsed}s): [bold]{batch.status}[/bold]")
        if batch.status in terminal_statuses:
            return batch
        sleep(poll_interval)
        elapsed += poll_interval

In [45]:
def download_and_parse_results(batch, output_path) -> pd.DataFrame:
    """Download a completed batch's output file and extract the answers.

    The raw output is written unchanged to ``output_path``. Each output line
    is parsed as JSON; lines that report an error or carry no choices are
    printed and skipped.

    Args:
        batch: Batch object exposing ``status`` and ``output_file_id``.
        output_path: Path the raw JSONL output is written to.

    Returns:
        pd.DataFrame: Columns ``id`` (the request's ``custom_id``) and
        ``conversation`` (the stripped message content, ``""`` if absent).

    Raises:
        RuntimeError: If the batch is not ``completed`` or has no output file.
    """
    if batch.status != "completed":
        raise RuntimeError(f"Batch did not complete: {batch.status}")

    output_id = batch.output_file_id
    if not output_id:
        # NOTE(review): presumably happens when every request failed and only
        # an error file exists — confirm against the Batch API docs.
        raise RuntimeError(
            "Batch completed without an output file "
            f"(error_file_id={getattr(batch, 'error_file_id', None)})"
        )
    content = openai.files.content(output_id).read()
    output_path.write_bytes(content)

    results = []
    for line in content.decode("utf-8").splitlines():
        if not line.strip():
            continue
        data = json.loads(line)
        custom_id = data.get("custom_id")
        if data.get("error"):
            console.print(f"[red] Error for {custom_id}: {data['error']}")
            continue
        # A response may lack choices (e.g. a non-success body); report it
        # instead of failing on a missing key.
        body = (data.get("response") or {}).get("body") or {}
        choices = body.get("choices") or []
        if not choices:
            console.print(f"[red] Error for {custom_id}: {body.get('error', 'no choices in response')}")
            continue
        message = choices[0].get("message") or {}
        # `content` can be null; treat that as an empty answer.
        answer = (message.get("content") or "").strip()
        results.append({
            "id": custom_id,
            "conversation": answer,
        })
    # Fixed columns keep the frame's shape stable even when nothing parsed.
    return pd.DataFrame(results, columns=["id", "conversation"])

In [46]:
def pipeline_for(df):
    """Run the whole batch workflow for one frame and return the answers.

    Steps: write the JSONL batch file, upload it, submit the batch, poll
    until it reaches a terminal status, then download and parse the output.
    File names are derived from the module-level ``model``.

    Args:
        df: Frame with the messages to classify.

    Returns:
        The frame produced by ``download_and_parse_results``.
    """
    batch_file = Path(f"batch_{model}.jsonl")
    answers_file = Path(f"answers_{model}.jsonl")

    create_json_batch(df, model, batch_file)
    file_id = upload_batch(batch_file)
    finished_batch = wait_for_completion(submit_batch(file_id))
    return download_and_parse_results(finished_batch, answers_file)

In [47]:
# Run the whole batch workflow (write, upload, submit, poll, download) on the
# full dataset in a single batch.
answers = pipeline_for(df)

KeyboardInterrupt: 

In [41]:
# Display the answers collected so far.
answers

[         id conversation
 0    id_139      toxique
 1    id_140  non-toxique
 2    id_141      toxique
 3    id_142      toxique
 4    id_143      toxique
 ..      ...          ...
 134  id_273  non-toxique
 135  id_274      toxique
 136  id_275  non-toxique
 137  id_276  non-toxique
 138  id_277      toxique
 
 [139 rows x 2 columns]]

In [45]:
N = 10
# Cut the frame row-wise into N chunks of near-equal size.
dfs = np.array_split(df, N)

# Run each chunk as its own batch. Results are appended one by one, so the
# chunks already processed stay in `answers` if the loop is interrupted.
answers = []
for chunk in tqdm(dfs, desc="Processing batches"):
    chunk_answers = pipeline_for(chunk)
    answers.append(chunk_answers)

Output()

KeyboardInterrupt: 

## Run prediction

In [None]:
# List of texts to classify
# Texts to classify, in the frame's row order
texts = df["content"].tolist()
# Pre-sized so each result is stored at the position of its input text.
# `as_completed` yields futures in completion order, not submission order, so
# appending would misalign the results with the rows of `df`.
results = [None] * len(texts)

# NOTE(review): `safe_predict` is not defined in the visible cells — confirm
# where it comes from before running this cell.
# You can tune max_workers (4–8 is often a good balance)
with ThreadPoolExecutor(max_workers=6) as executor:
    future_to_index = {
        executor.submit(safe_predict, t): position
        for position, t in enumerate(texts)
    }
    for future in tqdm(as_completed(future_to_index), total=len(texts)):
        results[future_to_index[future]] = future.result()

Output()

In [None]:
# Attach the predictions as a new column.
# NOTE(review): assumes `results` has one entry per row, in the same order as
# the rows of `df` — confirm against the cell that fills `results`.
df["toxicity_score"] = results

In [31]:
# Show five reproducibly sampled rows with their predicted score and label.
for i, row in df.sample(5, random_state=42).iterrows():
    toxicity = f"[yellow]Toxicity Score:[/yellow] [bold]{int(row['toxicity_score'])}[/bold]"
    label = f"[cyan]Label:[/cyan] [bold]{row['label']}[/bold]"
    # The message stays a plain `Text`, so brackets in user content are never
    # parsed as rich markup and the bold style is kept. Only the two lines
    # built above are parsed as markup.
    body = Text.assemble(
        Text(row['content'], style="bold"),
        "\n\n",
        Text.from_markup(toxicity),
        "\n",
        Text.from_markup(label),
    )
    panel = Panel.fit(
        body,
        title=f"Exemple {i+1}",
        border_style="magenta"
    )
    console.print(panel)

## Metrics & Report        

| Metric                     | Formula                                           | Interpretation                                                                                                       |
| -------------------------- | ------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
| **Precision**              | `TP / (TP + FP)`                                  | Of the samples predicted **toxic**, how many were **actually toxic**? <br>→ High precision = **low false positives** |
| **Recall** *(Sensitivity)* | `TP / (TP + FN)`                                  | Of the **actual toxic** samples, how many did we **correctly identify**? <br>→ High recall = **low false negatives** |
| **F1-score**               | `2 * (Precision * Recall) / (Precision + Recall)` | Harmonic mean of precision and recall. <br>→ Best when **balance** is needed                                         |
| **Accuracy**               | `(TP + TN) / (TP + TN + FP + FN)`                 | Fraction of all correct predictions (toxic and non-toxic). <br>→ Can be misleading on imbalanced data                |
| **ROC AUC**                | Area under the ROC Curve                          | Measures the **ranking ability** of the classifier. <br>→ Higher = better separation of toxic vs. non-toxic          |


In [32]:
# Integer predictions and ground-truth labels, both taken from `df`.
y_pred = df["toxicity_score"].astype(int)
y_true = df["label"]

In [33]:
# Classification report: one table row per class
report = classification_report(y_true, y_pred, digits=3, output_dict=True)
table = Table(title="Classification Report", show_lines=True)
table.add_column("Classe", style="cyan", justify="center")
table.add_column("Precision", justify="center")
table.add_column("Recall", justify="center")
table.add_column("F1-score", justify="center")
table.add_column("Support", justify="center")

for class_name, metrics in report.items():
    # Aggregate entries are skipped; accuracy gets its own row below.
    if class_name in ["accuracy", "macro avg", "weighted avg"]:
        continue
    table.add_row(
        str(class_name),
        f"{metrics['precision']:.3f}",
        f"{metrics['recall']:.3f}",
        f"{metrics['f1-score']:.3f}",
        f"{int(metrics['support'])}"
    )
# Accuracy row
table.add_row(
    "[bold yellow]Accuracy[/bold yellow]",
    "-",
    "-",
    f"{report['accuracy']:.3f}",
    "-"
)
console.print(table)

# Confusion matrix. `labels=[0, 1]` forces a 2x2 matrix, so the cell lookups
# below stay valid even when only one class is present.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
cm_table = Table(title="Matrice de Confusion", show_lines=True)
cm_table.add_column(" ", style="bold")
cm_table.add_column("Prédit 0", justify="center")
cm_table.add_column("Prédit 1", justify="center")
cm_table.add_row("Réel 0", str(cm[0, 0]), str(cm[0, 1]))
cm_table.add_row("Réel 1", str(cm[1, 0]), str(cm[1, 1]))
console.print(cm_table)

# ROC AUC. Only invalid-input errors are caught, so interrupts and unrelated
# failures are no longer swallowed.
try:
    auc = roc_auc_score(y_true, df["toxicity_score"])
    console.print(Panel(f"[bold green]ROC AUC Score : {auc:.3f}[/bold green]", title="ROC AUC"))
except (ValueError, TypeError):
    console.print(Panel("[bold red]Impossible de calculer le ROC AUC.[/bold red]", title="ROC AUC"))

## Save the predictions

In [34]:
# Store the scores as integers before exporting.
df['toxicity_score'] = df['toxicity_score'].astype(int)
# Display the resulting frame.
df

Unnamed: 0,msg_id,content,label,toxicity_score
0,anon_msg_468ffd36870c,AYAAAAAAAAAAAA LES CRACKHEAD ANTI MIGRANT BORD...,1.0,1
1,anon_msg_c30ce8c67bac,Bordel le gars chantait 'FN souffrance' alors ...,1.0,1
2,anon_msg_a7be822bc051,Je vois que ça ddb en masse\nComme en dictatur...,1.0,1
3,anon_msg_8a2c7da3ff85,vous tenez là ? persoent c'est chaud depuis hi...,1.0,1
4,anon_msg_b20f2e8b51ed,Le racisme est devenu le crime ultime en Occid...,0.0,1
...,...,...,...,...
1383,anon_msg_2f85a6bc5bea,"Hello, si on est myope pour bosser dans l'admi...",0.0,0
1384,anon_msg_46d5b76c86c7,« L’odeur de ta chatte nous enivre\n»,1.0,1
1385,anon_msg_6ea0073b844a,Le pouvoir que les médias ont sur les PNJ\n:ri...,0.0,1
1386,anon_msg_9e2042983c75,Élimine du survivor j’arrête les survivor de g...,0.0,1


In [36]:
# Export the frame to `output_path` as UTF-8 CSV; the row index is not written.
df.to_csv(output_path, index=False, encoding="utf-8")