---


In [1]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd

from tqdm.auto import tqdm

import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_recall_fscore_support,
    roc_auc_score,
)

import mlflow

ROOT = Path("..").resolve()
DATA_PATH = ROOT / "data"
OUT_PATH = ROOT / "out"
SCRIPTS_PATH = ROOT / "scripts"

sys.path.append(str(SCRIPTS_PATH))

from preprocessing import preprocess_bert

tqdm.pandas(desc="Preprocessing (BERT)")
pd.set_option("display.max_colwidth", 200)

mlflow.set_tracking_uri(f"file:{ROOT / 'mlruns'}")
mlflow.set_experiment("sentiment_airparadis_bert")

model_id = "answerdotai/ModernBERT-base"
device = "cuda" if torch.cuda.is_available() else "cpu"
device

  from .autonotebook import tqdm as notebook_tqdm


OSError: [WinError 1114] Une routine d’initialisation d’une bibliothèque de liens dynamiques (DLL) a échoué. Error loading "c:\Users\Gui\Desktop\AAA_doc\Openclassroom school\Python project\proj_proj\proj7\env2\Lib\site-packages\torch\lib\c10.dll" or one of its dependencies.

In [None]:
col_names = ["target", "ids", "date", "flag", "user", "text"]
df = pd.read_csv(
    DATA_PATH / "training.1600000.processed.noemoticon.csv",
    encoding="latin-1",
    header=None,
    names=col_names,
)

df["label"] = (df["target"] == 4).astype(int)

df = df.reset_index().rename(columns={"index": "row_id"})

split = pd.read_csv(OUT_PATH / "split.csv")

df = df.merge(split, left_on="row_id", right_on="ids", how="inner")

df[["row_id", "split"]].head(), len(df)

(   row_id  split
 0       0  train
 1       1  train
 2       2  train
 3       3   test
 4       4  train,
 1527316)

In [None]:
df["text_bert"] = df["text"].progress_apply(preprocess_bert)

df_train = df[df["split"] == "train"].copy()
df_test = df[df["split"] == "test"].copy()


train_sample_size = 50_000

if len(df_train) > train_sample_size:
    df_train = df_train.sample(train_sample_size, random_state=42)
    print(f"Train réduit à {len(df_train)} ModernBERT.")
else:
    print(f"Train complet utilisé ({len(df_train)} exemples).")


X_train_text = df_train["text_bert"].astype(str).tolist()
X_test_text = df_test["text_bert"].astype(str).tolist()
y_train = df_train["label"].values
y_test = df_test["label"].values

len(X_train_text), len(X_test_text)

Preprocessing (BERT): 100%|██████████| 1527316/1527316 [00:09<00:00, 161758.15it/s]


Train réduit à 50000 ModernBERT.


(50000, 305464)

In [None]:
from datasets import Dataset

tokenizer = AutoTokenizer.from_pretrained(model_id)

max_length = 64

train_df_hf = pd.DataFrame({"text": X_train_text, "label": y_train})
test_df_hf = pd.DataFrame({"text": X_test_text, "label": y_test})

train_ds = Dataset.from_pandas(train_df_hf)
test_ds = Dataset.from_pandas(test_df_hf)


def tokenize_function(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )


train_tokenized = train_ds.map(tokenize_function, batched=True)
test_tokenized = test_ds.map(tokenize_function, batched=True)


def prepare_for_torch(ds):
    cols_to_remove = [c for c in ["text", "__index_level_0__"] if c in ds.column_names]
    ds = ds.remove_columns(cols_to_remove)
    ds.set_format("torch")
    return ds


train_tokenized = prepare_for_torch(train_tokenized)
test_tokenized = prepare_for_torch(test_tokenized)


eval_size = 50_000

if len(test_tokenized) > eval_size:
    rng = np.random.default_rng(42)
    indices = rng.choice(len(test_tokenized), size=eval_size, replace=False)
    test_eval = test_tokenized.select(indices.tolist())
    print(
        f"Test réduit à {len(test_eval)} exemples pour ModernBERT (sur {len(test_tokenized)})."
    )
else:
    test_eval = test_tokenized
    print(f"Test complet utilisé ({len(test_eval)} exemples).")


train_tokenized[0]

Map: 100%|██████████| 50000/50000 [00:02<00:00, 16868.68 examples/s]
Map: 100%|██████████| 305464/305464 [00:14<00:00, 20766.33 examples/s]

Test réduit à 50000 exemples pour ModernBERT (sur 305464).





{'label': tensor(1),
 'input_ids': tensor([50281,  1147,   434,  9902,   272,   751,   330,  5214,    59,    58,
           987,  1024,     2,   309,  2389,   352,  1969,   816,  3524,   627,
           403,   642, 15070,  8585,     2, 50282, 50283, 50283, 50283, 50283,
         50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283,
         50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283,
         50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283, 50283,
         50283, 50283, 50283, 50283]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}

In [None]:
num_labels = 2

model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=num_labels
).to(device)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary"
    )

    if logits.shape[1] == 2:
        proba_1 = torch.softmax(torch.tensor(logits), dim=1)[:, 1].numpy()
        roc_auc = roc_auc_score(labels, proba_1)
    else:
        roc_auc = float("nan")

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "roc_auc": roc_auc,
    }

In [None]:
from transformers import TrainingArguments, Trainer

batch_size = 512
epochs = 3

training_args = TrainingArguments(
    output_dir=str(OUT_PATH / "modernbert"),
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=500,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_eval,  # testing (test_tokenized pour les 300k)
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
with mlflow.start_run(run_name="modernbert_base"):

    mlflow.log_param("model_id", model_id)
    mlflow.log_param("max_length", max_length)
    mlflow.log_param("batch_size", batch_size)
    mlflow.log_param("epochs", epochs)

    trainer.train()

    eval_metrics = trainer.evaluate()

    for k, v in eval_metrics.items():
        if isinstance(v, (int, float, np.floating)):
            mlflow.log_metric(f"test_{k}", float(v))

    save_dir = OUT_PATH / "modernbert_model"
    trainer.save_model(str(save_dir))
    tokenizer.save_pretrained(str(save_dir))

    mlflow.log_artifacts(str(save_dir), artifact_path="model")

eval_metrics

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': None, 'bos_token_id': None}.


2025/12/01 15:39:01 ERROR mlflow.utils.async_logging.async_logging_queue: Run Id 3c865f091fc0404ab81151673c8effa7: Failed to log run data: Exception: Changing param values is not allowed. Param with key='max_length' was already logged with value='64' for run ID='3c865f091fc0404ab81151673c8effa7'. Attempted logging new value '20'.


Step,Training Loss




{'eval_loss': 0.35788649320602417,
 'eval_accuracy': 0.84474,
 'eval_precision': 0.842803865741669,
 'eval_recall': 0.8454143201930813,
 'eval_f1': 0.8441070747233769,
 'eval_roc_auc': 0.9217781821637926,
 'eval_runtime': 1961.3742,
 'eval_samples_per_second': 25.492,
 'eval_steps_per_second': 0.05,
 'epoch': 3.0}