In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from transformers import default_data_collator

In [2]:
# Charger les données
train_df = pd.read_csv("../preprocessing/train.csv")
test_df = pd.read_csv("../preprocessing/test.csv")

# Nettoyage des données
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

In [3]:
# Tokenizer de DistilBERT
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

In [4]:
class MemeDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = tokenizer(text, padding="max_length", truncation=True, max_length=64, return_tensors="pt")

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }

In [5]:
# Vérifier si CUDA est dispo, sinon forcer CPU
device = torch.device("cpu")

# Préparer les datasets
train_dataset = MemeDataset(train_df["text"].tolist(), train_df["label"].tolist())
test_dataset = MemeDataset(test_df["text"].tolist(), test_df["label"].tolist())

# Charger le modèle DistilBERT (sur CPU)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
model.to(device)

print(train_dataset[0])

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'input_ids': tensor([ 101, 6501, 6501, 5831, 2361,  102,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'labels': tensor(0)}


In [7]:
# Configuration de l'entraînement optimisée pour CPU
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,  # Réduire à 1 époque pour éviter un entraînement trop long
    per_device_train_batch_size=4,  # Réduire pour éviter saturation mémoire
    per_device_eval_batch_size=4,
    warmup_steps=0,
    weight_decay=0.01,
    logging_dir="./logs",
    eval_strategy="epoch",
    save_strategy="epoch",
    use_cpu=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=default_data_collator  # Ajout de cette ligne
)

# Entraînement du modèle (cela prendra du temps sur CPU)
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.5443,1.044397
2,0.4744,1.267614
3,0.2659,1.714216


TrainOutput(global_step=7500, training_loss=0.42650692036946614, metrics={'train_runtime': 5765.1255, 'train_samples_per_second': 5.204, 'train_steps_per_second': 1.301, 'total_flos': 496752744960000.0, 'train_loss': 0.42650692036946614, 'epoch': 3.0})

In [8]:
# Évaluation
results = trainer.evaluate()
print(results)

# Sauvegarde du modèle
model.save_pretrained("distilbert-misogyny-detector")
tokenizer.save_pretrained("distilbert-misogyny-detector")

{'eval_loss': 1.7142155170440674, 'eval_runtime': 32.0172, 'eval_samples_per_second': 31.233, 'eval_steps_per_second': 7.808, 'epoch': 3.0}


('distilbert-misogyny-detector\\tokenizer_config.json',
 'distilbert-misogyny-detector\\special_tokens_map.json',
 'distilbert-misogyny-detector\\vocab.txt',
 'distilbert-misogyny-detector\\added_tokens.json')

In [9]:
import evaluate

# Charger la métrique d'accuracy
metric = evaluate.load("accuracy")
# Fonction de calcul des métriques pour Trainer
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Réassigner la fonction au Trainer
trainer.compute_metrics = compute_metrics

# Évaluer le modèle sur le dataset de test
results = trainer.evaluate()
print("Résultats de l'évaluation :", results)

Résultats de l'évaluation : {'eval_loss': 1.7142155170440674, 'eval_accuracy': 0.644, 'eval_runtime': 30.5757, 'eval_samples_per_second': 32.706, 'eval_steps_per_second': 8.176, 'epoch': 3.0}


In [10]:
import numpy as np

# Obtenir les prédictions sur le test set
predictions = trainer.predict(test_dataset)
logits = predictions.predictions
y_pred = np.argmax(logits, axis=-1)

# Afficher la distribution des classes prédites
unique, counts = np.unique(y_pred, return_counts=True)
class_distribution = dict(zip(unique, counts))

print("Distribution des prédictions :", class_distribution)


Distribution des prédictions : {np.int64(0): np.int64(302), np.int64(1): np.int64(698)}


In [14]:
# Compter le nombre d'exemples dans chaque classe dans le dataset d'entraînement
train_label_counts = train_df["label"].value_counts()
print("Répartition des labels dans le dataset d'entraînement :\n", train_label_counts)


Répartition des labels dans le dataset d'entraînement :
 label
0    5000
1    5000
Name: count, dtype: int64
