In [12]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from transformers import default_data_collator

In [13]:
# Charger les données
train_df = pd.read_csv("../preprocessing/train.csv")
test_df = pd.read_csv("../preprocessing/test.csv")

# Nettoyage des données
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

In [14]:
# Tokenizer de DistilBERT
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

In [15]:
class MemeDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = tokenizer(text, padding="max_length", truncation=True, max_length=64, return_tensors="pt")

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }

In [16]:
# Vérifier si CUDA est dispo, sinon forcer CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Préparer les datasets
train_dataset = MemeDataset(train_df["text"].tolist(), train_df["label"].tolist())
test_dataset = MemeDataset(test_df["text"].tolist(), test_df["label"].tolist())

# Charger le modèle DistilBERT (sur le device détecté)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
model.to(device)

print(train_dataset[0])

cuda


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda:0
{'input_ids': tensor([ 101, 6501, 6501, 5831, 2361,  102,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'labels': tensor(0)}


In [18]:
# Configuration de l'entraînement optimisée pour CPU
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,  # Réduire à 1 époque pour éviter un entraînement trop long
    per_device_train_batch_size=4,  # Réduire pour éviter saturation mémoire
    per_device_eval_batch_size=4,
    warmup_steps=0,
    weight_decay=0.01,
    logging_dir="./logs",
    eval_strategy="epoch",
    save_strategy="epoch",
    use_cpu=True if device == "cpu" else False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=default_data_collator  # Ajout de cette ligne
)

# Entraînement du modèle (cela prendra du temps sur CPU)
trainer.train()

cuda:0
cuda:0


Epoch,Training Loss,Validation Loss
1,0.539,1.034594
2,0.4206,1.351465
3,0.2641,1.779183
4,0.1644,1.987231
5,0.0787,2.318635


TrainOutput(global_step=12500, training_loss=0.2816547351074219, metrics={'train_runtime': 512.672, 'train_samples_per_second': 97.528, 'train_steps_per_second': 24.382, 'total_flos': 827921241600000.0, 'train_loss': 0.2816547351074219, 'epoch': 5.0})

In [19]:
# Évaluation
results = trainer.evaluate()
print(results)

# Sauvegarde du modèle
model.save_pretrained("distilbert-misogyny-detector")
tokenizer.save_pretrained("distilbert-misogyny-detector")

{'eval_loss': 2.3186354637145996, 'eval_runtime': 1.7964, 'eval_samples_per_second': 556.68, 'eval_steps_per_second': 139.17, 'epoch': 5.0}


('distilbert-misogyny-detector\\tokenizer_config.json',
 'distilbert-misogyny-detector\\special_tokens_map.json',
 'distilbert-misogyny-detector\\vocab.txt',
 'distilbert-misogyny-detector\\added_tokens.json')

In [21]:
import evaluate

# Charger la métrique d'accuracy
metric = evaluate.load("accuracy")
# Fonction de calcul des métriques pour Trainer
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Réassigner la fonction au Trainer
trainer.compute_metrics = compute_metrics

# Évaluer le modèle sur le dataset de test
results = trainer.evaluate()
print("Résultats de l'évaluation :", results)

Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<?, ?B/s]


Résultats de l'évaluation : {'eval_loss': 2.3186354637145996, 'eval_accuracy': 0.649, 'eval_runtime': 2.7041, 'eval_samples_per_second': 369.815, 'eval_steps_per_second': 92.454, 'epoch': 5.0}


In [24]:
import numpy as np

# Obtenir les prédictions sur le test set
predictions = trainer.predict(test_dataset)
logits = predictions.predictions
y_pred = np.argmax(logits, axis=-1)

# Afficher la distribution des classes prédites
unique, counts = np.unique(y_pred, return_counts=True)
class_distribution = dict(zip(unique, counts))

print("Distribution des prédictions :", class_distribution)


AttributeError: 'dict' object has no attribute 'tolist'

In [23]:
# Compter le nombre d'exemples dans chaque classe dans le dataset d'entraînement
train_label_counts = train_df["label"].value_counts()
print("Répartition des labels dans le dataset d'entraînement :\n", train_label_counts)


Répartition des labels dans le dataset d'entraînement :
 label
0    5000
1    5000
Name: count, dtype: int64
