In [2]:
import torch
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [4]:
df = pd.read_csv("../Data/dataset_memes_rich.csv")  # Dataset avec colonnes "text" et "label"
# Vérification du déséquilibre des classes
print(df["label"].value_counts())

# Séparation en train/dev/test (80% - 10% - 10%)
train_texts, temp_texts, train_labels, temp_labels = train_test_split(df["text"], df["label"], test_size=0.2, random_state=42)
val_texts, test_texts, val_labels, test_labels = train_test_split(temp_texts, temp_labels, test_size=0.5, random_state=42)

# Conversion en Dataset Hugging Face
train_data = Dataset.from_dict({"text": train_texts.tolist(), "label": train_labels.tolist()})
val_data = Dataset.from_dict({"text": val_texts.tolist(), "label": val_labels.tolist()})
test_data = Dataset.from_dict({"text": test_texts.tolist(), "label": test_labels.tolist()})

label
1    500
0    500
Name: count, dtype: int64


In [5]:
MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

train_data = train_data.map(tokenize_function, batched=True)
val_data = val_data.map(tokenize_function, batched=True)
test_data = test_data.map(tokenize_function, batched=True)

# Formatage pour PyTorch
train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_data.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_data.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Map: 100%|██████████| 800/800 [00:00<00:00, 3293.71 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 4037.33 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 6345.20 examples/s]


In [6]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
class CustomTrainer(Trainer):
    def __init__(self, *args, train_dataset=None, **kwargs):
        super().__init__(*args, train_dataset=train_dataset, **kwargs)

        # Calcul du nombre de classes
        labels = [example["label"] for example in train_dataset]  # Récupérer les labels
        self.num_pos = sum(labels)  # Nombre de mèmes misogynes
        self.num_neg = len(labels) - self.num_pos  # Nombre de mèmes non misogynes

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels").float()  # Convertir labels en float pour BCEWithLogitsLoss
        outputs = model(**inputs)
        logits = outputs.logits.squeeze(-1)  # Réduire à [batch_size]

        # Gestion du déséquilibre des classes
        pos_weight = torch.tensor([self.num_neg / self.num_pos]).to(logits.device)  # Ratio des classes

        loss_fct = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,  # Augmentation du nombre d'époques
    learning_rate=5e-5,  # Learning rate ajusté
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
)



In [8]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = (torch.sigmoid(torch.tensor(logits)).numpy() > 0.5).astype(int)  # Seuil à 0.5
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="binary")
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [10]:
import os
os.environ["WANDB_DISABLED"] = "true"

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,  # ✅ On passe `train_dataset` pour que `num_neg` et `num_pos` soient calculés
    eval_dataset=val_data,
    compute_metrics=compute_metrics,  # ✅ On garde compute_metrics
)

trainer.train()


KeyboardInterrupt: 

In [14]:
results = trainer.evaluate(test_data)
print(results)

{'eval_loss': 1.0102797746658325, 'eval_accuracy': 0.5714285714285714, 'eval_f1': 0.5714285714285714, 'eval_precision': 0.4, 'eval_recall': 1.0, 'eval_runtime': 3.1319, 'eval_samples_per_second': 2.235, 'eval_steps_per_second': 0.319, 'epoch': 5.0}


In [None]:
model.save_pretrained("meme_misogyny_classifier")
tokenizer.save_pretrained("meme_misogyny_classifier")

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("meme_misogyny_classifier")
tokenizer = AutoTokenizer.from_pretrained("meme_misogyny_classifier")

In [None]:
def predict_misogyny(text):
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True)
    outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=-1).item()
    return "Misogyne" if prediction == 1 else "Non misogyne"

sample_text = "The meme represents a panda, we can read this text on it: 'Women are bad at driving'."
print(predict_misogyny(sample_text))