In [3]:
import torch
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
# Load the meme dataset; it must provide a "text" column and a "label" column.
df = pd.read_csv("dataset_memes.csv")

# One seed shared by both splits so the partition is reproducible.
SPLIT_SEED = 42

# Train / validation / test split (80% / 10% / 10%).
# Both splits are stratified on the label so that every subset keeps the class
# proportions of the data it was drawn from; without this, the small
# validation and test subsets can end up with a skewed class balance, which
# makes the metrics measured on them unreliable.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=df["label"],
)
# The held-out 20% is cut in half: one half for validation, one for testing.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=SPLIT_SEED,
    stratify=temp_labels,
)

In [None]:
def _as_dataset(texts, labels):
    """Build a `Dataset` with "text" and "label" columns.

    Both arguments must expose `.tolist()`; the resulting lists become the
    two columns of the returned dataset.
    """
    columns = {"text": texts.tolist(), "label": labels.tolist()}
    return Dataset.from_dict(columns)


# One `Dataset` per split, built from the matching texts/labels pair.
train_data = _as_dataset(train_texts, train_labels)
val_data = _as_dataset(val_texts, val_labels)
test_data = _as_dataset(test_texts, test_labels)


In [None]:
# Name of the pretrained checkpoint used for both the tokenizer and the model.
MODEL_NAME = "bert-base-uncased"  # Can be swapped for RoBERTa, DeBERTa, ...
# Tokenizer loaded from the checkpoint named by MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    """Tokenize the "text" entry of a batch of examples.

    Args:
        examples: Mapping with a "text" entry holding the raw string(s) to
            encode (a list of strings when called through
            `Dataset.map(..., batched=True)`).

    Returns:
        The tokenizer output for `examples["text"]`, truncated to the
        tokenizer's maximum length and left unpadded.
    """
    # Deliberately no `padding="max_length"`: padding every example to the
    # model maximum at preprocessing time makes every batch as long as the
    # longest possible input. Padding is deferred to batch collation instead
    # (the Trainer in this notebook receives the tokenizer, which lets it pad
    # each batch to the length of its longest member), so short texts cost
    # far less compute and memory.
    return tokenizer(examples["text"], truncation=True)

def _tokenize_split(split):
    """Run `tokenize_function` over one split in batched mode."""
    return split.map(tokenize_function, batched=True)


# Tokenize the train, validation and test splits, in that order.
train_data = _tokenize_split(train_data)
val_data = _tokenize_split(val_data)
test_data = _tokenize_split(test_data)

In [None]:
# Sequence-classification model loaded from the MODEL_NAME checkpoint and
# configured for two output labels (binary classification).
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

In [None]:
# Fine-tuning configuration handed to the Trainer.
training_args = TrainingArguments(
    # --- Optimisation hyper-parameters ---
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,  # Tunable
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # --- Evaluation / checkpoint cadence: both happen once per epoch ---
    # NOTE(review): newer transformers releases appear to name this argument
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # --- Checkpoint storage ---
    output_dir="./results",
    # Limits the number of checkpoints kept on disk.
    # NOTE(review): this is presumably a cap on how many are retained, not a
    # "two best models" selection -- confirm in the TrainingArguments docs.
    save_total_limit=2,
    # Reload the best checkpoint once training is over.
    load_best_model_at_end=True,
)

In [None]:
def compute_metrics(eval_pred):
    """Compute classification metrics from an evaluation prediction pair.

    Args:
        eval_pred: An object that unpacks into `(logits, labels)`.

    Returns:
        A dict with "accuracy", "f1", "precision" and "recall", where the
        last three are computed with binary averaging.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predicted = np.argmax(scores, axis=-1)
    prf = precision_recall_fscore_support(gold, predicted, average="binary")
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": prf[2],
        "precision": prf[0],
        "recall": prf[1],
    }


In [None]:
# Assemble the Trainer: it learns from the training split and is evaluated on
# the validation split, scoring predictions with `compute_metrics`.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data,
)

# Launch fine-tuning; kept as the last bare expression so the notebook shows
# the value returned by `train()`.
trainer.train()

In [None]:
# Final evaluation on the test split, which was passed to the Trainer neither
# as the training set nor as the evaluation set.
# NOTE(review): the metric keys presumably carry the default `eval_` prefix
# even though this is the test split -- confirm; `metric_key_prefix` could be
# used to tell the two apart.
results = trainer.evaluate(test_data)
print(results)

In [None]:
# Write the fine-tuned model and its tokenizer to the same directory so that
# both can later be reloaded from a single path.
model.save_pretrained("meme_misogyny_classifier")
tokenizer.save_pretrained("meme_misogyny_classifier")

In [None]:
# Reload the model and tokenizer from the "meme_misogyny_classifier"
# directory, replacing the in-memory `model` and `tokenizer` objects.
model = AutoModelForSequenceClassification.from_pretrained("meme_misogyny_classifier")
tokenizer = AutoTokenizer.from_pretrained("meme_misogyny_classifier")

In [None]:
def predict_misogyny(text):
    """Classify a single text as misogynous or not.

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        text: The string to classify.

    Returns:
        "Misogyne" when the highest-scoring class index is 1, otherwise
        "Non misogyne".
    """
    # Predictions must not depend on dropout, so force evaluation mode in
    # case the model was left in training mode.
    model.eval()
    # A single input needs no padding; truncation keeps it within the
    # tokenizer's maximum length.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    # Put the input tensors on the device that holds the model weights, so
    # the call also works when the model lives on a GPU.
    inputs = inputs.to(model.device)
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=-1).item()
    return "Misogyne" if prediction == 1 else "Non misogyne"

# Quick manual check of the classifier on one example meme description.
sample_text = "The meme represents a panda, we can read this text on it: 'Women are bad at driving'."
print(predict_misogyny(sample_text))