## P9_sentiment_boosting.ipynb

# 1. Imports et Setup

In [None]:
import numpy as np
import pandas as pd
import torch
from datasets import ClassLabel, Dataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [None]:
# ⚡ GPU setup: use CUDA when PyTorch reports it as available, otherwise the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"✅ Pytorch Device: {device}")

# Optionnel : fixer la graine aléatoire (reproductibilité)

In [None]:
# Seed the global PyTorch and NumPy random generators so that a fresh
# "Restart & Run All" reproduces the same random draws in the cells below.
SEED = 70
torch.manual_seed(SEED)
np.random.seed(SEED)

# 2. Chargement du Dataset
# (à adapter avec le fichier nettoyé si besoin)

In [None]:
# Load the cleaned tweets and wrap them in a Hugging Face Dataset.
# `dataset` is consumed by the tokenization cell further down, so it must be
# defined here for the notebook to run top to bottom on a fresh kernel.
DATA_PATH = 'tweets_cleaned.csv'  # adapt to the location of the cleaned file

df = pd.read_csv(DATA_PATH)

# Fail early with a readable message if the expected columns are absent
missing_columns = {'text', 'label'} - set(df.columns)
if missing_columns:
    raise KeyError(f"{DATA_PATH} is missing required column(s): {sorted(missing_columns)}")

# preserve_index=False keeps the pandas index from being stored as an extra column
dataset = Dataset.from_pandas(df[['text', 'label']], preserve_index=False)

# 3. Preprocessing et Tokenization

In [None]:
# Checkpoint whose tokenizer encodes the text for the shared tokenization step
tokenizer_name = "distilbert-base-uncased"

# AutoTokenizer resolves the concrete tokenizer class from the checkpoint name
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_name,
)

## 3.1 - Fonction de tokenization

In [None]:
def tokenize_function(examples):
    """Encode the ``'text'`` entry of ``examples`` with the notebook-level ``tokenizer``.

    Truncation is enabled and padding uses the ``'max_length'`` strategy.
    Returns whatever the tokenizer call returns for that input.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded

## 3.2 - Tokenisation

In [None]:
# Apply the tokenizer to the whole dataset; batched=True hands the function
# a batch of examples per call instead of a single example.
tokenized_dataset = dataset.map(
    function=tokenize_function,
    batched=True,
)

# 4. Création des datasets train/test

In [None]:
# Stratified 80/20 train/test split.
# `stratify_by_column` only accepts a ClassLabel feature; a label column that came
# from a pandas integer/string column is a plain Value, so encode it first.
label_feature = tokenized_dataset.features['label']
if isinstance(label_feature, ClassLabel):
    stratifiable_dataset = tokenized_dataset
else:
    stratifiable_dataset = tokenized_dataset.class_encode_column('label')

# Fixed seed so every run (and every model) sees the same train/test partition
SPLIT_SEED = 42
split_dataset = stratifiable_dataset.train_test_split(
    test_size=0.2,
    stratify_by_column='label',
    seed=SPLIT_SEED,
)
train_dataset = split_dataset['train']
test_dataset = split_dataset['test']

# 5. Baseline : Fine-tuning DistilBERT

In [None]:
# Load the baseline checkpoint for sequence classification with two labels
baseline_checkpoint = 'distilbert-base-uncased'
model_baseline = AutoModelForSequenceClassification.from_pretrained(
    baseline_checkpoint,
    num_labels=2,
)

In [None]:
# Training configuration for the DistilBERT baseline, grouped by concern.
training_args_baseline = TrainingArguments(
    # Output locations
    output_dir='./results_baseline',
    logging_dir='./logs_baseline',
    # Evaluate and checkpoint once per epoch.
    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # Optimisation hyper-parameters
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes (per device)
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

In [None]:
def compute_metrics_baseline(eval_pred):
    """Return accuracy and F1 for one evaluation pass of the baseline.

    The predicted class is the arg-max over axis 1 of ``eval_pred.predictions``;
    it is compared against ``eval_pred.label_ids``.
    """
    predicted_classes = np.argmax(eval_pred.predictions, axis=1)
    references = eval_pred.label_ids
    return {
        "accuracy": accuracy_score(references, predicted_classes),
        "f1": f1_score(references, predicted_classes),
    }


# Trainer for the baseline: trains on the train split, evaluates on the test split
trainer_baseline = Trainer(
    model=model_baseline,
    args=training_args_baseline,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics_baseline,
)

In [None]:
# Fine-tune the baseline. The return value of train() is kept in a variable so
# later cells can reuse it; the bare name on the last line displays it as before.
train_output_baseline = trainer_baseline.train()
train_output_baseline

# 6. Fine-tuning DeBERTaV3 Small

In [None]:
# Load the DeBERTaV3 Small checkpoint for sequence classification with two labels
deberta_checkpoint = 'microsoft/deberta-v3-small'
model_deberta = AutoModelForSequenceClassification.from_pretrained(
    deberta_checkpoint,
    num_labels=2,
)

In [None]:
# Training configuration for DeBERTaV3 Small, grouped by concern.
training_args_deberta = TrainingArguments(
    # Output locations
    output_dir='./results_deberta',
    logging_dir='./logs_deberta',
    # Evaluate and checkpoint once per epoch.
    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # Optimisation hyper-parameters
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes (per device)
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

In [None]:
# `train_dataset` / `test_dataset` were encoded with the DistilBERT tokenizer, whose
# token ids do not correspond to DeBERTa's vocabulary. Re-encode the raw text with
# the tokenizer that belongs to the checkpoint the model was loaded from.
tokenizer_deberta = AutoTokenizer.from_pretrained(model_deberta.name_or_path)

# Reuse the sequence length of the existing encoding so every model is compared on
# equally long inputs. The length is passed explicitly because padding='max_length'
# otherwise relies on the tokenizer's own maximum length, which may not be set for
# this checkpoint (TODO confirm).
max_length_deberta = len(train_dataset[0]['input_ids'])


def tokenize_for_deberta(examples):
    """Encode the ``'text'`` entry of a batch with the DeBERTa tokenizer.

    Uses truncation and fixed-length padding to ``max_length_deberta`` tokens.
    """
    return tokenizer_deberta(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=max_length_deberta,
    )


# Same train/test partition as the baseline, only the token encoding differs
train_dataset_deberta = train_dataset.map(tokenize_for_deberta, batched=True)
test_dataset_deberta = test_dataset.map(tokenize_for_deberta, batched=True)

trainer_deberta = Trainer(
    model=model_deberta,
    args=training_args_deberta,
    train_dataset=train_dataset_deberta,
    eval_dataset=test_dataset_deberta,
    # Accuracy and F1 on the arg-max (axis 1) of the predictions
    compute_metrics=lambda p: {"accuracy": accuracy_score(p.label_ids, np.argmax(p.predictions, axis=1)),
                                "f1": f1_score(p.label_ids, np.argmax(p.predictions, axis=1))}
)

In [None]:
# Fine-tune DeBERTaV3 Small. The return value of train() is kept in a variable so
# later cells can reuse it; the bare name on the last line displays it as before.
train_output_deberta = trainer_deberta.train()
train_output_deberta

# 7. Fine-tuning MiniLMv2

In [None]:
# Load the MiniLMv2 checkpoint for sequence classification with two labels
minilm_checkpoint = 'nreimers/MiniLMv2-L6-H384-distilled-from-RoBERTa-Large'
model_minilm = AutoModelForSequenceClassification.from_pretrained(
    minilm_checkpoint,
    num_labels=2,
)

In [None]:
# Training configuration for MiniLMv2, grouped by concern.
training_args_minilm = TrainingArguments(
    # Output locations
    output_dir='./results_minilm',
    logging_dir='./logs_minilm',
    # Evaluate and checkpoint once per epoch.
    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # Optimisation hyper-parameters
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes (per device)
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

In [None]:
# `train_dataset` / `test_dataset` were encoded with the DistilBERT tokenizer, whose
# token ids do not correspond to this checkpoint's vocabulary. Re-encode the raw text
# with the tokenizer that belongs to the checkpoint the model was loaded from.
tokenizer_minilm = AutoTokenizer.from_pretrained(model_minilm.name_or_path)

# Reuse the sequence length of the existing encoding so every model is compared on
# equally long inputs; passed explicitly rather than relying on the tokenizer's
# own maximum length.
max_length_minilm = len(train_dataset[0]['input_ids'])


def tokenize_for_minilm(examples):
    """Encode the ``'text'`` entry of a batch with the MiniLMv2 tokenizer.

    Uses truncation and fixed-length padding to ``max_length_minilm`` tokens.
    """
    return tokenizer_minilm(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=max_length_minilm,
    )


# Same train/test partition as the baseline, only the token encoding differs
train_dataset_minilm = train_dataset.map(tokenize_for_minilm, batched=True)
test_dataset_minilm = test_dataset.map(tokenize_for_minilm, batched=True)

trainer_minilm = Trainer(
    model=model_minilm,
    args=training_args_minilm,
    train_dataset=train_dataset_minilm,
    eval_dataset=test_dataset_minilm,
    # Accuracy and F1 on the arg-max (axis 1) of the predictions
    compute_metrics=lambda p: {"accuracy": accuracy_score(p.label_ids, np.argmax(p.predictions, axis=1)),
                                "f1": f1_score(p.label_ids, np.argmax(p.predictions, axis=1))}
)


In [None]:
# Fine-tune MiniLMv2. The return value of train() is kept in a variable so
# later cells can reuse it; the bare name on the last line displays it as before.
train_output_minilm = trainer_minilm.train()
train_output_minilm

# 8. Comparaison des résultats

In [None]:
# (Accuracy, F1-score, Temps CPU)

# 9. Techniques d'optimisation (optionnel)

In [None]:
# - Quantization
# - Pruning

# 10. Visualisations et Export des Résultats

In [None]:
# (pour dashboard final)

# 11. Conclusion

In [None]:
# (Résumer ce qui a été observé, points forts/faibles)

# 📄 Fin du notebook
# ➡️ À compléter avec :
# - Temps d'entraînement pour chaque modèle
# - Comparaison visuelle (barplot)
# - Interprétabilité (ex: LIME)
# - Bonus : Quantization possible sur DeBERTaV3 Small