# Classification de Toxicite avec RoBERTa - Amazon SageMaker

RoBERTa (Robustly Optimized BERT Pretraining Approach) est une version amelioree de BERT par Facebook/Meta.

**Ameliorations par rapport a BERT:**
- Entraine plus longtemps avec plus de donnees
- Suppression de la tache NSP (Next Sentence Prediction)
- Entrainement sur des sequences plus longues et avec des batchs plus grands
- Masquage dynamique

**Labels de toxicite:**
- toxic, severe_toxic, obscene, threat, insult, identity_hate

## 1. Configuration et Imports

In [None]:
# Installation des dependances
!pip install transformers datasets accelerate -q

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import RobertaModel, RobertaTokenizer, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, hamming_loss, roc_auc_score
from tqdm import tqdm
import time
import os
import warnings
warnings.filterwarnings('ignore')

# Global configuration.
# Names of the six binary target columns; their order fixes the order of the
# label vectors used everywhere else in the notebook.
LABEL_COLS = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# A single seed shared by NumPy and PyTorch so that runs are repeatable.
RANDOM_STATE = 42
torch.manual_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

print("Imports OK")

## 2. Verification GPU

In [None]:
# Select the compute device: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Device: {device}")

# Either warn that training falls back to the CPU, or report the name of GPU 0
# and its total memory (bytes divided by 1e9).
if device.type != 'cuda':
    print("ATTENTION: Pas de GPU detecte. L'entrainement sera tres lent.")
else:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memoire GPU: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

## 3. Hyperparametres

In [None]:
# Hyperparameters

# Pretrained checkpoint to fine-tune ('roberta-large' is the bigger alternative).
MODEL_NAME = 'roberta-base'

# Data volume: train on the whole CSV, or on SAMPLE_SIZE random rows when
# USE_FULL_DATASET is False.
USE_FULL_DATASET = True
SAMPLE_SIZE = 50000

# Tokenization and batching.
MAX_LENGTH = 128
BATCH_SIZE = 16  # RoBERTa is larger, so the batch size is kept small

# Optimisation.
EPOCHS = 3
LEARNING_RATE = 1e-5  # lower than the value used for BERT
WARMUP_RATIO = 0.1
DROPOUT = 0.3

# Echo the settings that matter most for reproducing a run.
print("Configuration:")
print(f"  Modele: {MODEL_NAME}")
print(f"  Dataset complet: {USE_FULL_DATASET}")
print(f"  Max length: {MAX_LENGTH}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Epochs: {EPOCHS}")
print(f"  Learning rate: {LEARNING_RATE}")

## 4. Chargement des Donnees

In [None]:
# Load the training CSV from the working directory.
print("Chargement des donnees...")
df = pd.read_csv('train.csv')
print(f"Dataset original: {len(df)} lignes")

# Optional subsampling for quicker experiments.
if not USE_FULL_DATASET:
    # DataFrame.sample (without replacement) raises ValueError when n exceeds
    # the number of rows, so the request is clamped to the dataset size.
    df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=RANDOM_STATE)
    print(f"Echantillon: {len(df)} lignes")

# Show the percentage of positive rows for each label column.
print("\nDistribution des labels:")
for col in LABEL_COLS:
    pct = df[col].mean() * 100
    print(f"  {col}: {pct:.2f}%")

In [None]:
# Split the data: raw comment texts as inputs, the label columns as targets.
X = df['comment_text'].values
y = df[LABEL_COLS].values

# First cut: 80% for training, 20% held out.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    random_state=RANDOM_STATE,
    test_size=0.2,
)
# Second cut: the held-out part is halved into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    random_state=RANDOM_STATE,
    test_size=0.5,
)

print(f"\nSplit des donnees:")
print(f"  Train: {len(X_train)}")
print(f"  Validation: {len(X_val)}")
print(f"  Test: {len(X_test)}")

## 5. Tokenizer RoBERTa

In [None]:
# Load the tokenizer that matches the pretrained checkpoint.
print(f"Chargement du tokenizer {MODEL_NAME}...")
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

# Smoke test: encode one sentence with the same padding/truncation settings
# as the training data and report how many ids are not padding.
test_text = "This is a test sentence for RoBERTa tokenizer."
tokens = tokenizer(
    test_text,
    max_length=MAX_LENGTH,
    truncation=True,
    padding='max_length',
)
print(f"\nTest tokenisation:")
print(f"  Texte: {test_text}")
print(f"  Nombre de tokens: {len([t for t in tokens['input_ids'] if t != tokenizer.pad_token_id])}")

## 6. Dataset PyTorch

In [None]:
class ToxicDataset(Dataset):
    """Dataset that tokenizes one text per access and pairs it with its labels.

    Each item is a dict with three tensors:
      * 'input_ids' and 'attention_mask': the tokenizer outputs with their
        leading dimension squeezed away;
      * 'labels': a float tensor built from ``labels[idx]``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # str() coerces non-string entries so the tokenizer always gets text.
        encoded = self.tokenizer(
            str(self.texts[idx]),
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )

        # The tokenizer output has a leading dimension of size 1; drop it so
        # that the DataLoader can stack items into batches.
        item = {
            key: encoded[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# Build one dataset per split; all share the tokenizer and sequence length.
train_dataset, val_dataset, test_dataset = (
    ToxicDataset(texts, targets, tokenizer, MAX_LENGTH)
    for texts, targets in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# Wrap them in dataloaders: only the training data is shuffled.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader, test_loader = (
    DataLoader(split, shuffle=False, batch_size=BATCH_SIZE)
    for split in (val_dataset, test_dataset)
)

print(f"DataLoaders crees:")
print(f"  Train batches: {len(train_loader)}")
print(f"  Val batches: {len(val_loader)}")
print(f"  Test batches: {len(test_loader)}")

## 7. Modele RoBERTa

In [None]:
class RobertaToxicClassifier(nn.Module):
    """Pretrained RoBERTa encoder followed by dropout and one linear layer.

    Args:
        model_name: checkpoint identifier passed to ``RobertaModel.from_pretrained``.
        num_labels: number of outputs of the linear head.
        dropout: dropout probability applied before the head.
    """

    def __init__(self, model_name, num_labels=6, dropout=0.3):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        hidden_size = self.roberta.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the head's raw outputs (no activation is applied here)."""
        encoder_out = self.roberta(
            attention_mask=attention_mask,
            input_ids=input_ids,
        )
        # Position 0 holds the <s> token (the counterpart of BERT's [CLS]);
        # its hidden state is used as the sequence representation.
        first_token_state = encoder_out.last_hidden_state[:, 0, :]
        return self.classifier(self.dropout(first_token_state))

# Instantiate the classifier and move it to the selected device.
print(f"Chargement du modele {MODEL_NAME}...")
model = RobertaToxicClassifier(MODEL_NAME, num_labels=6, dropout=DROPOUT).to(device)

# Count all parameters, then only those that will receive gradients.
total_params = sum(map(torch.Tensor.numel, model.parameters()))
trainable_params = sum(
    p.numel() for p in filter(lambda p: p.requires_grad, model.parameters())
)
print(f"\nParametres du modele:")
print(f"  Total: {total_params:,}")
print(f"  Entrainables: {trainable_params:,}")

## 8. Configuration de l'Entrainement

In [None]:
# Loss: binary cross-entropy computed on raw logits, one term per label.
criterion = nn.BCEWithLogitsLoss()

# Optimizer: AdamW over every model parameter.
optimizer = AdamW(
    model.parameters(),
    weight_decay=0.01,
    lr=LEARNING_RATE,
)

# Scheduler: one step per batch, so the horizon is batches-per-epoch times
# epochs; the first WARMUP_RATIO share of those steps is the warmup.
total_steps = EPOCHS * len(train_loader)
warmup_steps = int(WARMUP_RATIO * total_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=warmup_steps,
)

print(f"Configuration entrainement:")
print(f"  Total steps: {total_steps}")
print(f"  Warmup steps: {warmup_steps}")

## 9. Fonctions d'Entrainement

In [None]:
def train_epoch(model, dataloader, criterion, optimizer, scheduler, device):
    """Run one optimisation pass over ``dataloader``.

    For every batch: forward pass, loss, backward pass, gradient-norm clipping
    at 1.0, then one optimizer step and one scheduler step.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0

    batches = tqdm(dataloader, desc="Training")
    for batch in batches:
        input_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        )

        optimizer.zero_grad()

        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()

        # Clip before stepping so the update uses the bounded gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        batches.set_postfix({'loss': f'{batch_loss:.4f}'})

    return running_loss / len(dataloader)

def evaluate(model, dataloader, criterion, device):
    """Score ``model`` on ``dataloader`` without computing gradients.

    Probabilities are the sigmoid of the model outputs; a label is predicted
    positive when its probability is at least 0.5.

    Returns:
        A dict with the mean batch loss ('loss'), micro and macro F1
        ('f1_micro', 'f1_macro') and the stacked arrays 'predictions',
        'labels' and 'probabilities'.
    """
    model.eval()
    loss_sum = 0
    collected = {'probabilities': [], 'predictions': [], 'labels': []}

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids, attention_mask, labels = (
                batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'labels')
            )

            logits = model(input_ids, attention_mask)
            loss_sum += criterion(logits, labels).item()

            batch_probs = torch.sigmoid(logits).cpu().numpy()

            # Accumulate row by row; everything is stacked after the loop.
            collected['probabilities'].extend(batch_probs)
            collected['predictions'].extend((batch_probs >= 0.5).astype(int))
            collected['labels'].extend(labels.cpu().numpy())

    y_pred = np.array(collected['predictions'])
    y_true = np.array(collected['labels'])
    y_prob = np.array(collected['probabilities'])

    micro = f1_score(y_true, y_pred, average='micro')
    macro = f1_score(y_true, y_pred, average='macro')

    return {
        'loss': loss_sum / len(dataloader),
        'f1_micro': micro,
        'f1_macro': macro,
        'predictions': y_pred,
        'labels': y_true,
        'probabilities': y_prob,
    }

## 10. Entrainement

In [None]:
# Training driver: run EPOCHS passes of train + validation, record the
# per-epoch metrics in `history`, and keep a snapshot of the weights that
# reached the best validation micro-F1.
print("=" * 70)
print("ENTRAINEMENT DU MODELE ROBERTA")
print("=" * 70)
print(f"Dataset: {len(X_train)} train / {len(X_val)} val / {len(X_test)} test")
print(f"Epochs: {EPOCHS}")
print(f"Batch size: {BATCH_SIZE}")
print("=" * 70)

history = {
    'train_loss': [],
    'val_loss': [],
    'val_f1_micro': [],
    'val_f1_macro': []
}

# Start below any reachable F1 so the first epoch always records a snapshot,
# even when its micro-F1 is exactly 0.
best_f1 = float('-inf')
best_model_state = None
start_time = time.time()

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1}/{EPOCHS}")
    print("-" * 50)

    # Train
    train_loss = train_epoch(model, train_loader, criterion, optimizer, scheduler, device)
    history['train_loss'].append(train_loss)

    # Validate
    val_results = evaluate(model, val_loader, criterion, device)
    history['val_loss'].append(val_results['loss'])
    history['val_f1_micro'].append(val_results['f1_micro'])
    history['val_f1_macro'].append(val_results['f1_macro'])

    print(f"\nTrain Loss: {train_loss:.4f}")
    print(f"Val Loss: {val_results['loss']:.4f}")
    print(f"Val F1 micro: {val_results['f1_micro']:.4f}")
    print(f"Val F1 macro: {val_results['f1_macro']:.4f}")

    # Keep the best model seen so far.
    if val_results['f1_micro'] > best_f1:
        best_f1 = val_results['f1_micro']
        # state_dict() hands back references to the live tensors and
        # dict.copy() is shallow, so a plain copy would keep changing as
        # training continues. Clone every tensor to freeze the snapshot; it is
        # stored on the CPU so it does not take up GPU memory.
        best_model_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }
        print(f"Nouveau meilleur modele! F1 micro: {best_f1:.4f}")

    # Elapsed time since the start of training
    elapsed = time.time() - start_time
    print(f"Temps ecoule: {elapsed/60:.1f} minutes")

total_time = time.time() - start_time
print(f"\nEntrainement termine en {total_time/60:.1f} minutes")

## 11. Sauvegarde du Meilleur Modele

In [None]:
# Restore the best checkpoint recorded during training. When none was
# recorded (no epoch improved on the initial best score, or EPOCHS is 0),
# load_state_dict(None) would crash, so the current weights are kept instead.
if best_model_state is not None:
    model.load_state_dict(best_model_state)
else:
    print("ATTENTION: aucun meilleur modele enregistre, les poids actuels sont sauvegardes.")

# Save the model weights
torch.save(model.state_dict(), 'roberta_toxic_best.pt')
print("Meilleur modele sauvegarde: roberta_toxic_best.pt")

# Save the tokenizer next to the weights so inference can reload both
tokenizer.save_pretrained('roberta_tokenizer')
print("Tokenizer sauvegarde: roberta_tokenizer/")

## 12. Evaluation sur le Test Set

In [None]:
# Banner, then the final evaluation on the held-out test split.
print("=" * 70, "EVALUATION SUR LE TEST SET", "=" * 70, sep="\n")

test_results = evaluate(model, test_loader, criterion, device)

# Aggregate metrics; the Hamming loss is computed from the stored label and
# prediction arrays.
print(f"\nTest Loss: {test_results['loss']:.4f}")
print(f"Test F1 micro: {test_results['f1_micro']:.4f}")
print(f"Test F1 macro: {test_results['f1_macro']:.4f}")
print(f"Hamming Loss: {hamming_loss(test_results['labels'], test_results['predictions']):.4f}")

In [None]:
# Per-label metrics table
print("\n" + "=" * 70)
print("METRIQUES PAR LABEL")
print("=" * 70)

print(f"\n{'Label':<15} {'F1':<10} {'Precision':<10} {'Recall':<10} {'Support'}")
print("-" * 55)

# Transposing turns each label's column into one row, so labels, truths and
# predictions can be walked in lockstep.
for label, y_true, y_pred in zip(
    LABEL_COLS, test_results['labels'].T, test_results['predictions'].T
):
    true_pos = np.sum((y_pred == 1) & (y_true == 1))
    support = np.sum(y_true == 1)

    # max(..., 1) avoids dividing by zero when a label has no predicted or no
    # actual positives; the ratio is then 0.
    precision = true_pos / max(np.sum(y_pred == 1), 1)
    recall = true_pos / max(support, 1)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print(f"{label:<15} {f1:<10.4f} {precision:<10.4f} {recall:<10.4f} {support}")

## 13. Visualisation

In [None]:
import matplotlib.pyplot as plt

# Two side-by-side panels: losses on the left, validation F1 on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# Left panel: training and validation loss per epoch
ax1.plot(history['train_loss'], label='Train Loss', marker='o')
ax1.plot(history['val_loss'], label='Val Loss', marker='s')
ax1.set(xlabel='Epoch', ylabel='Loss', title='Training & Validation Loss')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Right panel: validation micro and macro F1 per epoch
ax2.plot(history['val_f1_micro'], label='F1 Micro', marker='o')
ax2.plot(history['val_f1_macro'], label='F1 Macro', marker='s')
ax2.set(xlabel='Epoch', ylabel='F1 Score', title='Validation F1 Scores')
ax2.legend()
ax2.grid(True, alpha=0.3)

# Save the figure to disk before showing it.
plt.tight_layout()
plt.savefig('roberta_training_history.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Test-set F1 for each label, in LABEL_COLS order
f1_scores = [
    f1_score(test_results['labels'][:, i], test_results['predictions'][:, i], zero_division=0)
    for i in range(len(LABEL_COLS))
]


def _bar_color(score):
    """Map an F1 score to red (< 0.5), orange (< 0.7) or green (otherwise)."""
    if score < 0.5:
        return '#e74c3c'
    if score < 0.7:
        return '#f39c12'
    return '#2ecc71'


plt.figure(figsize=(10, 6))
colors = list(map(_bar_color, f1_scores))
bars = plt.bar(LABEL_COLS, f1_scores, color=colors)
plt.ylabel('F1 Score')
plt.title('RoBERTa - F1 Score par Label')
plt.xticks(rotation=45, ha='right')
plt.ylim(0, 1)
# Dashed reference line at F1 = 0.5
plt.axhline(y=0.5, color='gray', linestyle='--', alpha=0.5)

# Write each score just above its bar, centred horizontally.
for bar, score in zip(bars, f1_scores):
    x_center = bar.get_x() + bar.get_width()/2
    plt.text(x_center, bar.get_height() + 0.02, f'{score:.3f}', ha='center', fontsize=10)

plt.tight_layout()
plt.savefig('roberta_f1_by_label.png', dpi=150, bbox_inches='tight')
plt.show()

## 14. Test sur des Exemples

In [None]:
def predict_text(text, model, tokenizer, device, max_length=None, label_cols=None):
    """Return the per-label probabilities the model assigns to one text.

    Args:
        text: the comment to score.
        model: callable taking ``(input_ids, attention_mask)`` and returning
            logits; it is switched to eval mode.
        tokenizer: callable producing 'input_ids' and 'attention_mask' tensors.
        device: device the encoded inputs are moved to.
        max_length: padding/truncation length; defaults to the notebook-level
            ``MAX_LENGTH`` when omitted.
        label_cols: label names paired with the outputs, in order; defaults to
            the notebook-level ``LABEL_COLS`` when omitted.

    Returns:
        A dict mapping each label name to a Python float (sigmoid of the
        corresponding logit of the first row of the model output).
    """
    # Fall back to the notebook globals only when the caller gives no value,
    # so the function can also be reused outside this notebook.
    if max_length is None:
        max_length = MAX_LENGTH
    if label_cols is None:
        label_cols = LABEL_COLS

    model.eval()
    encoding = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        # Only the first row is kept: the function scores a single text.
        probs = torch.sigmoid(outputs).cpu().numpy()[0]

    return {label: float(prob) for label, prob in zip(label_cols, probs)}

# Hand-written examples: a mix of hostile and harmless comments
test_texts = [
    "You are such an idiot, I hate you!",
    "Thank you for this helpful article!",
    "I will find you and hurt you badly",
    "This is a normal comment about the weather",
    "You stupid ugly moron, go to hell",
]

print("=" * 70, "TEST SUR DES EXEMPLES", "=" * 70, sep="\n")

for text in test_texts:
    print(f"\nTexte: \"{text}\"")
    preds = predict_text(text, model, tokenizer, device)

    # Keep only the labels whose probability is strictly above 0.5.
    toxic_labels = [(l, p) for l, p in preds.items() if p > 0.5]
    if not toxic_labels:
        print("  Non toxique")
        continue

    print("  Labels detectes:")
    for label, prob in toxic_labels:
        print(f"    - {label}: {prob:.2%}")

## 15. Archivage des Resultats

In [None]:
import shutil

# Create the results folder (no error if it already exists)
os.makedirs('roberta_results', exist_ok=True)

# Copy the saved weights and the two figures into it
for artifact in (
    'roberta_toxic_best.pt',
    'roberta_training_history.png',
    'roberta_f1_by_label.png',
):
    shutil.copy(artifact, 'roberta_results/')

# The tokenizer directory is copied only when it is present
if os.path.exists('roberta_tokenizer'):
    shutil.copytree('roberta_tokenizer', 'roberta_results/roberta_tokenizer', dirs_exist_ok=True)

import json

# Metrics and the run configuration, written as JSON next to the artifacts
metrics = dict(
    model=MODEL_NAME,
    test_f1_micro=test_results['f1_micro'],
    test_f1_macro=test_results['f1_macro'],
    test_loss=test_results['loss'],
    hamming_loss=hamming_loss(test_results['labels'], test_results['predictions']),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    training_time_minutes=total_time / 60,
)

with open('roberta_results/metrics.json', 'w') as metrics_file:
    json.dump(metrics, metrics_file, indent=2)

# Zip the whole folder into roberta_results.zip
shutil.make_archive('roberta_results', 'zip', 'roberta_results')
print("Archive creee: roberta_results.zip")
print("Telechargez ce fichier depuis SageMaker.")

## 16. Comparaison BERT vs RoBERTa

Apres avoir execute ce notebook, remplacez les `?` ci-dessous par les valeurs de `roberta_results/metrics.json` et comparez-les avec celles de BERT:

| Metrique | BERT | RoBERTa |
|----------|------|--------|
| F1 Micro | 0.7947 | ? |
| F1 Macro | 0.6774 | ? |
| Hamming Loss | 0.0149 | ? |

RoBERTa devrait generalement obtenir des resultats legerement meilleurs que BERT.