# Comparaison des Modèles de Classification de Toxicité

Ce notebook compare les performances de 3 modèles :
1. **Modèle Hybride** (Naive Bayes / Logistic Regression / Random Forest)
2. **XGBoost**
3. **BERT** (Deep Learning)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.metrics import f1_score, roc_auc_score, classification_report, hamming_loss
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Seed passed to both train_test_split calls so the splits are reproducible across runs.
RANDOM_STATE = 42
# Target columns of the multi-label problem; this order is also used to index
# the columns of every prediction array in the notebook.
LABEL_COLS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

print("Imports OK")

## 1. Chargement des Données et Découpage Train/Validation/Test

In [None]:
# Load the labelled corpus; the path is relative to the notebook's working directory.
df = pd.read_csv('train.csv')
print(f"Dataset: {len(df)} commentaires")

# Fail early, with a readable message, if a column the notebook relies on is absent.
missing_cols = [col for col in ['comment_text', *LABEL_COLS] if col not in df.columns]
if missing_cols:
    raise KeyError(f"Colonnes manquantes dans train.csv: {missing_cols}")

# Missing comments would otherwise be float NaN values mixed in with the text;
# replacing them with '' keeps the array length (and therefore the split) unchanged.
X = df['comment_text'].fillna('').astype(str).values
y = df[LABEL_COLS].values

# 80 % train, then the remaining 20 % is halved into validation and test (10 % each).
# NOTE(review): the test split is only a fair hold-out if the saved models were
# trained with this same split and seed — confirm against the training scripts.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=RANDOM_STATE)

# The three subsets must partition the dataset exactly.
assert len(X_train) + len(X_val) + len(X_test) == len(df)

print(f"Train: {len(X_train)}")
print(f"Validation: {len(X_val)}")
print(f"Test: {len(X_test)}")

## 2. Chargement des Modeles
### 2.1 Modele Hybride (NB/LR/RF)

In [None]:
# Default to "unavailable" so downstream cells can always read the flag,
# even when the import or the load below fails.
HYBRID_AVAILABLE = False

try:
    # Imported inside the try block: a missing hybrid_model module then degrades
    # gracefully instead of aborting the cell before the flag is defined.
    from hybrid_model import HybridToxicityClassifier

    # NOTE(review): the '.pkl' extension suggests pickle deserialization —
    # only load model files from a trusted source.
    hybrid_model = HybridToxicityClassifier.load('toxicity_classifier.pkl')
    hybrid_model.print_config()
    HYBRID_AVAILABLE = True
except Exception as e:
    print(f"Erreur chargement modele hybride: {e}")
    HYBRID_AVAILABLE = False

### 2.2 Modele XGBoost

In [None]:
# Default to "unavailable" so downstream cells can always read the flag,
# even when the import or the load below fails.
XGBOOST_AVAILABLE = False

try:
    # Imported inside the try block: a missing final_model module then degrades
    # gracefully instead of aborting the cell before the flag is defined.
    from final_model import ToxicCommentClassifier

    # NOTE(review): the '.pkl' extension suggests pickle deserialization —
    # only load model files from a trusted source.
    xgb_model = ToxicCommentClassifier.load('toxic_classifier.pkl')
    print("Modele XGBoost charge avec succes")
    print(f"Seuils optimaux: {xgb_model.thresholds}")
    XGBOOST_AVAILABLE = True
except Exception as e:
    print(f"Erreur chargement modele XGBoost: {e}")
    XGBOOST_AVAILABLE = False

### 2.3 Modele BERT

In [None]:
# Set the flag first so later cells can always read it, even if an import below fails.
BERT_AVAILABLE = False

try:
    # Heavy optional dependencies are imported inside the try block so that an
    # environment without torch/transformers falls back to the SageMaker results
    # instead of aborting the cell.
    import torch
    import torch.nn as nn
    from transformers import BertModel, BertTokenizer

    # Same architecture as the one used on SageMaker
    class BertToxicClassifier(nn.Module):
        """BERT encoder followed by dropout and a linear layer producing `num_labels` logits."""

        def __init__(self, num_labels=6, dropout=0.3):
            super().__init__()
            self.bert = BertModel.from_pretrained('bert-base-uncased')
            self.dropout = nn.Dropout(dropout)
            self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

        def forward(self, input_ids, attention_mask):
            """Return the raw logits (no sigmoid applied) for each input sequence."""
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            # The hidden state of the first token is used as the sequence representation.
            pooled_output = outputs.last_hidden_state[:, 0, :]
            pooled_output = self.dropout(pooled_output)
            logits = self.classifier(pooled_output)
            return logits

    bert_tokenizer = BertTokenizer.from_pretrained('bert_results/bert_tokenizer')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    bert_model = BertToxicClassifier(num_labels=6)
    # NOTE(review): torch.load unpickles the checkpoint — only load trusted files
    # (consider weights_only=True on torch versions that support it).
    bert_model.load_state_dict(torch.load('bert_results/bert_toxic_best.pt', map_location=device))
    bert_model.to(device)
    # Inference mode: disables dropout.
    bert_model.eval()

    print(f"Modele BERT charge avec succes sur {device}")
    BERT_AVAILABLE = True

except Exception as e:
    print(f"Erreur chargement BERT: {e}")
    print("Utilisation des resultats SageMaker")
    BERT_AVAILABLE = False

## 3. Predictions sur le Test Set

In [None]:
# Metrics of every model are collected here, keyed by model name.
results = dict()

# Evaluate on at most the first 1000 test examples to keep the notebook fast.
SAMPLE_SIZE = min(len(X_test), 1000)
X_sample, y_sample = X_test[:SAMPLE_SIZE], y_test[:SAMPLE_SIZE]
print(f"Evaluation sur {SAMPLE_SIZE} exemples")

### 3.1 Predictions Modele Hybride

In [None]:
if not HYBRID_AVAILABLE:
    print("Modele Hybride non disponible")
else:
    print("Predictions avec le modele Hybride...")
    hybrid_preds = hybrid_model.predict_batch(X_sample.tolist())
    # Keep only the label columns, in LABEL_COLS order, as an array.
    y_pred_hybrid = hybrid_preds[LABEL_COLS].values

    # Aggregate metrics on the evaluation sample.
    hybrid_f1_micro = f1_score(y_sample, y_pred_hybrid, average='micro')
    hybrid_f1_macro = f1_score(y_sample, y_pred_hybrid, average='macro')
    hybrid_hamming = hamming_loss(y_sample, y_pred_hybrid)

    results['Hybride'] = {
        'predictions': y_pred_hybrid,
        'f1_micro': hybrid_f1_micro,
        'f1_macro': hybrid_f1_macro,
        'hamming': hybrid_hamming,
    }
    print(f"F1 micro: {results['Hybride']['f1_micro']:.4f}")
    print(f"F1 macro: {results['Hybride']['f1_macro']:.4f}")

### 3.2 Predictions Modele XGBoost

In [None]:
if not XGBOOST_AVAILABLE:
    print("Modele XGBoost non disponible")
else:
    print("Predictions avec le modele XGBoost...")
    xgb_preds = xgb_model.predict(X_sample.tolist())
    # Keep only the label columns, in LABEL_COLS order, as an array.
    y_pred_xgb = xgb_preds[LABEL_COLS].values

    # Aggregate metrics on the evaluation sample.
    xgb_f1_micro = f1_score(y_sample, y_pred_xgb, average='micro')
    xgb_f1_macro = f1_score(y_sample, y_pred_xgb, average='macro')
    xgb_hamming = hamming_loss(y_sample, y_pred_xgb)

    results['XGBoost'] = {
        'predictions': y_pred_xgb,
        'f1_micro': xgb_f1_micro,
        'f1_macro': xgb_f1_macro,
        'hamming': xgb_hamming,
    }
    print(f"F1 micro: {results['XGBoost']['f1_micro']:.4f}")
    print(f"F1 macro: {results['XGBoost']['f1_macro']:.4f}")

### 3.3 Predictions Modele BERT

In [None]:
if BERT_AVAILABLE:
    print("Predictions avec le modele BERT...")

    def predict_bert(texts, model, tokenizer, device, batch_size=32, threshold=0.5, max_length=128):
        """Return binary multi-label predictions for `texts`.

        Args:
            texts: sliceable sequence of strings (list or array).
            model: module called as `model(input_ids, attention_mask)` and returning logits.
            tokenizer: tokenizer called on a list of strings, returning a mapping with
                'input_ids' and 'attention_mask' tensors.
            device: torch device the input tensors are moved to.
            batch_size: number of texts encoded and scored per forward pass.
            threshold: probability at or above which a label is predicted as 1.
            max_length: token length every text is padded or truncated to.

        Returns:
            Integer numpy array with one row per text and one column per model output.
        """
        model.eval()
        all_preds = []
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            encoding = tokenizer(
                list(batch_texts),
                padding='max_length',
                truncation=True,
                max_length=max_length,
                return_tensors='pt',
            )
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)
            # No gradients are needed for inference.
            with torch.no_grad():
                logits = model(input_ids, attention_mask)
                probs = torch.sigmoid(logits).cpu().numpy()
                all_preds.extend((probs >= threshold).astype(int))
        return np.array(all_preds)

    y_pred_bert = predict_bert(X_sample, bert_model, bert_tokenizer, device)
    results['BERT'] = {
        'predictions': y_pred_bert,
        'f1_micro': f1_score(y_sample, y_pred_bert, average='micro'),
        'f1_macro': f1_score(y_sample, y_pred_bert, average='macro'),
        'hamming': hamming_loss(y_sample, y_pred_bert)
    }
    print(f"F1 micro: {results['BERT']['f1_micro']:.4f}")
    print(f"F1 macro: {results['BERT']['f1_macro']:.4f}")
else:
    print("BERT non disponible localement - Resultats SageMaker:")
    # Hard-coded metrics recorded outside this notebook. They are not computed on
    # X_sample, so they are not strictly comparable with the other models' scores.
    # NOTE(review): presumably measured on SageMaker's own evaluation split — confirm.
    results['BERT'] = {
        'f1_micro': 0.7947,
        'f1_macro': 0.6774,
        'hamming': 0.0149,
        'f1_per_label': {'toxic': 0.8348, 'severe_toxic': 0.4469, 'obscene': 0.8284, 'threat': 0.6263, 'insult': 0.7824, 'identity_hate': 0.5455}
    }
    print(f"F1 micro: {results['BERT']['f1_micro']:.4f}")
    print(f"F1 macro: {results['BERT']['f1_macro']:.4f}")

## 4. Comparaison des Resultats

In [None]:
# Summary table: one row per model with its aggregate metrics.
banner_rule = "=" * 70
print(banner_rule)
print("COMPARAISON DES MODELES")
print(banner_rule)
print(f"\n{'Modele':<20} {'F1 Micro':<12} {'F1 Macro':<12} {'Hamming Loss'}")
print("-" * 60)

for model_name in results:
    metrics = results[model_name]
    print(f"{model_name:<20} {metrics['f1_micro']:<12.4f} {metrics['f1_macro']:<12.4f} {metrics['hamming']:.4f}")

# (name, metrics) pair of the model with the highest micro-averaged F1.
best_model = max(results.items(), key=lambda item: item[1]['f1_micro'])
best_name, best_metrics = best_model
print(f"\nMeilleur modele (F1 micro): {best_name} ({best_metrics['f1_micro']:.4f})")

## 5. Visualisation

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
ax1, ax2, ax3 = axes  # names kept for any cell that refers to a single panel
models = list(results.keys())
colors = ['#3498db', '#e74c3c', '#2ecc71']

# One entry per panel:
# (metric key, title, y-axis label, y-limits or None, value-label offset, value-label format).
# The Hamming panel keeps matplotlib's automatic y-range because its values are small.
panels = [
    ('f1_micro', 'F1 Score Micro', 'Score', (0, 1), 0.02, '{:.3f}'),
    ('f1_macro', 'F1 Score Macro', 'Score', (0, 1), 0.02, '{:.3f}'),
    ('hamming', 'Hamming Loss (plus bas = mieux)', 'Loss', None, 0.002, '{:.4f}'),
]

for ax, (metric_key, title, ylabel, ylim, text_offset, value_fmt) in zip(axes, panels):
    metric_values = [results[m][metric_key] for m in models]
    bars = ax.bar(models, metric_values, color=colors[:len(models)])
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    if ylim is not None:
        ax.set_ylim(*ylim)
    # Write each value just above its bar.
    for bar, val in zip(bars, metric_values):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + text_offset, value_fmt.format(val), ha='center')

plt.tight_layout()
plt.savefig('comparaison_modeles.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. F1 Score par Label

In [None]:
# Per-label F1 for every model: computed from stored predictions when available,
# otherwise taken from the pre-recorded 'f1_per_label' entry.
f1_per_label = {}

for model_name, data in results.items():
    if 'predictions' in data:
        model_preds = data['predictions']
        f1_per_label[model_name] = {
            label: f1_score(y_sample[:, col], model_preds[:, col], zero_division=0)
            for col, label in enumerate(LABEL_COLS)
        }
    elif 'f1_per_label' in data:
        f1_per_label[model_name] = data['f1_per_label']

table_rule = "=" * 80
print("\n" + table_rule)
print("F1 SCORE PAR LABEL")
print(table_rule)
header = f"{'Label':<15}" + "".join(f"{m:<15}" for m in f1_per_label)
print(header)
print("-" * len(header))

# One row per label, one column per model; a missing score is shown as 0.
for label in LABEL_COLS:
    cells = [f"{scores.get(label, 0):<15.4f}" for scores in f1_per_label.values()]
    print(f"{label:<15}" + "".join(cells))

In [None]:
if f1_per_label:
    fig, ax = plt.subplots(figsize=(12, 6))
    x = np.arange(len(LABEL_COLS))
    n_models = len(f1_per_label)
    # 0.25 for up to three models; narrower beyond that so a label's group of bars
    # never spills into the neighbouring label.
    width = min(0.25, 0.8 / n_models)

    for i, (model_name, scores) in enumerate(f1_per_label.items()):
        values = [scores.get(label, 0) for label in LABEL_COLS]
        # Centre the group of bars on each label's tick.
        offset = (i - n_models/2 + 0.5) * width
        # Cycle through the palette so more models than colours cannot raise IndexError.
        ax.bar(x + offset, values, width, label=model_name, color=colors[i % len(colors)])

    ax.set_ylabel('F1 Score')
    ax.set_title('F1 Score par Label et par Modele')
    ax.set_xticks(x)
    ax.set_xticklabels(LABEL_COLS, rotation=45, ha='right')
    ax.legend()
    ax.set_ylim(0, 1)
    # Visual reference line at F1 = 0.5.
    ax.axhline(y=0.5, color='gray', linestyle='--', alpha=0.5)

    plt.tight_layout()
    plt.savefig('f1_par_label_comparaison.png', dpi=150, bbox_inches='tight')
    plt.show()

## 7. Conclusions

In [None]:
# NOTE(review): this summary is a static string; it is not derived from `results`,
# so statements such as the best-performing model must be re-checked by hand
# against the metrics printed above whenever the models or the data change.
print("""
======================================================================
CONCLUSIONS
======================================================================

RESUME DES MODELES:

1. MODELE HYBRIDE (NB/LR/RF)
   - Rapide et interpretable
   - Peu de ressources necessaires
   - Performance limitee sur classes rares

2. XGBOOST
   - Bon equilibre performance/vitesse
   - Necessite TF-IDF (feature engineering)
   - Seuils optimisables par label

3. BERT (Deep Learning)
   - Meilleure performance globale
   - Comprend le contexte semantique
   - Necessite GPU et plus de ressources

RECOMMANDATIONS:
- Production avec contraintes de latence: Hybride ou XGBoost
- Meilleure precision: BERT
- Bon compromis: XGBoost avec seuils optimises
""")

## 8. Test sur des Exemples

In [None]:
# Hand-written comments used as a qualitative sanity check of each available model.
test_comments = [
    "You are an idiot and I hate you!",
    "This is a great article, thanks for sharing!",
    "I will find you and hurt you",
    "What a stupid and ugly person",
    "Thanks for the helpful information"
]

print("=" * 80)
print("TEST SUR DES EXEMPLES")
print("=" * 80)

for comment in test_comments:
    print(f"\nCommentaire: \"{comment}\"")
    print("-" * 60)

    if HYBRID_AVAILABLE:
        pred = hybrid_model.predict(comment)
        labels = [l for l, v in pred.items() if v == 1]
        print(f"  Hybride: {labels if labels else 'Non toxique'}")

    if XGBOOST_AVAILABLE:
        # Pass a one-element list, as in the evaluation call on the test sample;
        # the first row of the result is then the prediction for this comment.
        pred = xgb_model.predict([comment])
        labels = [l for l in LABEL_COLS if pred.iloc[0][l] == 1]
        print(f"  XGBoost: {labels if labels else 'Non toxique'}")

    if BERT_AVAILABLE:
        # predict_bert is defined in the BERT prediction cell whenever BERT_AVAILABLE is True.
        # Output columns are read in LABEL_COLS order, as in the per-label F1 computation.
        bert_row = predict_bert([comment], bert_model, bert_tokenizer, device)[0]
        labels = [l for l, v in zip(LABEL_COLS, bert_row) if v == 1]
        print(f"  BERT: {labels if labels else 'Non toxique'}")