# 🏁 Benchmark & Comparaison de Modèles

**Objectif** : Comparer différentes approches (Supervisé vs Non-Supervisé) et algorithmes pour la détection d'intrusions.

**Modèles testés** :
1.  **Random Forest** (Référence)
2.  **SVM** (Support Vector Machine)
3.  **KNN** (K-Nearest Neighbors)
4.  **Autoencoder** (Deep Learning Non-Supervisé)

---


In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Input
import warnings
# Silence every warning category for the rest of the session so library
# notices do not clutter the cell output.
warnings.filterwarnings('ignore')

# Matplotlib style sheet and seaborn colour palette used by the figures below.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# Report which TensorFlow version this run uses.
print(f"TensorFlow version: {tf.__version__}")

## 📥 1. Chargement et Sampling

⚠️ **Note** : Les algorithmes comme SVM et KNN sont très lents sur 2,5 millions de lignes. Nous allons travailler sur un **échantillon stratifié de 50 000 lignes** pour ce benchmark.

In [None]:
# Dataset location on Kaggle. For a local run, point this at
# '../../../data/processed/cicids2017_cleaned.csv' instead.
DATA_PATH = '/kaggle/input/cicids2017-cleaned-and-preprocessed/cicids2017_cleaned.csv'

# Number of rows kept for the benchmark (also the size of the synthetic fallback).
SAMPLE_SIZE = 50000

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    # Demo fallback: only a missing file triggers it. Any other failure
    # (parse error, permissions, ...) is a real problem and must surface.
    print("‚ö†Ô∏è Fichier non trouv√©, cr√©ation de donn√©es synth√©tiques pour la d√©mo...")
    df = pd.DataFrame(
        np.random.randn(SAMPLE_SIZE, 20),
        columns=[f'feat_{i}' for i in range(20)],
    )
    df['Attack Type'] = np.random.choice(
        ['Normal Traffic', 'DoS', 'PortScan'], SAMPLE_SIZE, p=[0.8, 0.1, 0.1]
    )

# Binary target: 1 for any attack type, 0 for normal traffic.
df['Label'] = (df['Attack Type'] != 'Normal Traffic').astype(int)

print(f"Dataset complet: {df.shape}")

# Stratified down-sampling. train_test_split refuses a train_size that leaves
# no rows for the other side of the split, so a dataset that already fits in
# SAMPLE_SIZE rows (e.g. the synthetic fallback) is used in full instead.
if len(df) > SAMPLE_SIZE:
    df_sample, _ = train_test_split(
        df, train_size=SAMPLE_SIZE, stratify=df['Label'], random_state=42
    )
else:
    df_sample = df.copy()

print(f"Dataset √©chantillonn√©: {df_sample.shape}")
print(df_sample['Label'].value_counts(normalize=True))

## 🛠️ 2. Préparation

In [None]:
# Feature matrix: every numeric column of the sample except the binary target.
y = df_sample['Label']
X = df_sample.select_dtypes(include=[np.number]).drop(columns='Label')

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Standardise features: the scaler is fitted on the training part only and
# then applied unchanged to both parts.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Donn√©es pr√©par√©es et normalis√©es.")

## 🏎️ 3. Benchmark Supervisé

In [None]:
# Benchmark registry: one entry per model, keyed by its display name.
results = {}


def train_evaluate(model, name):
    """Fit ``model``, score it on the hold-out set, persist it and log the run.

    The model is fitted on the module-level ``X_train_scaled`` / ``y_train``
    and evaluated on ``X_test_scaled`` / ``y_test``. The fitted model is
    dumped with joblib to ``<name lower-cased, spaces replaced by '_'>.pkl``
    and the timing, metrics and file name are stored in ``results[name]``.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        name: Display name; used as the ``results`` key and to build the file name.
    """
    print(f"\nüîÑ Entra√Ænement de {name}...")

    # Only the fit call is timed.
    t0 = time.time()
    model.fit(X_train_scaled, y_train)
    train_time = time.time() - t0

    y_pred = model.predict(X_test_scaled)

    # Metric label -> scoring function, evaluated in this order.
    scorers = {
        'Accuracy': accuracy_score,
        'F1-Score': f1_score,
        'Precision': precision_score,
        'Recall': recall_score,
    }
    scores = {label: scorer(y_test, y_pred) for label, scorer in scorers.items()}
    f1 = scores['F1-Score']

    # Persist the fitted model in the current working directory.
    filename = f"{name.replace(' ', '_').lower()}.pkl"
    joblib.dump(model, filename)

    results[name] = {'Time_sec': train_time, **scores, 'File': filename}

    print(f"‚úÖ Termin√© en {train_time:.2f}s | F1-Score: {f1:.4f} | Sauvegard√©: {filename}")

In [None]:
# Supervised candidates: 100-tree random forest, RBF-kernel SVM and
# 5-nearest-neighbours classifier.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
svm = SVC(kernel='rbf', random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)

# Train and score them one after another, in the same order as listed above.
for _label, _estimator in (('Random Forest', rf), ('SVM', svm), ('KNN', knn)):
    train_evaluate(_estimator, _label)

## 🧠 4. Approche Non-Supervisée (Autoencoder)

**Principe** : L'Autoencoder apprend à compresser et reconstruire uniquement le trafic **NORMAL**. S'il n'arrive pas à reconstruire une donnée (erreur de reconstruction élevée), c'est probablement une **ANOMALIE** (Attaque).

In [None]:
# Split by class: the autoencoder is trained on NORMAL traffic only.
X_train_normal = X_train_scaled[y_train == 0]
X_test_normal = X_test_scaled[y_test == 0]
X_test_anomaly = X_test_scaled[y_test == 1]

print(f"Train Normal: {X_train_normal.shape}")

# Autoencoder architecture: symmetric 32-16-8-16-32 bottleneck.
input_dim = X_train_scaled.shape[1]

autoencoder = Sequential([
    # Encoder
    Dense(32, activation='relu', input_shape=(input_dim,)),
    Dense(16, activation='relu'),
    Dense(8, activation='relu'),  # Latent space (compression)
    # Decoder
    Dense(16, activation='relu'),
    Dense(32, activation='relu'),
    # Linear output: the inputs are standardised (unbounded, partly negative),
    # so a sigmoid output limited to (0, 1) could never reconstruct them.
    Dense(input_dim, activation='linear'),
])

autoencoder.compile(optimizer='adam', loss='mse')

# Train to reconstruct normal traffic. The validation data is only monitored;
# no callback uses it, so it does not influence the fitted weights.
print("\nüîÑ Entra√Ænement de l'Autoencoder...")
start = time.time()
history = autoencoder.fit(
    X_train_normal, X_train_normal,
    epochs=20,
    batch_size=64,
    validation_data=(X_test_normal, X_test_normal),
    shuffle=True,
    verbose=0
)
train_time = time.time() - start
print(f"‚úÖ Termin√© en {train_time:.2f}s")

# Detection threshold: 95th percentile of the reconstruction error on the
# NORMAL TRAINING data (about 5% false positives accepted). It is calibrated
# on training data so that no test-set label leaks into the reported metrics.
train_reconstructions = autoencoder.predict(X_train_normal)
train_mse = np.mean(np.power(X_train_normal - train_reconstructions, 2), axis=1)
threshold = np.percentile(train_mse, 95)
print(f"Seuil de d√©tection d'anomalie: {threshold:.4f}")

# Per-sample reconstruction error (MSE) on the full test set.
reconstructions = autoencoder.predict(X_test_scaled)
mse = np.mean(np.power(X_test_scaled - reconstructions, 2), axis=1)

# Prediction: error above the threshold means attack (1), otherwise normal (0).
y_pred_ae = (mse > threshold).astype(int)

# Persist the trained network.
autoencoder.save('autoencoder.h5')

# Autoencoder metrics, stored with the same keys as the supervised models.
results['Autoencoder'] = {
    'Time_sec': train_time,
    'Accuracy': accuracy_score(y_test, y_pred_ae),
    'F1-Score': f1_score(y_test, y_pred_ae),
    'Precision': precision_score(y_test, y_pred_ae),
    'Recall': recall_score(y_test, y_pred_ae),
    'File': 'autoencoder.h5'
}

## 📊 5. Comparaison Finale

In [None]:
# Results table: one row per model. Building it row-wise with from_dict lets
# pandas infer a numeric dtype for each metric column; transposing a
# column-wise frame would leave every column as object because of the
# string 'File' entries.
df_res = pd.DataFrame.from_dict(results, orient='index')
df_res = df_res.sort_values(by='F1-Score', ascending=False)

print(df_res)

# Long-form copy for plotting, with the model name as a regular column.
plot_df = df_res.rename_axis('Model').reset_index()

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# F1-score comparison. `data=` is required so that seaborn can resolve the
# column names passed as x / y.
sns.barplot(data=plot_df, x='Model', y='F1-Score', ax=axes[0], palette='viridis')
axes[0].set_title('Performance (F1-Score)', fontsize=14, fontweight='bold')
axes[0].set_ylim(0, 1.1)

# Training time comparison.
sns.barplot(data=plot_df, x='Model', y='Time_sec', ax=axes[1], palette='magma')
axes[1].set_title("Temps d'Entra√Ænement (secondes)", fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

## 💾 6. Sauvegarde sur Google Drive

Nous sauvegardons tous les modèles générés ainsi que le rapport de benchmark dans votre Drive.

In [None]:
# Fonction de sauvegarde Drive
import os
import shutil

def save_to_drive(source_files, destination_folder='NetGuardian_Models/Comparison'):
    """Copy a list of files into a Google Drive folder (best effort).

    Args:
        source_files: Iterable of file paths to copy.
        destination_folder: Folder under ``/content/drive/MyDrive`` that
            receives the copies; created if it does not exist.

    Nothing is raised to the caller: when ``google.colab`` cannot be imported
    a notice with the current working directory is printed, and any other
    error is printed as a message.
    """
    try:
        from google.colab import drive

        # Mount the Drive only when the mount point is not there yet.
        already_mounted = os.path.exists('/content/drive')
        if not already_mounted:
            drive.mount('/content/drive')

        drive_path = f"/content/drive/MyDrive/{destination_folder}"
        os.makedirs(drive_path, exist_ok=True)

        print(f"\nüíæ Sauvegarde vers {drive_path}...")
        for file in source_files:
            # Missing files are reported and skipped rather than aborting the run.
            if not os.path.exists(file):
                print(f"  ‚ö†Ô∏è {file} introuvable")
                continue
            shutil.copy2(file, drive_path)
            print(f"  ‚úÖ {file} copi√©")

    except ImportError:
        # google.colab is not importable: running outside Colab.
        print("‚ö†Ô∏è Environnement local/Kaggle d√©tect√© (Pas de Google Colab).")
        print(f"Les fichiers sont sauvegard√©s localement dans: {os.getcwd()}")
    except Exception as e:
        print(f"‚ùå Erreur sauvegarde Drive: {e}")

# Write the metrics table to CSV in the current working directory.
report_csv = 'benchmark_results.csv'
df_res.to_csv(report_csv)

# Everything to back up: each model file recorded in the results, then the CSV report.
files_to_save = [*df_res['File'], report_csv]

# Copy them to Google Drive (a notice is printed instead when not on Colab).
save_to_drive(files_to_save)

print("\n‚úÖ Tous les mod√®les et r√©sultats ont √©t√© s√©curis√©s.")