# 🎯 NetGuardian-AI : Approche Hybride

**Objectif** : Implémenter un système de détection hybride avec 2 modèles en cascade

- **Modèle 1** : Détection binaire (Normal vs Attaque)
- **Modèle 2** : Classification multi-classes (Type d'attaque)

---

## 📦 Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
import joblib
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')

# Global plotting defaults used by the figures drawn later in the notebook.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

ModuleNotFoundError: No module named 'pandas'

## 📂 Charger le Dataset

In [None]:
# Load the pre-cleaned dataset from the Kaggle input directory.
dataset_path = '/kaggle/input/cicids2017-cleaned-and-preprocessed/cicids2017_cleaned.csv'
df = pd.read_csv(dataset_path)

# Report the table dimensions and the number of rows per attack type.
class_counts = df['Attack Type'].value_counts()
print(f"Shape: {df.shape}")
print("\nDistribution des classes:")
print(class_counts)

## 🧹 Nettoyage Rapide

In [None]:
# Turn +/-inf into NaN so they are imputed together with genuine gaps.
df = df.replace([np.inf, -np.inf], np.nan)

# Impute numeric NaNs with the per-column median.
# The result is assigned back to df on purpose: calling
# `df[col].fillna(..., inplace=True)` mutates a temporary Series under pandas
# Copy-on-Write and would leave df untouched.
numeric_cols = df.select_dtypes(include=[np.number]).columns
cols_with_nan = [col for col in numeric_cols if df[col].isnull().any()]
if cols_with_nan:
    df[cols_with_nan] = df[cols_with_nan].fillna(df[cols_with_nan].median())

# Drop exact duplicate rows.
df = df.drop_duplicates()

print(f"‚úÖ Dataset nettoy√©: {df.shape}")

## 🏷️ Créer les 2 Types de Labels

In [None]:
# 1. Binary label: 0 = normal traffic, 1 = any kind of attack.
is_attack_row = df['Attack Type'].ne('Normal Traffic')
df['Binary_Label'] = is_attack_row.astype(int)

# Show absolute counts, then the same distribution as percentages.
binary_counts = df['Binary_Label'].value_counts()
binary_share = df['Binary_Label'].value_counts(normalize=True) * 100
print("Distribution binaire:")
print(binary_counts)
print("\nPourcentages:")
print(binary_share)

In [None]:
# 2. Multi-class label: DoS and DDoS are folded into one 'DoS_DDoS' class;
# every other attack type keeps its original name.
merged_names = dict.fromkeys(['DoS', 'DDoS'], 'DoS_DDoS')
df['Attack_Merged'] = df['Attack Type'].replace(merged_names)

print("Distribution apr√®s fusion:")
print(df['Attack_Merged'].value_counts())

In [None]:
# Encode the multi-class labels.
# The encoder is fitted on attack classes only. Model 2 is trained and
# evaluated exclusively on attack rows, and XGBoost requires class ids to be
# exactly 0..n_classes-1. Fitting on every class (including 'Normal Traffic')
# would leave a hole in the ids seen by model 2 and put le.classes_ out of
# sync with the reports and confusion matrix built from it.
attack_rows = df['Attack_Merged'] != 'Normal Traffic'

le = LabelEncoder()
le.fit(df.loc[attack_rows, 'Attack_Merged'])

# Normal traffic carries the sentinel -1; those rows are filtered out by the
# binary label before anything reaches model 2.
df['Multiclass_Label'] = -1
df.loc[attack_rows, 'Multiclass_Label'] = le.transform(df.loc[attack_rows, 'Attack_Merged'])

# Persist the encoder so predictions can be decoded later.
joblib.dump(le, 'label_encoder.pkl')

print("Mapping num√©rique:")
for i, label in enumerate(le.classes_):
    count = (df['Multiclass_Label'] == i).sum()
    print(f"{i}: {label:20s} {count:8,} instances")

## 📊 Visualisation de la Distribution

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Binary distribution.
# value_counts() orders bars by frequency, so the order is pinned to [0, 1]
# explicitly: bar 0 is always "Normal" (green) and bar 1 always "Attaque"
# (red), whichever class happens to be the majority.
binary_counts = df['Binary_Label'].value_counts().reindex([0, 1], fill_value=0)
binary_counts.plot(kind='bar', ax=axes[0], color=['green', 'red'])
axes[0].set_title('Distribution Binaire', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Label (0=Normal, 1=Attaque)')
axes[0].set_ylabel('Nombre d\'Instances')
axes[0].set_xticklabels(['Normal', 'Attaque'], rotation=0)

# Multi-class distribution (labels come from the index, so frequency order
# is fine here).
df['Attack_Merged'].value_counts().plot(kind='bar', ax=axes[1], color='skyblue')
axes[1].set_title('Distribution Multi-Classes', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Type d\'Attaque')
axes[1].set_ylabel('Nombre d\'Instances')
axes[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('distribution_labels.png', dpi=300, bbox_inches='tight')
plt.show()

## 🎯 Préparation des Features

In [None]:
# Split the frame into the feature matrix and the two target vectors.
# Every column that is not one of the label columns is treated as a feature.
label_cols = ['Attack Type', 'Binary_Label', 'Attack_Merged', 'Multiclass_Label']
is_feature = ~df.columns.isin(label_cols)
feature_cols = df.columns[is_feature].tolist()

X = df[feature_cols]
y_binary = df['Binary_Label']
y_multiclass = df['Multiclass_Label']

for caption, part in (("Features", X),
                      ("Binary labels", y_binary),
                      ("Multiclass labels", y_multiclass)):
    print(f"{caption}: {part.shape}")

## 🔀 Split Train/Test

In [None]:
# 80/20 split, stratified on the binary label so both parts keep the same
# normal/attack ratio.
split_options = dict(test_size=0.2, random_state=42, stratify=y_binary)
X_train, X_test, y_binary_train, y_binary_test = train_test_split(
    X, y_binary, **split_options
)

# The split keeps the original row index, so the matching multi-class labels
# can be looked up by index.
y_multi_train, y_multi_test = (
    y_multiclass.loc[part.index] for part in (X_train, X_test)
)

print(f"Train: {X_train.shape}")
print(f"Test: {X_test.shape}")
print("\nDistribution train (binaire):")
print(y_binary_train.value_counts())

## 📏 Normalisation

In [None]:
# Standardise the features. The scaler learns its statistics from the
# training split only and is then applied unchanged to the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler for inference.
joblib.dump(scaler, 'scaler.pkl')

train_mean = X_train_scaled.mean()
train_std = X_train_scaled.std()
print("‚úÖ Features normalis√©es")
print(f"Mean: {train_mean:.6f}")
print(f"Std: {train_std:.6f}")

## 🎯 MODÈLE 1 : Détection Binaire

**Objectif** : Détecter si le trafic est normal ou une attaque

In [None]:
# Class-balancing weight: number of normal rows divided by number of attack
# rows in the training split.
n_normal = int((y_binary_train == 0).sum())
n_attack = int((y_binary_train == 1).sum())
scale_pos_weight = n_normal / n_attack
print(f"Scale pos weight: {scale_pos_weight:.2f}")

In [None]:
# Train the XGBoost binary detector (model 1).
print("üöÄ Entra√Ænement du Mod√®le 1 (D√©tection Binaire)...")

binary_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'learning_rate': 0.1,
    'scale_pos_weight': scale_pos_weight,  # class weight computed above
    'random_state': 42,
    'n_jobs': -1,
}
model1 = XGBClassifier(**binary_params)

model1.fit(X_train_scaled, y_binary_train)
print("‚úÖ Mod√®le 1 entra√Æn√©")

In [None]:
# Score the test split: column 1 of predict_proba is the attack probability,
# predict gives the hard 0/1 decision.
y_binary_proba = model1.predict_proba(X_test_scaled)[:, 1]
y_binary_pred = model1.predict(X_test_scaled)

# Per-class precision / recall / F1.
banner = "=" * 70
binary_report = classification_report(
    y_binary_test, y_binary_pred, target_names=['Normal', 'Attaque']
)
print(banner)
print("MOD√àLE 1 : D√âTECTION BINAIRE")
print(banner)
print(binary_report)

# Area under the ROC curve, computed from the attack probabilities.
auc = roc_auc_score(y_binary_test, y_binary_proba)
print(f"\nAUC-ROC: {auc:.4f}")

In [None]:
# Confusion matrix of model 1 (rows: true class, columns: predicted class).
cm = confusion_matrix(y_binary_test, y_binary_pred)
binary_names = ['Normal', 'Attaque']

fig_cm, ax_cm = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=binary_names,
    yticklabels=binary_names,
    ax=ax_cm,
)
ax_cm.set_title('Matrice de Confusion - Mod√®le 1 (Binaire)', fontsize=14, fontweight='bold')
ax_cm.set_ylabel('Vraie Classe')
ax_cm.set_xlabel('Classe Pr√©dite')
fig_cm.tight_layout()
fig_cm.savefig('confusion_matrix_binary.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Persist model 1 to disk.
binary_model_file = 'model1_binary.pkl'
joblib.dump(model1, binary_model_file)
print("‚úÖ Mod√®le 1 sauvegard√©: model1_binary.pkl")

## 🎯 MODÈLE 2 : Classification Multi-Classes

**Objectif** : Identifier le type d'attaque exact

**Stratégie** : Entraîner seulement sur les attaques détectées

In [None]:
# Keep only the training rows whose true binary label is "attack";
# model 2 never sees normal traffic.
attack_mask_train = y_binary_train.eq(1)
X_train_attacks = X_train_scaled[attack_mask_train]
y_multi_train_attacks = y_multi_train.loc[attack_mask_train]

print(f"Donn√©es d'entra√Ænement (attaques seulement): {X_train_attacks.shape}")
print("\nDistribution:")
print(y_multi_train_attacks.value_counts())

In [None]:
# Oversample the minority attack classes with SMOTE (training data only).
print("üîÑ Application de SMOTE...")

smote = SMOTE(k_neighbors=5, random_state=42)
resampled = smote.fit_resample(X_train_attacks, y_multi_train_attacks)
X_train_resampled, y_multi_resampled = resampled

print(f"‚úÖ Apr√®s SMOTE: {X_train_resampled.shape}")
print("\nDistribution apr√®s SMOTE:")
print(pd.Series(y_multi_resampled).value_counts())

In [None]:
# Train the XGBoost attack-type classifier (model 2) on the resampled
# attack-only data.
print("üöÄ Entra√Ænement du Mod√®le 2 (Multi-Classes)...")

multiclass_params = {
    'n_estimators': 300,
    'max_depth': 12,
    'learning_rate': 0.1,
    'random_state': 42,
    'n_jobs': -1,
}
model2 = XGBClassifier(**multiclass_params)

model2.fit(X_train_resampled, y_multi_resampled)
print("‚úÖ Mod√®le 2 entra√Æn√©")

In [None]:
# Evaluate model 2 on the true attacks of the test split.
attack_mask_test = y_binary_test == 1
X_test_attacks = X_test_scaled[attack_mask_test]
y_multi_test_attacks = y_multi_test[attack_mask_test]

y_multi_pred = model2.predict(X_test_attacks)

# Pass the full list of encoded class ids explicitly. Without `labels`,
# classification_report infers the classes from the data and raises when
# their number differs from len(le.classes_) (e.g. a class that is absent
# from this subset). zero_division=0 keeps such classes in the report with a
# score of 0 instead of emitting warnings.
class_ids = np.arange(len(le.classes_))

print("="*70)
print("MOD√àLE 2 : CLASSIFICATION MULTI-CLASSES")
print("="*70)
print(classification_report(
    y_multi_test_attacks,
    y_multi_pred,
    labels=class_ids,
    target_names=le.classes_,
    zero_division=0,
))

In [None]:
# Multi-class confusion matrix (rows: true class, columns: predicted class).
# `labels` pins one row/column per encoder class, in encoder order, so the
# matrix always has the same size and ordering as the le.classes_ tick labels
# even when some class does not occur in this subset.
cm_multi = confusion_matrix(
    y_multi_test_attacks,
    y_multi_pred,
    labels=np.arange(len(le.classes_)),
)

plt.figure(figsize=(10, 8))
sns.heatmap(cm_multi, annot=True, fmt='d', cmap='YlOrRd',
            xticklabels=le.classes_,
            yticklabels=le.classes_)
plt.title('Matrice de Confusion - Mod√®le 2 (Multi-Classes)', fontsize=14, fontweight='bold')
plt.ylabel('Vraie Classe')
plt.xlabel('Classe Pr√©dite')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig('confusion_matrix_multiclass.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Persist model 2 to disk.
multiclass_model_file = 'model2_multiclass.pkl'
joblib.dump(model2, multiclass_model_file)
print("‚úÖ Mod√®le 2 sauvegard√©: model2_multiclass.pkl")

## 🔗 Système Hybride : Combinaison des 2 Modèles

In [None]:
class HybridIDS:
    """
    Two-stage (hybrid) intrusion detection system.

    Stage 1 runs a binary model that separates normal traffic from attacks.
    Stage 2 runs a multi-class model, only on the rows flagged by stage 1,
    to name the attack type.
    """

    # Keys of each prediction record, and column order of predict_df().
    RESULT_COLUMNS = ('type', 'confidence', 'is_attack')

    def __init__(self, binary_model, multiclass_model, scaler, label_encoder):
        """
        Store the fitted components.

        Args:
            binary_model: classifier exposing predict / predict_proba;
                a prediction of 0 means normal traffic.
            multiclass_model: classifier exposing predict / predict_proba
                that returns encoded attack-type ids.
            scaler: fitted transformer exposing transform().
            label_encoder: encoder exposing inverse_transform() to turn the
                ids of multiclass_model back into attack names.
        """
        self.binary_model = binary_model
        self.multiclass_model = multiclass_model
        self.scaler = scaler
        self.le = label_encoder

    def predict(self, X):
        """
        Run the two-stage prediction.

        Args:
            X: Features (DataFrame or array), unscaled.

        Returns:
            List with one dict per input row, holding:
              - 'type': 'Normal Traffic' or the decoded attack name
              - 'confidence': for normal rows, the binary model's probability
                of class 0; for attack rows, the highest class probability
                of the multi-class model
              - 'is_attack': bool
        """
        # Scale
        X_scaled = self.scaler.transform(X)

        # Stage 1: binary detection
        is_attack = np.asarray(self.binary_model.predict(X_scaled))
        binary_proba = np.asarray(self.binary_model.predict_proba(X_scaled))
        attack_mask = is_attack != 0

        # Stage 2: classify every flagged row in ONE batched call instead of
        # two model calls per row; skipped entirely when nothing was flagged.
        attack_names = ()
        attack_confidences = ()
        if attack_mask.any():
            X_attacks = X_scaled[attack_mask]
            encoded = self.multiclass_model.predict(X_attacks)
            attack_proba = np.asarray(self.multiclass_model.predict_proba(X_attacks))
            attack_names = self.le.inverse_transform(encoded)
            attack_confidences = attack_proba.max(axis=1)

        # Stitch both stages back together in the original row order.
        results = []
        next_attack = 0
        for flagged, proba in zip(attack_mask, binary_proba):
            if not flagged:
                results.append({
                    'type': 'Normal Traffic',
                    'confidence': float(proba[0]),
                    'is_attack': False
                })
            else:
                results.append({
                    'type': attack_names[next_attack],
                    'confidence': float(attack_confidences[next_attack]),
                    'is_attack': True
                })
                next_attack += 1

        return results

    def predict_df(self, X):
        """
        Same as predict(), returned as a DataFrame with one row per input row.

        The columns are always RESULT_COLUMNS, including for empty input.
        """
        results = self.predict(X)
        return pd.DataFrame(results, columns=list(self.RESULT_COLUMNS))

print("‚úÖ Classe HybridIDS cr√©√©e")

In [None]:
# Assemble the hybrid system from the fitted components
# (binary model, multi-class model, scaler, label encoder).
hybrid_ids = HybridIDS(model1, model2, scaler, le)

print("‚úÖ Syst√®me hybride initialis√©")

In [None]:
# Smoke-test the hybrid system on the first rows of the test split.
sample_size = 100
X_sample = X_test.iloc[:sample_size]
# Ground truth uses the merged names ('DoS_DDoS'), i.e. the vocabulary the
# multi-class labels were built from. Comparing against the raw 'Attack Type'
# column ('DoS' / 'DDoS') would count every such row as a wrong prediction.
y_true = df.loc[X_sample.index, 'Attack_Merged']

# Hybrid predictions
predictions = hybrid_ids.predict_df(X_sample)

# Side-by-side comparison with the ground truth
comparison = pd.DataFrame({
    'True_Label': y_true.values,
    'Predicted_Label': predictions['type'].values,
    'Confidence': predictions['confidence'].values,
    'Is_Attack': predictions['is_attack'].values
})

print("√âchantillon de pr√©dictions:")
print(comparison.head(20))

In [None]:
# Accuracy of the hybrid system on the sample: share of rows whose predicted
# label equals the true label, in percent.
matches = comparison['True_Label'] == comparison['Predicted_Label']
correct = matches.sum()
accuracy = correct / len(comparison) * 100

rule = '=' * 70
print(f"\n{rule}")
print("PERFORMANCE DU SYST√àME HYBRIDE")
print(rule)
print(f"Accuracy: {accuracy:.2f}%")
print(f"Pr√©dictions correctes: {correct}/{len(comparison)}")
print(rule)

## 💾 Sauvegarder le Système Complet

In [None]:
# Bundle every component needed for inference into one pickle file.
import pickle

hybrid_system = dict(
    binary_model=model1,
    multiclass_model=model2,
    scaler=scaler,
    label_encoder=le,
    feature_names=X.columns.tolist(),  # column names of the feature matrix
)

bundle_file = 'hybrid_ids_system.pkl'
with open(bundle_file, 'wb') as fh:
    pickle.dump(hybrid_system, fh)

print("‚úÖ Syst√®me hybride complet sauvegard√©: hybrid_ids_system.pkl")

## 📤 Export vers Google Drive

In [None]:
# üíæ Sauvegarder sur Google Drive
import os
import shutil

def save_to_drive(source_file, destination_folder='NetGuardian_Models/Hybrid'):
    """Copy a local file into a Google Drive folder when running on Colab.

    Args:
        source_file: Path of the local file to copy.
        destination_folder: Folder below ``MyDrive`` that receives the copy;
            it is created if it does not exist.

    Returns:
        None. The outcome is reported with print(): success, missing source
        file, not running on Colab (google.colab cannot be imported), or any
        other error. No exception propagates to the caller.
    """
    try:
        from google.colab import drive

        # Mount Drive only if the mount point is not there yet.
        mount_point = '/content/drive'
        if not os.path.exists(mount_point):
            drive.mount(mount_point)

        drive_path = f"/content/drive/MyDrive/{destination_folder}"
        os.makedirs(drive_path, exist_ok=True)

        if not os.path.exists(source_file):
            print(f"‚ö†Ô∏è Fichier introuvable: {source_file}")
        else:
            # Keep the file name, drop any local directory part.
            dest = os.path.join(drive_path, os.path.basename(source_file))
            shutil.copy2(source_file, dest)
            print(f"‚úÖ Sauvegard√© sur Drive: {dest}")

    except ImportError:
        print("‚ö†Ô∏è Pas sur Colab. Sauvegarde locale conserv√©e.")
    except Exception as e:
        print(f"‚ùå Erreur: {e}")

# Export the key artefacts, one save_to_drive call per file.
files_to_export = (
    'hybrid_ids_system.pkl',
    'scaler.pkl',
    'label_encoder.pkl',
    'model1_binary.pkl',
    'model2_multiclass.pkl',
)
for artefact in files_to_export:
    save_to_drive(artefact)

print("\nüöÄ Export termin√© !")

## 📊 Résumé Final

In [None]:
# Final recap: dataset size, the two models and the files written to disk.
rule = "=" * 70
n_classes = len(le.classes_)

summary_lines = [
    rule,
    "R√âSUM√â DE L'ENTRA√éNEMENT HYBRIDE",
    rule,
    "\nüìä Dataset:",
    f"   - Total instances: {len(df):,}",
    f"   - Features: {X.shape[1]}",
    f"   - Classes: {n_classes}",
    "\nüéØ Mod√®le 1 (Binaire):",
    "   - Type: XGBoost",
    "   - Classes: Normal vs Attaque",
    "   - Fichier: model1_binary.pkl",
    "\nüéØ Mod√®le 2 (Multi-Classes):",
    "   - Type: XGBoost + SMOTE",
    f"   - Classes: {n_classes}",
    "   - Fichier: model2_multiclass.pkl",
    "\nüíæ Fichiers g√©n√©r√©s:",
    "   - model1_binary.pkl",
    "   - model2_multiclass.pkl",
    "   - scaler.pkl",
    "   - label_encoder.pkl",
    "   - hybrid_ids_system.pkl",
    "\n‚úÖ Syst√®me hybride pr√™t pour d√©ploiement!",
    rule,
]
for line in summary_lines:
    print(line)

## 📥 Prochaines Étapes

1. ✅ Système hybride entraîné et testé
2. ✅ Modèles sauvegardés sur Drive
3. ⏳ Intégrer dans NetGuardian-AI dashboard
4. ⏳ Tester en temps réel

---

**Félicitations ! Votre IDS hybride est prêt ! 🎉**