# 🔍 CICIDS2017 - Data Exploration & Cleaning

**Objectif** : Explorer et nettoyer le dataset CICIDS2017 sur Kaggle

**Dataset** : https://www.kaggle.com/datasets/cicdataset/cicids2017

---

## 📦 Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings — confirm that is intended.
warnings.filterwarnings('ignore')

# Plot configuration: the matplotlib style is applied first, then the seaborn palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')


## 📊 Charger le Dataset

Commençons par un seul fichier pour l'exploration.

In [None]:
# Load a single capture file (Monday; per the original note, normal traffic only)
df = pd.read_csv(
    '/kaggle/input/cicids2017/Monday-WorkingHours.pcap_ISCX.csv'
)

# Report the frame dimensions, then its deep memory footprint converted to MB
print(f"Shape: {df.shape}")
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"Taille en m√©moire: {memory_mb:.2f} MB")

## 🔍 Exploration Initiale

In [None]:
# First rows of the frame (DataFrame.head() with its default row count)
df.head()

In [None]:
# General information about the frame, as printed by DataFrame.info()
df.info()

In [None]:
# Descriptive statistics, as returned by DataFrame.describe()
df.describe()

In [None]:
# Label distribution: absolute counts first, then the percentage share of each label
label_column = df[' Label']
print("Distribution des labels:")
print(label_column.value_counts())
print("\nPourcentages:")
print(label_column.value_counts(normalize=True) * 100)

## 📈 Visualisations

In [None]:
# Bar chart of the label counts, written to a PNG file and then displayed
fig, ax = plt.subplots(figsize=(14, 6))
label_counts = df[' Label'].value_counts()
label_counts.plot(kind='bar', color='skyblue', edgecolor='black', ax=ax)

# Titles and axis labels
ax.set_title('Distribution des Labels - CICIDS2017', fontsize=16, fontweight='bold')
ax.set_xlabel("Type d'Attaque", fontsize=12)
ax.set_ylabel("Nombre d'Instances", fontsize=12)

# Tilt the category names and keep only a faint horizontal grid styling
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(axis='y', alpha=0.3)

fig.tight_layout()
fig.savefig('label_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

## 🧹 Détection des Problèmes

In [None]:
# 1. Missing values: NaN count per column, followed by the overall total
banner = "=" * 50
print(banner)
print("1. VALEURS MANQUANTES (NaN)")
print(banner)

nan_counts = df.isnull().sum()
total_nan = nan_counts.sum()
if total_nan == 0:
    print("‚úÖ Aucune valeur NaN d√©tect√©e")
else:
    # Only list the columns that actually contain NaN
    print(nan_counts[nan_counts > 0])

print(f"\nTotal NaN: {total_nan}")

In [None]:
# 2. Infinite values: count them per numeric column and list the affected columns
print("=" * 50)
print("2. VALEURS INFINIES")
print("=" * 50)

numeric_cols = df.select_dtypes(include=[np.number]).columns

# One vectorised pass over the numeric columns, keeping only non-zero counts
inf_per_column = np.isinf(df[numeric_cols]).sum()
inf_counts = inf_per_column[inf_per_column > 0].to_dict()

if not inf_counts:
    print("‚úÖ Aucune valeur infinie d√©tect√©e")
else:
    # Largest count first
    ranked = sorted(inf_counts.items(), key=lambda item: item[1], reverse=True)
    for col, count in ranked:
        print(f"{col}: {count}")

print(f"\nTotal colonnes avec infinis: {len(inf_counts)}")

In [None]:
# 3. Duplicated rows: absolute number and share of the whole frame
banner = "=" * 50
print(banner)
print("3. DUPLICATIONS")
print(banner)

duplicates = df.duplicated().sum()
print(f"Nombre de lignes dupliqu√©es: {duplicates}")
duplicate_share = duplicates / len(df) * 100
print(f"Pourcentage: {duplicate_share:.2f}%")

In [None]:
# 4. Column names: total count, then every name with a leading or trailing space
banner = "=" * 50
print(banner)
print("4. NOMS DE COLONNES")
print(banner)
print(f"Nombre de colonnes: {len(df.columns)}")
print("\nColonnes avec espaces:")

# A name is padded when stripping spaces from both ends changes it
padded_names = [name for name in df.columns if name != name.strip(' ')]
for name in padded_names:
    print(f"  - '{name}'")

## 🛠️ Fonction de Nettoyage

In [None]:
def clean_cicids2017(df):
    """
    Clean one CICIDS2017 DataFrame and return the cleaned result.

    Steps applied:
      1. strip surrounding whitespace from the column names and replace the
         remaining spaces with underscores;
      2. drop fully duplicated rows;
      3. replace +inf / -inf with NaN;
      4. fill NaN in numeric columns with the median of that column;
      5. set negative values to 0 in the columns listed in ``positive_cols``;
      6. drop the columns listed in ``cols_to_drop`` when they are present.

    The DataFrame passed in is not modified (neither its values nor its
    column labels). Progress messages are printed along the way.

    Args:
        df: pandas DataFrame to clean.

    Returns:
        A new, cleaned pandas DataFrame.
    """
    print("üßπ Nettoyage en cours...")
    print(f"Shape initiale: {df.shape}")

    initial_rows = len(df)

    # drop_duplicates() returns a new object, so every later step works on
    # that object and the caller's frame is never touched. Row duplication
    # does not depend on the column labels, so running it before the rename
    # selects exactly the same rows.
    df = df.drop_duplicates()
    duplicates_removed = initial_rows - len(df)

    # 1. Clean the column names (on the new frame only)
    df.columns = df.columns.str.strip().str.replace(' ', '_')
    print("‚úÖ Noms de colonnes nettoy√©s")

    # 2. Report the duplicates removed above
    print(f"‚úÖ Duplications supprim√©es: {duplicates_removed}")

    # 3. Turn infinite values into NaN so they are handled by the NaN step
    df = df.replace([np.inf, -np.inf], np.nan)
    print("‚úÖ Valeurs infinies remplac√©es par NaN")

    # 4. Fill NaN in numeric columns with the column median
    nan_before = df.isnull().sum().sum()

    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if df[col].isnull().any():
            # Assign the filled column back: an in-place fillna on the
            # intermediate df[col] object is a chained assignment, which is
            # deprecated and has no effect under pandas Copy-on-Write.
            df[col] = df[col].fillna(df[col].median())

    nan_after = df.isnull().sum().sum()
    print(f"‚úÖ NaN trait√©s: {nan_before} ‚Üí {nan_after}")

    # 5. Negative values are treated as invalid in these columns: set them to 0
    positive_cols = ['Flow_Duration', 'Total_Fwd_Packets', 'Total_Backward_Packets']
    for col in positive_cols:
        if col not in df.columns:
            continue
        negative_mask = df[col] < 0
        negative_count = negative_mask.sum()
        if negative_count > 0:
            df.loc[negative_mask, col] = 0
            print(f"‚úÖ {col}: {negative_count} valeurs n√©gatives corrig√©es")

    # 6. Drop the columns considered irrelevant for ML, when they exist
    cols_to_drop = ['Flow_ID', 'Source_IP', 'Destination_IP', 'Timestamp']
    existing_cols_to_drop = [col for col in cols_to_drop if col in df.columns]
    if existing_cols_to_drop:
        df = df.drop(columns=existing_cols_to_drop)
        print(f"‚úÖ Colonnes supprim√©es: {existing_cols_to_drop}")

    print(f"\nüéâ Nettoyage termin√©!")
    print(f"Shape finale: {df.shape}")

    return df

## 🧪 Appliquer le Nettoyage

In [None]:
# Clean the dataset; the function receives a copy, so `df` itself stays as loaded
df_clean = clean_cicids2017(df.copy())

In [None]:
# Post-cleaning sanity check: shape, remaining NaN, remaining infinities, duplicated rows
print("V√©rification post-nettoyage:")
print(f"Shape: {df_clean.shape}")

remaining_nan = df_clean.isnull().sum().sum()
print(f"NaN: {remaining_nan}")

# Infinite values are only looked for in the numeric columns
numeric_part = df_clean.select_dtypes(include=[np.number])
remaining_inf = np.isinf(numeric_part).sum().sum()
print(f"Infinis: {remaining_inf}")

remaining_duplicates = df_clean.duplicated().sum()
print(f"Duplications: {remaining_duplicates}")

## 🏷️ Mapping des Labels

In [None]:
# Mapping from the raw CICIDS2017 label to a coarser attack category.
# NOTE(review): the three "Web Attack" keys contain a non-ASCII dash
# sequence; verify that they match the label values actually present in the
# CSV files, otherwise those rows end up without a category.
ATTACK_CATEGORIES = {
    'BENIGN': 'Normal',
    'FTP-Patator': 'Brute_Force',
    'SSH-Patator': 'Brute_Force',
    'Web Attack ‚Äì Brute Force': 'Brute_Force',
    'DoS slowloris': 'DoS_DDoS',
    'DoS Slowhttptest': 'DoS_DDoS',
    'DoS Hulk': 'DoS_DDoS',
    'DoS GoldenEye': 'DoS_DDoS',
    'DDoS': 'DoS_DDoS',
    'Web Attack ‚Äì XSS': 'Web_Attack',
    'Web Attack ‚Äì SQL Injection': 'Web_Attack',
    'PortScan': 'Reconnaissance',
    'Bot': 'Botnet',
    'Infiltration': 'Advanced_Threat',
    'Heartbleed': 'Advanced_Threat'
}

# Apply the mapping (labels missing from the dictionary become NaN)
df_clean['Attack_Category'] = df_clean['Label'].map(ATTACK_CATEGORIES)

# Series.map() does not fail on unknown keys, so surface them explicitly
# instead of letting NaN categories slip through unnoticed.
unmapped_labels = df_clean.loc[df_clean['Attack_Category'].isnull(), 'Label'].value_counts()
if not unmapped_labels.empty:
    print("Labels absents du mapping (Attack_Category = NaN):")
    print(unmapped_labels)

# Check the resulting distribution
print("Distribution des cat√©gories d'attaques:")
print(df_clean['Attack_Category'].value_counts())

In [None]:
# Turn the attack categories into integer codes
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
category_column = df_clean['Attack_Category']
df_clean['Label_Encoded'] = le.fit_transform(category_column)

# Show which integer code stands for which category
print("Mapping num√©rique:")
for code, category in enumerate(le.classes_):
    print(f"{code}: {category}")

## 💾 Sauvegarder le Dataset Nettoyé

In [None]:
# Write the cleaned Monday frame to CSV (without the index), then confirm name and shape
output_file = 'cicids2017_monday_cleaned.csv'
df_clean.to_csv(path_or_buf=output_file, index=False)
print(
    f"‚úÖ Dataset sauvegard√©: {output_file}",
    f"Taille: {df_clean.shape}",
    sep="\n",
)

## 📊 Traiter Tous les Fichiers (Optionnel)

⚠️ **Attention** : Cela peut prendre beaucoup de temps et de mémoire !

In [None]:
# List of all the capture files of the dataset
files = [
    'Monday-WorkingHours.pcap_ISCX.csv',
    'Tuesday-WorkingHours.pcap_ISCX.csv',
    'Wednesday-workingHours.pcap_ISCX.csv',
    'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
    'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
    'Friday-WorkingHours-Morning.pcap_ISCX.csv',
    'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
    'Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv'
]

# Process every file. A failure on one file is reported and the loop moves
# on to the next one; failed files are remembered for the summary below.
all_dfs = []
failed_files = []

for file in files:
    print(f"\n{'='*60}")
    print(f"Traitement de: {file}")
    print(f"{'='*60}")

    try:
        # Load
        df_temp = pd.read_csv(f'/kaggle/input/cicids2017/{file}')

        # Clean
        df_temp_clean = clean_cicids2017(df_temp)

        # Map the labels to attack categories
        df_temp_clean['Attack_Category'] = df_temp_clean['Label'].map(ATTACK_CATEGORIES)

    except Exception as e:
        # Best effort: keep going, but record the failure so it is not lost
        failed_files.append(file)
        print(f"‚ùå Erreur avec {file}: {e}")

    else:
        # Only fully processed files are added to the list
        all_dfs.append(df_temp_clean)
        print(f"‚úÖ {file} trait√© avec succ√®s")

# Make skipped files impossible to miss: the combined dataset is partial
if failed_files:
    print(f"\nFichiers en erreur ({len(failed_files)}/{len(files)}): {failed_files}")
    print("Le dataset final sera incomplet.")

# pd.concat() raises an opaque ValueError on an empty list; fail with a
# message that names the actual problem instead.
if not all_dfs:
    raise ValueError("Aucun fichier disponible: impossible de construire le dataset final")

# Combine all the DataFrames
print(f"\n{'='*60}")
print("Combinaison de tous les fichiers...")
print(f"{'='*60}")

df_final = pd.concat(all_dfs, ignore_index=True)

# Series.map() turns labels missing from ATTACK_CATEGORIES into NaN without
# any error, so list them explicitly when there are any.
unmapped_labels = df_final.loc[df_final['Attack_Category'].isnull(), 'Label'].value_counts()
if not unmapped_labels.empty:
    print("\nLabels absents du mapping (Attack_Category = NaN):")
    print(unmapped_labels)

print(f"\nüéâ Dataset final combin√©!")
print(f"Shape: {df_final.shape}")
print(f"\nDistribution finale des labels:")
print(df_final['Attack_Category'].value_counts())

In [None]:
# Encode the categories of the combined dataset as integers
le_final = LabelEncoder()
final_categories = df_final['Attack_Category']
df_final['Label_Encoded'] = le_final.fit_transform(final_categories)

# Write the complete dataset to CSV (without the index) and confirm
full_output_file = 'cicids2017_full_cleaned.csv'
df_final.to_csv(full_output_file, index=False)
print(f"‚úÖ Dataset complet sauvegard√©: {full_output_file}")

## 📥 Prochaines Étapes

1. ✅ Dataset exploré et nettoyé
2. ✅ Labels mappés en catégories
3. ✅ Fichier CSV sauvegardé

**À faire ensuite :**
- Télécharger le fichier depuis Kaggle Output
- Placer dans `data/processed/` de votre projet local
- Commit sur Git
- Passer au Feature Engineering et Training !

---

**Bon courage ! 🚀**