In [47]:
import pandas as pd
import numpy as np
import os
from glob import glob
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib


In [None]:
folder = 'CIC-IDS-2017'
csv_files = glob(os.path.join(folder, '*.csv'))
dfs = []

for path in csv_files:
    print(f"‚úÖ Reading: {os.path.basename(path)}")
    try:
        df = pd.read_csv(path, low_memory=False)
        df.columns = df.columns.str.strip()
        df = df.replace([np.inf, -np.inf], np.nan).dropna()
        if 'Label' in df.columns:
            dfs.append(df)
    except Exception as e:
        print(f"‚ùå Skipped {path}: {e}")

# Loading the Web Attack file manually and appending it
web_path = 'CIC-IDS-2017/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv'
df_web = pd.read_csv(web_path, low_memory=False)
df_web.columns = df_web.columns.str.strip()
df_web = df_web.replace([np.inf, -np.inf], np.nan).dropna()
df_web['Label'] = df_web['Label'].astype(str).str.strip().str.replace('‚Äì', '-')  # fix dashes

dfs.append(df_web)

# Combineing everything
df = pd.concat(dfs, ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"üì¶ Combined dataset shape: {df.shape}")


‚úÖ Reading: Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
‚úÖ Reading: Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
‚úÖ Reading: Friday-WorkingHours-Morning.pcap_ISCX.csv
‚úÖ Reading: Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
‚úÖ Reading: Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
üì¶ Combined dataset shape: (1331575, 79)


In [None]:
def normalize_label(x):
    x = str(x).strip().replace('‚Äì', '-').replace('  ', ' ')
    ux = x.upper()

    if 'BENIGN' in ux:
        return 'BENIGN'
    if 'DDOS' in ux:
        return 'DDoS'
    if 'PORTSCAN' in ux:
        return 'PortScan'
    if 'INFILT' in ux:
        return 'Infiltration'
    if 'WEB ATTACK' in ux:
        return x  # preserve specific Web Attack type
    return x

df['Label'] = df['Label'].apply(normalize_label)

#labels
allowed_labels = [
    'BENIGN', 'DDoS', 'PortScan', 'Infiltration',
    'Web Attack - Brute Force',
    'Web Attack - Sql Injection',
    'Web Attack - XSS'
]

df = df[df['Label'].isin(allowed_labels)].copy()

print("üìä Final class distribution:")
print(df['Label'].value_counts())


üìä Final class distribution:
Label
BENIGN                        1038394
PortScan                       158804
DDoS                           128025
Web Attack - Brute Force         3014
Web Attack - XSS                 1304
Web Attack - Sql Injection         42
Infiltration                       36
Name: count, dtype: int64


In [None]:
encoder = LabelEncoder()
df['Label_enc'] = encoder.fit_transform(df['Label'])

# Save labels
joblib.dump(encoder, 'label_encoder.pkl')

# checks lables
print("üî§ Label encoding map:")
for i, cls in enumerate(encoder.classes_):
    print(f"{i} ‚Üí {cls}")


üî§ Label encoding map:
0 ‚Üí BENIGN
1 ‚Üí DDoS
2 ‚Üí Infiltration
3 ‚Üí PortScan
4 ‚Üí Web Attack - Brute Force
5 ‚Üí Web Attack - Sql Injection
6 ‚Üí Web Attack - XSS


In [42]:
features = [
    'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets',
    'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
    'Fwd Packet Length Max', 'Bwd Packet Length Max',
    'Fwd Packets/s', 'Bwd Packets/s',
    'Flow IAT Mean', 'Fwd IAT Mean', 'Bwd IAT Mean'
]

df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=features + ['Label_enc'])

X = df[features]
y = df['Label_enc']

print(f"‚úÖ Final dataset shape: {X.shape[0]} rows")


‚úÖ Final dataset shape: 1329619 rows


In [43]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

joblib.dump(scaler, 'scaler.pkl')

print(f"üìä Train: {len(X_train)} | Test: {len(X_test)}")


üìä Train: 1063695 | Test: 265924


In [44]:
model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

joblib.dump(model, 'rf_model.pkl')
print("‚úÖ Model saved as rf_model.pkl")


‚úÖ Model saved as rf_model.pkl


In [45]:
y_pred = model.predict(X_test)

print("üìâ Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nüìä Classification Report:")
print(classification_report(y_test, y_pred, target_names=encoder.classes_))


üìâ Confusion Matrix:
[[203054     26      7    197   3610    219    566]
 [    41  25564      0      0      0      0      0]
 [     4      0      3      0      0      0      0]
 [     4      0      0  31756      1      0      0]
 [    35      0      0      0    401      1    166]
 [     0      0      0      0      4      4      0]
 [    13      0      0      0      6      0    242]]

üìä Classification Report:
                            precision    recall  f1-score   support

                    BENIGN       1.00      0.98      0.99    207679
                      DDoS       1.00      1.00      1.00     25605
              Infiltration       0.30      0.43      0.35         7
                  PortScan       0.99      1.00      1.00     31761
  Web Attack - Brute Force       0.10      0.67      0.17       603
Web Attack - Sql Injection       0.02      0.50      0.03         8
          Web Attack - XSS       0.25      0.93      0.39       261

                  accuracy           