In [2]:
# Imports
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load and combine datasets
# One CSV per weekday file; each frame keeps its own name so the individual
# days stay inspectable after the merge.
day_files = [
    'tuesday_plus_cleaned.csv',
    'wednesday_plus_cleaned.csv',
    'thursday_plus_cleaned.csv',
    'friday_plus_cleaned.csv',
]
tuesday, wednesday, thursday, friday = (pd.read_csv(path) for path in day_files)

# Stack the four frames row-wise and renumber the index from 0.
df = pd.concat([tuesday, wednesday, thursday, friday], ignore_index=True)

print(f"Combined dataset shape: {df.shape}")
print("Label distribution:\n", df['Label'].value_counts())

# Step 2: Prepare features and target
# Identifier-style columns (e.g. 'Flow ID', 'Timestamp') are removed when they
# exist in the frame; names that are not present are simply skipped.
leak_columns = ['Flow ID', 'Timestamp', 'Source IP', 'Destination IP']  # adjust if needed
present_leak_columns = [col for col in leak_columns if col in df.columns]
df = df.drop(columns=present_leak_columns)

# Every remaining column except 'Label' is a feature; 'Label' is the target.
X = df.drop(columns=['Label'])
y = df['Label']

# Step 3: Encode target labels (multi-class)
# Fit first, then transform: `le` retains the label <-> integer mapping that the
# evaluation steps use for readable class names.
le = LabelEncoder().fit(y)
y_enc = le.transform(y)

print(f"Encoded classes: {list(le.classes_)}")

# Step 4: Train-test split with stratification
# 30% of the rows are held out; stratifying on the encoded labels keeps the
# class proportions the same in both partitions, and the seed fixes the shuffle.
split_options = dict(test_size=0.3, random_state=42, stratify=y_enc)
X_train, X_test, y_train, y_test = train_test_split(X, y_enc, **split_options)

print("Original training set shape:", Counter(y_train))

# Step 5: Handle imbalance with SMOTE on training data only
# (Counter and SMOTE are already imported at the top of the cell.)
train_class_counts = Counter(y_train)
print("Training set label distribution:", train_class_counts)

# Find smallest class count in training
min_class_count = min(train_class_counts.values())
print(f"Smallest class count in training: {min_class_count}")

# SMOTE synthesises points between a sample and its k nearest same-class
# neighbours, so a class must hold at least k_neighbors + 1 samples. Derive k
# from the rarest class, capped at the library default of 5, instead of
# hard-coding it. The resample runs exactly once: previously a second SMOTE with
# the default k overwrote the tuned result and doubled the (very expensive) cost.
if min_class_count < 2:
    raise ValueError(
        "SMOTE needs at least 2 training samples per class; "
        f"the smallest class has {min_class_count}"
    )
smote_k_neighbors = min(5, min_class_count - 1)
print(f"SMOTE k_neighbors: {smote_k_neighbors}")

smote = SMOTE(random_state=42, k_neighbors=smote_k_neighbors)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Resampled training set shape:", Counter(y_train_res))

# Step 6: Train Random Forest classifier
# The SMOTE-balanced training set is much larger than the original one, so the
# 100 trees are built on all available cores (n_jobs=-1) rather than a single
# one. The seed is still fixed through random_state.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train_res, y_train_res)

# Step 7: Predict on test set
y_pred = rf.predict(X_test)

# Step 8: Evaluation
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")

# LabelEncoder maps the classes to 0..n_classes-1. Passing that full id range as
# `labels` keeps the report rows aligned with `target_names` even if a rare
# class happens to be absent from both y_test and y_pred; without it the number
# of inferred labels can differ from len(le.classes_).
class_ids = list(range(len(le.classes_)))
print(classification_report(y_test, y_pred, labels=class_ids, target_names=le.classes_))

# Confusion matrix plot
# Request one row/column per encoded class (0..n_classes-1) so the matrix always
# has the same size and order as the le.classes_ tick labels, even when a class
# never appears in y_test or y_pred.
cm_labels = list(range(len(le.classes_)))
cm = confusion_matrix(y_test, y_pred, labels=cm_labels)

# Draw on an explicit Axes so the labels and title are attached to this figure.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    cmap='Blues',
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - Multi-class Classification')
plt.show()

# Step 9: Feature importance plot
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]  # feature positions, highest importance first
features = X.columns

# Keep only the 20 highest-ranked features for the chart.
top_idx = indices[:20]
top_scores = importances[top_idx]
top_names = features[top_idx]

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=top_scores, y=top_names, palette='viridis', ax=ax)
ax.set_title('Top 20 Feature Importances')
ax.set_xlabel('Importance Score')
ax.set_ylabel('Feature')
fig.tight_layout()
plt.show()



Combined dataset shape: (1720991, 96)
Label distribution:
 Label
Benign                                    1206738
Portscan                                   159059
DoS Hulk                                   158468
DDoS                                        95144
Infiltration - Portscan                     68620
DoS GoldenEye                                7567
Botnet - Attempted                           4064
FTP-Patator                                  3972
DoS Slowloris                                3859
DoS Slowhttptest - Attempted                 3368
SSH-Patator                                  2961
DoS Slowloris - Attempted                    1847
DoS Slowhttptest                             1740
Web Attack - Brute Force - Attempted         1292
Botnet                                        736
Web Attack - XSS - Attempted                  655
DoS Hulk - Attempted                          581
DoS GoldenEye - Attempted                      80
Web Attack - Brute Force           

: 