In [None]:
from google.colab import files

uploaded = files.upload()  # Manually upload your CSV file


Saving processed_data.csv to processed_data.csv


In [None]:
!pip install xgboost





Training the model

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

# 📌 Load dataset
columns = ["Timestamp", "Src_IP", "Dst_IP", "Src_Port", "Dst_Port", "Protocol",
           "Packet_Size", "TTL", "Flags", "Attack_Type", "Extra1", "Extra2"]
data = pd.read_csv("processed_data.csv", names=columns)

# 🚀 Step 1: Drop unnecessary columns (IP addresses & extra info)
data.drop(["Src_IP", "Dst_IP", "Extra1", "Extra2"], axis=1, inplace=True)

# 🚀 Step 2: Convert numeric columns
numeric_columns = ["Timestamp", "Packet_Size", "TTL"]
for col in numeric_columns:
    data[col] = pd.to_numeric(data[col], errors="coerce")  # Convert to numeric

# 🚀 Step 3: Encode categorical features
label_encoders = {}
for col in ["Flags", "Protocol", "Src_Port", "Dst_Port"]:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))  # Convert to string before encoding
    label_encoders[col] = le

# 🚀 Step 4: Encode Target Variable (Attack_Type)
attack_counts = data["Attack_Type"].value_counts()
print("\n🔥 Attack Type Distribution Before Handling Rare Classes:\n", attack_counts)

# ✅ Handle rare attack types (replace classes with <2 instances)
rare_classes = attack_counts[attack_counts < 2].index.tolist()
if rare_classes:
    print("\n⚠️ Replacing rare classes:", rare_classes)
    data["Attack_Type"] = data["Attack_Type"].replace(rare_classes, "Other Attack")

# ✅ Re-encode the modified `Attack_Type`
attack_encoder = LabelEncoder()
data["Attack_Type"] = attack_encoder.fit_transform(data["Attack_Type"])

# 🚀 Step 5: Handle NaN values (only for numeric columns)
data.fillna(data.mean(), inplace=True)

# 🚀 Step 6: Split into X (features) and y (target)
X = data.drop("Attack_Type", axis=1)
y = data["Attack_Type"]

# 🚀 Step 7: Scale numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 🚀 Step 8: Handle class imbalance with SMOTE (Fixing Rare Class Issue)
min_class_size = min(Counter(y).values())  # Smallest class count
smote_neighbors = min(5, min_class_size - 1)  # Ensure `n_neighbors <= n_samples - 1`

if min_class_size > 1:  # ✅ Apply SMOTE only if the smallest class has enough samples
    smote = SMOTE(sampling_strategy="auto", k_neighbors=smote_neighbors, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_scaled, y)
    print("\n✅ SMOTE Applied - New Class Distribution:", Counter(y_resampled))
else:
    print("\n⚠️ SMOTE Skipped - Not enough samples per class.")
    X_resampled, y_resampled = X_scaled, y  # Keep original dataset

# 🚀 Step 9: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42
)

# ✅ Step 10: Train Optimized Random Forest Model (Prevent Overfitting)
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=8,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features="sqrt",
    class_weight="balanced",
    random_state=42
)

# 🚀 Step 11: Cross-Validation to check generalization
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=cv, scoring="accuracy")
print(f"\n✅ Cross-Validation Accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

# 🚀 Step 12: Train the Model
rf_model.fit(X_train, y_train)

# 🚀 Step 13: Predict & Evaluate
y_pred = rf_model.predict(X_test)
print("\n🔥 Model Evaluation:")
print(classification_report(y_test, y_pred))

# 🚀 Step 14: Feature Importance Analysis
feature_importance = pd.DataFrame({"Feature": X.columns, "Importance": rf_model.feature_importances_})
feature_importance = feature_importance.sort_values(by="Importance", ascending=False)
print("\n🔍 Top 5 Important Features:\n", feature_importance.head(5))



🔥 Attack Type Distribution Before Handling Rare Classes:
 Attack_Type
Brute Force Attack      4986
Normal                  3890
ICMP Flood              3836
Slow Port Scan          1330
SQL Injection            483
Unusual Port Scan        167
UDP Flood                 98
Aggressive Port Scan      92
SYN Flood                 44
Attack_Type                1
Nmap SYN Scan              1
Name: count, dtype: int64

⚠️ Replacing rare classes: ['Attack_Type', 'Nmap SYN Scan']

✅ SMOTE Applied - New Class Distribution: Counter({4: 4986, 3: 4986, 6: 4986, 1: 4986, 8: 4986, 2: 4986, 9: 4986, 7: 4986, 0: 4986, 5: 4986})

✅ Cross-Validation Accuracy: 0.9359 ± 0.0096

🔥 Model Evaluation:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       997
           1       0.99      1.00      1.00       997
           2       1.00      1.00      1.00       997
           3       1.00      0.89      0.94       998
           4       1.00      1.00      1.0

🔥 Review of the Model Performance
✅ Strengths
High Overall Accuracy (93.59%)

Your model is highly accurate in detecting most attack types.

The precision and recall values for most classes are close to 1.00, meaning your model makes very few mistakes.

Balanced Class Distribution (After SMOTE)

All attack types have equal representation (4,986 instances each), preventing class imbalance issues.

Feature Importance Makes Sense

Timestamp (30.57%) is the most influential feature, which makes sense for attacks like Port Scans & DoS.

Protocol (14.67%) and Packet Size (14.17%) also play key roles, indicating that different attacks might manipulate packet sizes and protocols.

⚠️ Areas of Concern & Possible Improvements
1️⃣ Class 7 (Unusual Port Scan) has Poor Recall (0.60)
Issue: Your model struggles to correctly detect Unusual Port Scan attacks, leading to many false negatives.

Fix: Try adjusting class weights or adding synthetic features like "Port Change Frequency" to improve detection.

2️⃣ Class 9 (SYN Flood) Has Low Precision (0.68)
Issue: The model misclassifies other attacks as SYN Flood, leading to false positives.

Fix: Increase tree depth (max_depth=12) or add more network-related features (e.g., "Packet Drop Rate").

3️⃣ Feature Importance is Dominated by Timestamp
Issue: If Timestamp is too dominant, it might be overfitting to time-based patterns instead of learning general attack behavior.

Fix:

Try removing Timestamp and retraining the model to see if performance changes significantly.

Use time-based aggregation (e.g., "Packets per second") instead of raw timestamps.




Saving the model


In [None]:
import joblib

# ✅ Save the trained model
joblib.dump(rf_model, "cyber_threat_model.pkl")
print("✅ Model saved as cyber_threat_model.pkl")

# ✅ Save the scaler for feature normalization
joblib.dump(scaler, "scaler.pkl")
print("✅ Scaler saved as scaler.pkl")

# ✅ Save label encoders for categorical features
joblib.dump(label_encoders, "label_encoders.pkl")
print("✅ Label encoders saved as label_encoders.pkl")


✅ Model saved as cyber_threat_model.pkl
✅ Scaler saved as scaler.pkl
✅ Label encoders saved as label_encoders.pkl


In [None]:
import joblib

# ✅ Create mapping of encoded attack labels to attack names
attack_classes = {i: label for i, label in enumerate(attack_encoder.classes_)}

# ✅ Save mapping
joblib.dump(attack_classes, "attack_classes.pkl")
print("\n✅ Attack class mapping saved to attack_classes.pkl")


✅ Attack class mapping saved to attack_classes.pkl
