In [35]:
from google.colab import files
uploaded = files.upload()

Saving kaggle.json to kaggle (1).json


In [36]:
!pip install kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json



In [37]:
!kaggle datasets download -d dhoogla/unswnb15 -q
!unzip -q unswnb15.zip

Dataset URL: https://www.kaggle.com/datasets/dhoogla/unswnb15
License(s): CC-BY-NC-SA-4.0
replace UNSW_NB15_testing-set.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace UNSW_NB15_training-set.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [38]:
!kaggle datasets download -d aryashah2k/nfuqnidsv2-network-intrusion-detection-dataset -q
!unzip -q nfuqnidsv2-network-intrusion-detection-dataset.zip

Dataset URL: https://www.kaggle.com/datasets/aryashah2k/nfuqnidsv2-network-intrusion-detection-dataset
License(s): CC0-1.0
replace NF-UQ-NIDS-v2.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [39]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import matplotlib.pyplot as plt

In [40]:
# ---------------------------------------------------
#  UNSW-NB15 Preprocessing
# ---------------------------------------------------
unsw = pd.read_parquet('UNSW_NB15_training-set.parquet')

# Drop unnecessary columns
unsw.drop(columns=[c for c in ['srcip', 'dstip', 'sport', 'dsport', 'stime', 'ltime'] if c in unsw.columns],
           inplace=True)

# Label encode categorical features
for col in ['proto', 'service', 'state']:
    if col in unsw.columns:
        le = LabelEncoder()
        unsw[col] = le.fit_transform(unsw[col].astype(str))

# Handle missing labels and encode attack labels
unsw['attack_cat'] = unsw['attack_cat'].fillna('Normal')
le_attack = LabelEncoder()
unsw['label_encoded'] = le_attack.fit_transform(unsw['attack_cat'])

# Drop original label column
if 'attack_cat' in unsw.columns:
    unsw.drop(columns=['attack_cat'], inplace=True)

# Replace inf/nan
unsw = unsw.replace([float('inf'), -float('inf')], np.nan).dropna()

# Scale numeric features
numeric_cols_unsw = unsw.select_dtypes(include=['int64','float64']).columns.tolist()
numeric_cols_unsw = [c for c in numeric_cols_unsw if c != 'label_encoded']
scaler = MinMaxScaler()
unsw[numeric_cols_unsw] = scaler.fit_transform(unsw[numeric_cols_unsw])

# Save client 1
unsw.to_csv("/content/client1_UNSW.csv", index=False)
print("UNSW dataset saved as client1_UNSW.csv")

UNSW dataset saved as client1_UNSW.csv


In [41]:
# ---------------------------------------------------
# NF-UQ-NIDS-v2 Preprocessing + Manual Mapping
# ---------------------------------------------------
nfq = pd.read_csv("NF-UQ-NIDS-v2.csv", nrows=200000)
nfq = nfq.sample(n=175341, random_state=42).reset_index(drop=True)

# Drop unnecessary columns
nfq.drop(columns=[c for c in ['Flow ID', 'Source IP', 'Destination IP',
                              'Src Port', 'Dst Port', 'Timestamp',
                              'StartTime', 'EndTime'] if c in nfq.columns],
         inplace=True)

# Label encode categorical features
for col in ['proto', 'service', 'state']:
    if col in nfq.columns:
        le = LabelEncoder()
        nfq[col] = le.fit_transform(nfq[col].astype(str))

# Encode attack labels
nfq['Attack'] = nfq['Attack'].replace('Benign', 'Normal').fillna('Normal')
base_mapping = dict(zip(le_attack.classes_, le_attack.transform(le_attack.classes_)))
current_max = max(base_mapping.values())
for att in nfq['Attack'].unique():
    if att not in base_mapping:
        current_max += 1
        base_mapping[att] = current_max
nfq['label_encoded'] = nfq['Attack'].map(base_mapping).astype(int)

# Drop original attack column
if 'Attack' in nfq.columns:
    nfq.drop(columns=['Attack'], inplace=True)

if 'FLOW_DURATION_MILLISECONDS' in nfq.columns:
    nfq['FLOW_DURATION_MILLISECONDS'] = nfq['FLOW_DURATION_MILLISECONDS'] / 1000

# -----------------------------
#  Manual Feature Mapping
# Map NFQ columns to UNSW-style features
# -----------------------------
mapping = {
    'FLOW_DURATION_MILLISECONDS': 'dur',
    'PROTOCOL': 'proto',
    'IN_PKTS': 'spkts',
    'OUT_PKTS': 'dpkts',
    'IN_BYTES': 'sbytes',
    'OUT_BYTES': 'dbytes',
    'SRC_TO_DST_AVG_THROUGHPUT': 'sload',
    'DST_TO_SRC_AVG_THROUGHPUT': 'dload',
    'RETRANSMITTED_IN_BYTES': 'sloss',
    'RETRANSMITTED_OUT_BYTES': 'dloss',
    'LONGEST_FLOW_PKT': 'sinpkt',
    'SHORTEST_FLOW_PKT': 'dinpkt',
    'TCP_WIN_MAX_OUT': 'dwin',
    'TCP_WIN_MAX_IN': 'swin',
    'IPV4_SRC_ADDR': 'srcip',
    'L4_SRC_PORT': 'srcport',
    'IPV4_DST_ADDR':'dstip',
    'L4_DST_PORT': 'dstport',
    'L7_PROTO': 'service',
    'TCP_FLAGS': 'state',
}

nfq.rename(columns=mapping, inplace=True)

# Keep only columns that exist in UNSW + label_encoded
common_features = [c for c in unsw.columns if c != 'label_encoded' and c in nfq.columns]

# Drop columns in NFQ that are not mapped
nfq = nfq[common_features + ['label_encoded']]

# Scale numeric features
numeric_cols_nfq = nfq.select_dtypes(include=['int64','float64']).columns.tolist()
numeric_cols_nfq = [c for c in numeric_cols_nfq if c != 'label_encoded']
nfq[numeric_cols_nfq] = scaler.fit_transform(nfq[numeric_cols_nfq])

# Save client 2
nfq.to_csv("/content/client2_NFQ.csv", index=False)
print("NF-UQ dataset saved as client2_NFQ.csv")

NF-UQ dataset saved as client2_NFQ.csv


In [43]:
# ---------------------------------------------------
# Unified Attack Mapping for Multi-class Classification
# ---------------------------------------------------

# Define broader unified attack categories
attack_mapping_unified = {
    # Normal traffic
    'Normal': 'Normal',
    'Benign': 'Normal',

    # Denial of Service
    'DoS': 'DoS',
    'DDoS': 'DoS',
    'DoS GoldenEye': 'DoS',
    'DoS Hulk': 'DoS',
    'DoS Slowhttptest': 'DoS',
    'DoS Slowloris': 'DoS',
    'Service DoS': 'DoS',

    # Scanning & Reconnaissance
    'Reconnaissance': 'Reconnaissance',
    'PortScan': 'Reconnaissance',
    'Scanning': 'Reconnaissance',
    'Fuzzers': 'Reconnaissance',

    # Exploits / Injection
    'Exploits': 'Exploits',
    'Shellcode': 'Exploits',
    'Worms': 'Exploits',
    'Backdoor': 'Exploits',
    'Generic': 'Exploits',
    'SQL Injection': 'Exploits',
    'Command Injection': 'Exploits',
    'Code Injection': 'Exploits',

    # Information Theft
    'Theft': 'Theft',
    'Data Exfiltration': 'Theft',
    'Data Theft': 'Theft',
    'Information Gathering': 'Theft',

    # Web-based Attacks
    'Web Attack': 'Web Attack',
    'Brute Force': 'Web Attack',
    'Cross Site Scripting': 'Web Attack',
    'XSS': 'Web Attack',
    'Infiltration': 'Web Attack',

    # Generic Malware
    'Trojan': 'Malware',
    'Virus': 'Malware',
    'Botnet': 'Malware',
    'Malware': 'Malware',
}

# ---------------------------------------------------
# Apply unified mapping to UNSW dataset
# ---------------------------------------------------
if 'label_encoded' in unsw.columns:
    # Need to recover original label names temporarily
    unsw_labels = le_attack.inverse_transform(unsw['label_encoded'])
    unsw['UnifiedAttack'] = [attack_mapping_unified.get(a, 'Other') for a in unsw_labels]

# ---------------------------------------------------
# Apply unified mapping to NF-UQ dataset
# ---------------------------------------------------
if 'label_encoded' in nfq.columns:
    # If NFQ was mapped via base_mapping, get inverse
    inv_base_mapping = {v: k for k, v in base_mapping.items()}
    nfq_labels = [inv_base_mapping.get(a, 'Normal') for a in nfq['label_encoded']]
    nfq['UnifiedAttack'] = [attack_mapping_unified.get(a, 'Other') for a in nfq_labels]

# ---------------------------------------------------
# Encode unified labels for both datasets consistently
# ---------------------------------------------------
all_labels = list(set(unsw['UnifiedAttack'].unique()).union(set(nfq['UnifiedAttack'].unique())))
le_unified = LabelEncoder()
le_unified.fit(all_labels)

unsw['label_encoded'] = le_unified.transform(unsw['UnifiedAttack'])
nfq['label_encoded'] = le_unified.transform(nfq['UnifiedAttack'])

# Update base mapping with unified labels
base_mapping = dict(zip(le_unified.classes_, le_unified.transform(le_unified.classes_)))

print("Unified Attack Mapping:")
for k, v in base_mapping.items():
    print(f"{v}: {k}")

# Drop helper columns
unsw.drop(columns=['UnifiedAttack'], inplace=True)
nfq.drop(columns=['UnifiedAttack'], inplace=True)


Unified Attack Mapping:
0: DoS
1: Exploits
2: Normal
3: Other
4: Reconnaissance
5: Theft
6: Web Attack


In [44]:
# ---------------------------------------------------
# Print attack types and sample counts across both datasets
# ---------------------------------------------------
# Count samples per attack type in both datasets
unsw_counts = unsw['label_encoded'].value_counts().rename_axis('label_encoded').reset_index(name='UNSW_count')
nfq_counts = nfq['label_encoded'].value_counts().rename_axis('label_encoded').reset_index(name='NFQ_count')

# Merge counts by label
attack_counts = pd.merge(unsw_counts, nfq_counts, on='label_encoded', how='outer').fillna(0)

# Map label codes to class names using base_mapping
inv_base_mapping = {v: k for k, v in base_mapping.items()}
attack_counts['Attack_Type'] = attack_counts['label_encoded'].map(inv_base_mapping)

# Add combined total count
attack_counts['Total_Samples'] = attack_counts['UNSW_count'] + attack_counts['NFQ_count']

# Reorder columns for readability
attack_counts = attack_counts[['Attack_Type', 'label_encoded', 'UNSW_count', 'NFQ_count', 'Total_Samples']]

# Sort by total samples (descending)
attack_counts = attack_counts.sort_values(by='Total_Samples', ascending=False).reset_index(drop=True)

print("\n===== Attack Label Distribution Across Datasets =====")
print(attack_counts.to_string(index=False))
print("=====================================================")



===== Attack Label Distribution Across Datasets =====
   Attack_Type  label_encoded  UNSW_count  NFQ_count  Total_Samples
        Normal              2     56000.0      57609       113609.0
           DoS              0     12264.0      91913       104177.0
      Exploits              1     76402.0        143        76545.0
Reconnaissance              4     28675.0       6103        34778.0
         Other              3      2000.0      19272        21272.0
    Web Attack              6         0.0        298          298.0
         Theft              5         0.0          3            3.0


In [42]:
from sklearn.model_selection import train_test_split


In [45]:
combined_df = pd.concat([unsw, nfq], axis=0).reset_index(drop=True)
X = combined_df.drop(columns=['label_encoded'])
y = combined_df['label_encoded']

print("Feature count:", X.shape[1])
print("Unique classes:", y.nunique())
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("\nTraining samples:", len(X_train))
print("Testing samples:", len(X_test))

Feature count: 35
Unique classes: 7

Training samples: 280545
Testing samples: 70137


In [46]:
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve, auc
)
from sklearn.preprocessing import label_binarize
import joblib
import os

In [59]:
dt_model = DecisionTreeClassifier(
    criterion="gini",
    max_depth=6,
    min_samples_split=5,
    min_samples_leaf=5,
    random_state=42
)

dt_model.fit(X_train, y_train)

In [60]:
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Decision Tree Test Accuracy: {accuracy:.4f}\n")
print("Classification Report:\n", classification_report(y_test, y_pred))

Decision Tree Test Accuracy: 0.8863

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.84      0.90     20835
           1       0.80      0.94      0.86     15309
           2       0.92      0.97      0.95     22722
           3       0.79      0.68      0.73      4254
           4       0.82      0.75      0.78      6956
           5       0.00      0.00      0.00         1
           6       0.54      0.63      0.58        60

    accuracy                           0.89     70137
   macro avg       0.69      0.69      0.69     70137
weighted avg       0.89      0.89      0.89     70137



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [65]:
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(9, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix - Decision Tree")
plt.xlabel("Predicted")
plt.ylabel("Actual")

# Save confusion matrix image
if not os.path.exists("dt_results"):
    os.makedirs("dt_results")
plt.savefig("dt_results/confusion_matrix.png", dpi=300, bbox_inches="tight")
plt.close()

In [66]:
n_classes = len(np.unique(y_test))
y_test_bin = label_binarize(y_test, classes=np.arange(n_classes))
y_pred_bin = label_binarize(y_pred, classes=np.arange(n_classes))

try:
    roc_auc = roc_auc_score(y_test_bin, y_pred_bin, average="macro")
    print(f"ROC-AUC (macro): {roc_auc:.4f}")
except Exception as e:
    print("ROC-AUC could not be computed:", e)
    roc_auc = None

# Plot and save ROC curve
fpr, tpr, _ = roc_curve(y_test_bin.ravel(), y_pred_bin.ravel())
roc_auc_macro = auc(fpr, tpr)
plt.figure(figsize=(7, 5))
plt.plot(fpr, tpr, color="darkorange", lw=2,
         label=f"ROC curve (area = {roc_auc_macro:.2f})")
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Decision Tree")
plt.legend(loc="lower right")
plt.savefig("dt_results/roc_curve.png", dpi=300, bbox_inches="tight")
plt.close()

ROC-AUC (macro): 0.8337


In [67]:
joblib.dump(dt_model, "dt_results/decision_tree_model.pkl")

# Save encoder and scaler (IMPORTANT for reuse)
joblib.dump(le_unified, "dt_results/label_encoder.pkl")
joblib.dump(scaler, "dt_results/scaler.pkl")

# Save reports
pd.DataFrame(cm).to_csv("dt_results/confusion_matrix.csv", index=False)
report = classification_report(y_test, y_pred, output_dict=True)
pd.DataFrame(report).to_csv("dt_results/classification_report.csv")

# Save accuracy summary
with open("dt_results/metrics_summary.txt", "w") as f:
    f.write(f"Decision Tree Test Accuracy: {acc:.4f}\n")
    if roc_auc:
        f.write(f"ROC-AUC (macro): {roc_auc:.4f}\n")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [68]:
from google.colab import files
import shutil

shutil.make_archive("dt_results", "zip", "dt_results")
files.download("dt_results.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>