In [2]:
import os
from pprint import pprint
from pathlib import Path
import pandas as pd
import kagglehub
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Fetch the latest published version of the dataset via kagglehub and keep
# the directory path it returns for the cells below.
DATASET_HANDLE = "chethuhn/network-intrusion-dataset"
dataset_path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", dataset_path)

Path to dataset files: C:\Users\KarlSchmidt\.cache\kagglehub\datasets\chethuhn\network-intrusion-dataset\versions\1


In [4]:
# os.listdir() returns entries in arbitrary, platform-dependent order. Sort the
# names so the listing (and any positional indexing into it) is the same on
# every machine and every run.
items = sorted(os.listdir(dataset_path))

pprint(items)

['Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
 'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
 'Friday-WorkingHours-Morning.pcap_ISCX.csv',
 'Monday-WorkingHours.pcap_ISCX.csv',
 'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
 'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
 'Tuesday-WorkingHours.pcap_ISCX.csv',
 'Wednesday-workingHours.pcap_ISCX.csv']


In [5]:
dataset_path = Path(dataset_path)

# Select the capture by file name instead of by position in a directory
# listing: os.listdir() order is arbitrary, so "the last item" is not
# guaranteed to be the Wednesday file on every machine.
CSV_NAME = 'Wednesday-workingHours.pcap_ISCX.csv'
csv_path = dataset_path / CSV_NAME

# Fail early with a helpful message rather than deep inside pd.read_csv.
if not csv_path.is_file():
    available = sorted(entry.name for entry in dataset_path.iterdir())
    raise FileNotFoundError(
        f"{CSV_NAME!r} not found in {dataset_path}; available files: {available}"
    )

# 1. Dataset Understanding and Preparation

## Load data to Pandas dataframe

In [6]:
df = pd.read_csv(csv_path)

# Several headers in this CSV carry a leading space (visible in the df.info()
# listing, e.g. ' Destination Port'). Strip surrounding whitespace so columns
# can be addressed by their plain names, e.g. df['Label'].
df.columns = df.columns.str.strip()

df.head(20)

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,80,38308,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,389,479,11,5,172,326,79,0,15.636364,31.449238,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,88,1095,10,6,3150,3150,1575,0,315.0,632.561635,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,389,15206,17,12,3452,6660,1313,0,203.058823,425.778474,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,88,1092,9,6,3150,3152,1575,0,350.0,694.509719,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
5,389,433,11,4,172,326,79,0,15.636364,31.449238,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
6,88,1088,9,6,3150,3152,1575,0,350.0,694.509719,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
7,80,579225,132,150,160,320799,160,0,1.212121,13.926212,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
8,49666,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
9,49413,4,3,0,18,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,692703.0,692703.0,692703.0,692703.0,692703.0,692703.0,692703.0,692703.0,692703.0,692703.0,...,692703.0,692703.0,692703.0,692703.0,692703.0,692703.0,692703.0,692703.0,692703.0,692703.0
mean,5686.869462,28001680.0,9.556261,10.214079,555.093,16996.44,233.593936,15.022183,60.55544,82.895863,...,6.121279,26.761141,92244.78,47608.52,162736.3,63151.86,22111220.0,474374.4,22521740.0,21733730.0
std,15727.42356,42766800.0,747.197814,984.204633,6163.663,2241175.0,603.751856,51.068835,157.643794,226.126084,...,715.155068,6.322368,700704.9,474208.1,1094616.0,605102.3,38124150.0,4488512.0,38482920.0,38077250.0
min,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,53.0,201.0,2.0,1.0,12.0,0.0,6.0,0.0,6.0,0.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,80.0,61437.0,2.0,2.0,82.0,188.0,46.0,0.0,41.0,0.0,...,1.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,443.0,83024370.0,7.0,6.0,365.0,11595.0,341.0,32.0,56.666667,128.916917,...,2.0,32.0,991.0,0.0,991.0,988.0,15900000.0,0.0,16500000.0,10000000.0
max,65487.0,120000000.0,203943.0,272353.0,1224076.0,627000000.0,24820.0,2065.0,4640.757576,6429.190773,...,197124.0,60.0,100000000.0,74200000.0,105000000.0,100000000.0,120000000.0,76900000.0,120000000.0,120000000.0


In [8]:
# Per-column dtype and non-null count; also reveals the exact column names.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 692703 entries, 0 to 692702
Data columns (total 79 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0    Destination Port             692703 non-null  int64  
 1    Flow Duration                692703 non-null  int64  
 2    Total Fwd Packets            692703 non-null  int64  
 3    Total Backward Packets       692703 non-null  int64  
 4   Total Length of Fwd Packets   692703 non-null  int64  
 5    Total Length of Bwd Packets  692703 non-null  int64  
 6    Fwd Packet Length Max        692703 non-null  int64  
 7    Fwd Packet Length Min        692703 non-null  int64  
 8    Fwd Packet Length Mean       692703 non-null  float64
 9    Fwd Packet Length Std        692703 non-null  float64
 10  Bwd Packet Length Max         692703 non-null  int64  
 11   Bwd Packet Length Min        692703 non-null  int64  
 12   Bwd Packet Length Mean       692703 non-nul

## Select and clean features

In [9]:
# Discard rows with missing values, then build the feature matrix from the
# int64/float64 columns only.
df = df.dropna()
numeric_dtypes = ['float64', 'int64']
identifier_columns = ['Flow ID', 'Source IP', 'Destination IP', 'Timestamp']
# errors='ignore': identifier columns that are not present are skipped silently.
X = df.select_dtypes(include=numeric_dtypes).drop(columns=identifier_columns, errors='ignore')

## Normalize features

In [14]:
# Turn +/-infinity into NaN and discard every row that contains one, so the
# scaler only ever sees finite values. The surviving rows keep their index labels.
positive_inf = float('inf')
X = X.replace([positive_inf, -positive_inf], float('nan')).dropna()

# Standardize the features: learn the scaling parameters on X, then apply them to X.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# 2. Apply Clustering Algorithms

## k-Means Clustering

In [15]:
# Fit k-means with two clusters (presumably benign vs. attack traffic -- confirm)
# and a fixed seed, then read the per-row cluster assignment off the fitted model.
kmeans = KMeans(n_clusters=2, random_state=42).fit(X_scaled)
kmeans_labels = kmeans.labels_

In [None]:
# The silhouette coefficient is built from pairwise distances between samples,
# so scoring every row of a dataset this large is impractical in time and
# memory. Score a fixed-size random subsample instead; the fixed random_state
# keeps the scores comparable across k and reproducible across runs.
SILHOUETTE_SAMPLE_SIZE = 10_000

for k in range(2, 8):
    km = KMeans(n_clusters=k, random_state=42)
    labels = km.fit_predict(X_scaled)
    score = silhouette_score(
        X_scaled,
        labels,
        sample_size=SILHOUETTE_SAMPLE_SIZE,
        random_state=42,
    )
    print(f"k={k}, silhouette score={score:.4f}")

## DBSCAN Clustering

In [None]:
# Density-based clustering on the scaled features; rows that belong to no
# dense region receive the label -1.
# NOTE(review): eps/min_samples look hand-picked -- confirm how they were chosen.
dbscan = DBSCAN(eps=1.5, min_samples=10).fit(X_scaled)
dbscan_labels = dbscan.labels_

In [None]:
# DBSCAN labels noise points -1. Noise is not a cluster, so leave it out of the
# silhouette computation instead of scoring it as if it were one.
is_clustered = dbscan_labels != -1
n_clusters = len(set(dbscan_labels[is_clustered]))

if n_clusters < 2:
    # silhouette_score raises for fewer than two clusters; report that explicitly.
    score = float('nan')
    print(f"Silhouette Score (DBSCAN): undefined ({n_clusters} cluster(s) besides noise)")
else:
    try:
        # Subsample for the same reason as with k-means: the score is built
        # from pairwise distances and does not scale to the full dataset.
        score = silhouette_score(
            X_scaled[is_clustered],
            dbscan_labels[is_clustered],
            sample_size=10_000,
            random_state=42,
        )
    except ValueError:
        # The random subsample can end up with a single cluster when one
        # cluster dominates; treat the score as undefined in that case.
        score = float('nan')
    print("Silhouette Score (DBSCAN):", score)

# 3. Evaluate and Interpret

In [None]:
# X lost further rows (those containing +/-inf) after df.dropna(), so the label
# arrays are shorter than df. Restrict df to exactly the rows that were
# clustered; X kept df's index labels, so X.index identifies them.
df = df.loc[X.index].copy()

# The ground-truth column may be named 'Label' or ' Label' (the raw CSV header
# has a leading space); resolve it without assuming which one is present.
label_col = next((col for col in df.columns if col.strip() == 'Label'), None)
if label_col is None:
    raise KeyError("No 'Label' column found in the dataframe")

df['Cluster_KMeans'] = kmeans_labels
df['Cluster_DBSCAN'] = dbscan_labels
df[label_col] = df[label_col].astype('category')

# Cross-tabulate cluster assignment against the ground-truth traffic label.
print(df.groupby(['Cluster_KMeans', label_col]).size())
print(df.groupby(['Cluster_DBSCAN', label_col]).size())

# 4. Visualization

In [None]:
# Project the scaled features onto their first two principal components for
# plotting. random_state pins the result in case a randomized SVD solver is
# selected automatically for data of this size.
pca = PCA(n_components=2, random_state=42)
reduced = pca.fit_transform(X_scaled)

# Explicit figure/axes interface; label the axes so the figure stands alone.
fig, ax = plt.subplots()
ax.scatter(reduced[:, 0], reduced[:, 1], c=kmeans_labels, cmap='viridis', s=1)
ax.set_title("K-Means Clustering Visualization (PCA-reduced)")
ax.set_xlabel("Principal component 1")
ax.set_ylabel("Principal component 2")
plt.show()