In [2]:
'''import kagglehub

# Download latest version
path = kagglehub.dataset_download("galaxyh/kdd-cup-1999-data")

print("Path to dataset files:", path)'''

'import kagglehub\n\n# Download latest version\npath = kagglehub.dataset_download("galaxyh/kdd-cup-1999-data")\n\nprint("Path to dataset files:", path)'

In [3]:
import pandas as pd

# Defining column names
column_names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
    'num_root', 'num_file_creations', 'num_shells', 'num_access_files',
    'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count',
    'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
    'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate', 'label'
]

# Load data
df = pd.read_csv('dataset\kddcup.data\kddcup.data', names=column_names)

# explorating dataset
print("Shape:", df.shape)
print("Sample rows:\n", df.head())
print("Label distribution:\n", df['label'].value_counts())


  df = pd.read_csv('dataset\kddcup.data\kddcup.data', names=column_names)


Shape: (4898431, 42)
Sample rows:
    duration protocol_type service flag  src_bytes  dst_bytes  land  \
0         0           tcp    http   SF        215      45076     0   
1         0           tcp    http   SF        162       4528     0   
2         0           tcp    http   SF        236       1228     0   
3         0           tcp    http   SF        233       2032     0   
4         0           tcp    http   SF        239        486     0   

   wrong_fragment  urgent  hot  ...  dst_host_srv_count  \
0               0       0    0  ...                   0   
1               0       0    0  ...                   1   
2               0       0    0  ...                   2   
3               0       0    0  ...                   3   
4               0       0    0  ...                   4   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0                     0.0                     0.0   
1                     1.0                     0.0   
2                     1.0     

In [4]:
print("\nColumn data types:\n", df.dtypes)
print("\nUnique protocol types:", df['protocol_type'].unique())
print("Unique service types (first 10):", df['service'].unique()[:10])
print("Unique flags:", df['flag'].unique())


Column data types:
 duration                         int64
protocol_type                   object
service                         object
flag                            object
src_bytes                        int64
dst_bytes                        int64
land                             int64
wrong_fragment                   int64
urgent                           int64
hot                              int64
num_failed_logins                int64
logged_in                        int64
num_compromised                  int64
root_shell                       int64
su_attempted                     int64
num_root                         int64
num_file_creations               int64
num_shells                       int64
num_access_files                 int64
num_outbound_cmds                int64
is_host_login                    int64
is_guest_login                   int64
count                            int64
srv_count                        int64
serror_rate                    float64
srv_

In [5]:
# Count label frequencies
print("\nLabel distribution:\n", df['label'].value_counts())

# Check how many are normal vs attack
df['is_attack'] = df['label'].apply(lambda x: 0 if x == 'normal.' else 1)
print("\nAttack vs Normal:\n", df['is_attack'].value_counts())

# (Optional) View top 10 attack types by frequency
attack_counts = df[df['label'] != 'normal.']['label'].value_counts()
print("\nTop 10 attack types:\n", attack_counts.head(10))



Label distribution:
 label
smurf.              2807886
neptune.            1072017
normal.              972781
satan.                15892
ipsweep.              12481
portsweep.            10413
nmap.                  2316
back.                  2203
warezclient.           1020
teardrop.               979
pod.                    264
guess_passwd.            53
buffer_overflow.         30
land.                    21
warezmaster.             20
imap.                    12
rootkit.                 10
loadmodule.               9
ftp_write.                8
multihop.                 7
phf.                      4
perl.                     3
spy.                      2
Name: count, dtype: int64

Attack vs Normal:
 is_attack
1    3925650
0     972781
Name: count, dtype: int64

Top 10 attack types:
 label
smurf.          2807886
neptune.        1072017
satan.            15892
ipsweep.          12481
portsweep.        10413
nmap.              2316
back.              2203
warezclient.       1020

In [6]:
# Load attack type mapping (if available)
attack_map = pd.read_csv('dataset/training_attack_types', sep=' ', names=['attack_type', 'category'], index_col=0)

# Map labels to broader categories
df['attack_category'] = df['label'].apply(lambda x: 'normal' if x == 'normal.' else attack_map.loc[x[:-1], 'category'])
print("\nAttack category distribution:\n", df['attack_category'].value_counts())



Attack category distribution:
 attack_category
dos       3883370
normal     972781
probe       41102
r2l          1126
u2r            52
Name: count, dtype: int64


##### Shape: 4,898,431 rows × 42 columns.
##### Identified 3 categorical features (protocol_type, service, flag) and 38 numerical ones.
##### Explored label distribution:    normal.: ~972K rows    Attacks: ~3.93M rows (dominant: smurf., neptune.)
##### Added a binary is_attack flag for future evaluation.

## Step 3 Preprocessing for Unsupervised learning

In [7]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import numpy as np

# Drop constant column
df.drop('num_outbound_cmds', axis=1, inplace=True)

# Ensure 'is_attack' doesn't leak into feature matrix
X_raw = df.drop(columns=['label', 'is_attack']) if 'is_attack' in df.columns else df.drop(columns=['label'])
y = df['label']

# Define categorical and numerical columns
categorical_cols = ['protocol_type', 'service', 'flag']
numerical_cols = X_raw.columns.difference(categorical_cols)

# One-hot encode categorical features
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_cat = encoder.fit_transform(X_raw[categorical_cols])

# Standardize numerical features
scaler = StandardScaler()
X_num = scaler.fit_transform(X_raw[numerical_cols])

# Combine processed features
X_processed = np.hstack((X_num, X_cat))

print("Final processed feature shape:", X_processed.shape)


ValueError: could not convert string to float: 'normal'

##### Dropping num_outbound_cmds because it has the same value (i.e. 0) for all rows and carries no useful information for learning.
##### Column isattack removed for training
##### Saving the preprocessed data

In [None]:
np.save("dataset/X_processed.npy", X_processed)
np.save("dataset/y_true.npy", df['is_attack'].values)


In [None]:
import joblib
joblib.dump(encoder, "encoder.joblib")
joblib.dump(scaler, "scaler.joblib")


NameError: name 'encoder' is not defined