In [72]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import os


In [73]:
# NSL-KDD column names in file order: the feature columns followed by "label".
col_names = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent", "hot",
    "num_failed_logins", "logged_in", "num_compromised", "root_shell",
    "su_attempted", "num_root", "num_file_creations", "num_shells",
    "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login",
    "count", "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate",
    "srv_rerror_rate", "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
    "dst_host_srv_serror_rate", "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate", "label",
]

# The files are read with one extra trailing column, named "difficulty" here,
# which is dropped straight after loading.
raw_columns = col_names + ["difficulty"]

train_df = pd.read_csv(
    "../data/raw/KDDTrain+.txt", header=None, names=raw_columns
).drop(columns=["difficulty"])
test_df = pd.read_csv(
    "../data/raw/KDDTest+.txt", header=None, names=raw_columns
).drop(columns=["difficulty"])

# Preview the first rows of both frames.
train_df.head(), test_df.head()

(   duration protocol_type   service flag  src_bytes  dst_bytes  land  \
 0         0           tcp  ftp_data   SF        491          0     0   
 1         0           udp     other   SF        146          0     0   
 2         0           tcp   private   S0          0          0     0   
 3         0           tcp      http   SF        232       8153     0   
 4         0           tcp      http   SF        199        420     0   
 
    wrong_fragment  urgent  hot  ...  dst_host_srv_count  \
 0               0       0    0  ...                  25   
 1               0       0    0  ...                   1   
 2               0       0    0  ...                  26   
 3               0       0    0  ...                 255   
 4               0       0    0  ...                 255   
 
    dst_host_same_srv_rate  dst_host_diff_srv_rate  \
 0                    0.17                    0.03   
 1                    0.00                    0.60   
 2                    0.10          

In [74]:
# Columns that are categorical and not yet numeric: protocol_type (column 2),
# service (column 3), flag (column 4).
# Print the number of distinct values in every object-dtype column of the training set.
print('Training set:')
for col_name in train_df.columns:
    # Skip anything that is not stored as object (i.e. already numeric).
    if train_df[col_name].dtypes != 'object':
        continue
    unique_cat = len(train_df[col_name].unique())
    print(f"Feature '{col_name}' has {unique_cat} categories")

Training set:
Feature 'protocol_type' has 3 categories
Feature 'service' has 70 categories
Feature 'flag' has 11 categories
Feature 'label' has 23 categories


In [75]:
# Test set: print the number of distinct values in every object-dtype column.
print('Test set:')
for col_name in test_df.columns:
    # Skip anything that is not stored as object (i.e. already numeric).
    if test_df[col_name].dtypes != 'object':
        continue
    unique_cat = len(test_df[col_name].unique())
    print(f"Feature '{col_name}' has {unique_cat} categories")

Test set:
Feature 'protocol_type' has 3 categories
Feature 'service' has 64 categories
Feature 'flag' has 11 categories
Feature 'label' has 38 categories


In [76]:
# Replace the text values of protocol_type, service and flag with integer codes.

cat_features = ["protocol_type", "service", "flag"]

# One encoder per categorical column, fitted on the training data only and
# kept in a dict keyed by column name.
encoders = {name: LabelEncoder().fit(train_df[name]) for name in cat_features}

# Apply each fitted encoder to both splits; the test split reuses the
# training-set mapping rather than learning its own.
for col, le in encoders.items():
    train_df[col] = le.transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

train_df[cat_features].head()


Unnamed: 0,protocol_type,service,flag
0,1,20,9
1,2,44,9
2,1,49,5
3,1,24,9
4,1,24,9


In [77]:
# Binarise the target: 'normal' -> 0, any other (attack) label -> 1.
#
# The mapping is a vectorised comparison instead of a row-wise apply, and a
# frame whose label column is already numeric is left untouched. Without that
# guard, running this cell a second time would compare the integers 0/1 with
# the string 'normal' and silently turn every label into 1.
for _frame in (train_df, test_df):
    if not pd.api.types.is_numeric_dtype(_frame['label']):
        # ne() yields True for every non-'normal' row; cast to int64 for 0/1 labels.
        _frame['label'] = _frame['label'].ne('normal').astype('int64')

# Class balance of each split after conversion.
train_df['label'].value_counts(), test_df['label'].value_counts()


(label
 0    67343
 1    58630
 Name: count, dtype: int64,
 label
 1    12833
 0     9711
 Name: count, dtype: int64)

In [78]:
# Separate the feature columns (X) from the "label" target (y) for each split.
target_col = "label"

X_train, y_train = train_df.drop(columns=[target_col]), train_df[target_col]
X_test, y_test = test_df.drop(columns=[target_col]), test_df[target_col]


In [79]:
# Min-max scale the features. The scaler is fitted on X_train only and the
# same fitted scaler is then applied to both splits.
scaler = MinMaxScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)



In [80]:
# Persist the scaled feature matrices and the label vectors as .npy files.
os.makedirs("../data/processed", exist_ok=True)

outputs = [
    ("../data/processed/X_train.npy", X_train_scaled),
    ("../data/processed/X_test.npy", X_test_scaled),
    ("../data/processed/y_train.npy", y_train),
    ("../data/processed/y_test.npy", y_test),
]
for npy_path, values in outputs:
    np.save(npy_path, values)

# Report the shapes of the unscaled splits (X_train / X_test, not the *_scaled arrays).
for tag, split in (
    ("X_train:", X_train),
    ("X_test :", X_test),
    ("y_train:", y_train),
    ("y_test :", y_test),
):
    print(tag, split.shape)

print("Processed data saved!")

X_train: (125973, 41)
X_test : (22544, 41)
y_train: (125973,)
y_test : (22544,)
Processed data saved!


In [81]:
# Wrap the unscaled feature frame (X_train, not X_train_scaled) in a DataFrame
# and print the dtype of every column.
df_check = pd.DataFrame(data=X_train)
column_dtypes = df_check.dtypes
print(column_dtypes)

duration                         int64
protocol_type                    int64
service                          int64
flag                             int64
src_bytes                        int64
dst_bytes                        int64
land                             int64
wrong_fragment                   int64
urgent                           int64
hot                              int64
num_failed_logins                int64
logged_in                        int64
num_compromised                  int64
root_shell                       int64
su_attempted                     int64
num_root                         int64
num_file_creations               int64
num_shells                       int64
num_access_files                 int64
num_outbound_cmds                int64
is_host_login                    int64
is_guest_login                   int64
count                            int64
srv_count                        int64
serror_rate                    float64
srv_serror_rate          

In [82]:
# Overall minimum and maximum across the whole scaled training matrix.
scaled_min = X_train_scaled.min()
scaled_max = X_train_scaled.max()

print("Scaled Min:", scaled_min)
print("Scaled Max:", scaled_max)

Scaled Min: 0.0
Scaled Max: 1.0
