In [None]:
# Forcing autoreload of modules so restart is not required
%load_ext autoreload
%autoreload 2

import os

os.chdir("../../")


print("Current Working Directory:  ", os.getcwd())

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  # plotting
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from imblearn.over_sampling import SMOTE

In [None]:
train = pd.read_csv("./data/staging/NB15/train.csv")
test = pd.read_csv("./data/staging/NB15/test.csv")

In [4]:
train.head(1000)

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,,FIN,6,4,258,172,74.087490,...,1,1,0,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,,FIN,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,Normal,0
2,3,1.623129,tcp,,FIN,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,Normal,0
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,Normal,0
4,5,0.449454,tcp,,FIN,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,2.731069,tcp,,FIN,16,18,1540,1644,12.083180,...,1,4,0,0,0,21,6,0,Normal,0
996,997,2.921560,tcp,,FIN,30,42,2302,34406,24.302085,...,1,9,0,0,0,21,11,0,Normal,0
997,998,0.223437,tcp,ftp-data,FIN,8,12,424,8824,85.035157,...,1,3,0,0,0,5,2,0,Normal,0
998,999,3.089736,tcp,,FIN,30,42,2302,34406,22.979310,...,1,2,0,0,0,9,5,0,Normal,0


In [5]:
print("Train Attack Category Counts:")
print(train['attack_cat'].value_counts())
print("\nTest Attack Category Counts:")
print(test['attack_cat'].value_counts())

Train Attack Category Counts:
attack_cat
Normal            56000
Generic           40000
Exploits          33393
Fuzzers           18184
DoS               12264
Reconnaissance    10491
Analysis           2000
Backdoor           1746
Shellcode          1133
Worms               130
Name: count, dtype: int64

Test Attack Category Counts:
attack_cat
Normal            37000
Generic           18871
Exploits          11132
Fuzzers            6062
DoS                4089
Reconnaissance     3496
Analysis            677
Backdoor            583
Shellcode           378
Worms                44
Name: count, dtype: int64


In [6]:
print(train['attack_cat'].unique())
print(test['attack_cat'].unique())

['Normal' 'Backdoor' 'Analysis' 'Fuzzers' 'Shellcode' 'Reconnaissance'
 'Exploits' 'DoS' 'Worms' 'Generic']
['Normal' 'Reconnaissance' 'Backdoor' 'DoS' 'Exploits' 'Analysis'
 'Fuzzers' 'Worms' 'Shellcode' 'Generic']


In [7]:
print(len(train))
print(len(test))

175341
82332


In [8]:
train = train[train['attack_cat'].isin(['Normal', 'DoS','Fuzzers'])]
test = test[test['attack_cat'].isin(['Normal', 'DoS','Fuzzers'])]

In [9]:
print("Train Attack Category Counts:")
print(train['attack_cat'].value_counts())
print("\nTest Attack Category Counts:")
print(test['attack_cat'].value_counts())

Train Attack Category Counts:
attack_cat
Normal     56000
Fuzzers    18184
DoS        12264
Name: count, dtype: int64

Test Attack Category Counts:
attack_cat
Normal     37000
Fuzzers     6062
DoS         4089
Name: count, dtype: int64


In [10]:
train.head()

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,,FIN,6,4,258,172,74.08749,...,1,1,0,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,,FIN,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,Normal,0
2,3,1.623129,tcp,,FIN,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,Normal,0
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,Normal,0
4,5,0.449454,tcp,,FIN,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,Normal,0


In [11]:
print(len(train))
print(len(test))

86448
47151


In [12]:
print(train['attack_cat'].unique())
print(test['attack_cat'].unique())

['Normal' 'Fuzzers' 'DoS']
['Normal' 'DoS' 'Fuzzers']


In [15]:
print("Train Attack Category Counts:")
print(train['label'].value_counts())
print("\nTest Attack Category Counts:")
print(test['label'].value_counts())

Train Attack Category Counts:
label
0    56000
1    30448
Name: count, dtype: int64

Test Attack Category Counts:
label
0    37000
1    10151
Name: count, dtype: int64


In [16]:
X_train, Y_train = train.drop(columns=['label']), train['label']
X_test, Y_test = test.drop(columns=['label']), test['label']

In [None]:
x_train = set(X_train.columns.tolist())
x_test = set(X_test.columns.tolist())

diff_train = x_train - x_test  
diff_test = x_test - x_train

print("Columns in X_train but not in X_test:", diff_train)
print("Columns in X_test but not in X_train:", diff_test)

print(X_train.shape, X_test.shape)

Columns in X_train but not in X_test: set()
Columns in X_test but not in X_train: set()
(86448, 44) (47151, 44)


In [18]:
print(train.dtypes.value_counts())
print(test.dtypes.value_counts())

int64      30
float64    11
object      4
Name: count, dtype: int64
int64      30
float64    11
object      4
Name: count, dtype: int64


In [19]:
print(train.select_dtypes(exclude=np.number).columns)
print(test.select_dtypes(exclude=np.number).columns)


Index(['proto', 'service', 'state', 'attack_cat'], dtype='object')
Index(['proto', 'service', 'state', 'attack_cat'], dtype='object')


In [20]:
categories_col = ['proto', 'service', 'state','attack_cat']
num_col_train = list(set(X_train.columns) - set(categories_col))
num_col_test = list(set(X_test.columns) - set(categories_col))

In [21]:
scaler = StandardScaler()
scaler.fit(X_train[num_col_train])

In [None]:
X_train[num_col_train] = scaler.transform(X_train[num_col_train])
X_test[num_col_test] = scaler.transform(X_test[num_col_test])

In [23]:
X_train.drop('attack_cat',axis=1,inplace=True)
X_test.drop('attack_cat',axis=1,inplace=True)

In [24]:
X_train.head()

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports
0,-1.331634,-0.208277,tcp,,FIN,-0.201094,-0.181775,-0.055994,-0.127131,-0.36637,...,-0.657964,-0.35676,-0.190339,-0.606585,-0.11337,-0.113063,-0.231517,-0.679043,-0.775024,-0.181671
1,-1.33161,-0.1351,tcp,,FIN,-0.121187,0.069376,-0.05182,0.106771,-0.366334,...,-0.657964,-0.35676,-0.190339,-0.417891,-0.11337,-0.113063,-0.231517,-0.679043,0.143058,-0.181671
2,-1.331586,-0.000327,tcp,,FIN,-0.181117,-0.093134,-0.055065,-0.054381,-0.366863,...,-0.349394,-0.35676,-0.190339,-0.229197,-0.11337,-0.113063,-0.231517,-0.44551,0.143058,-0.181671
3,-1.331562,0.007776,tcp,ftp,FIN,-0.141164,-0.122681,-0.052749,-0.123788,-0.366867,...,-0.349394,-0.35676,-0.190339,-0.229197,8.820658,8.748254,-0.231517,-0.44551,-0.775024,-0.181671
4,-1.331538,-0.162859,tcp,,FIN,-0.161141,-0.167001,-0.053574,-0.126594,-0.366705,...,-0.349394,0.04682,-0.190339,6.752483,-0.11337,-0.113063,-0.231517,-0.44551,6.202401,-0.181671


In [25]:
X_test.head()

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports
0,-1.331634,-0.225098,udp,,INT,-0.241048,-0.211322,-0.053907,-0.128092,0.38145,...,-0.657964,-0.35676,-0.190339,-0.417891,-0.11337,-0.113063,-0.231517,-0.679043,-0.591407,-0.181671
1,-1.33161,-0.225098,udp,,INT,-0.241048,-0.211322,-0.042804,-0.128092,0.662111,...,-0.657964,-0.35676,-0.190339,-0.417891,-0.11337,-0.113063,-0.231517,-0.679043,-0.591407,-0.181671
2,-1.331586,-0.225099,udp,,INT,-0.241048,-0.211322,-0.048891,-0.128092,1.279566,...,-0.657964,-0.35676,-0.190339,-0.229197,-0.11337,-0.113063,-0.231517,-0.679043,-0.407791,-0.181671
3,-1.331562,-0.225099,udp,,INT,-0.241048,-0.211322,-0.050364,-0.128092,1.005142,...,-0.349394,0.04682,-0.190339,-0.229197,-0.11337,-0.113063,-0.231517,-0.44551,-0.407791,-0.181671
4,-1.331538,-0.225098,udp,,INT,-0.241048,-0.211322,-0.039612,-0.128092,0.456293,...,-0.349394,0.04682,-0.190339,-0.229197,-0.11337,-0.113063,-0.231517,-0.44551,-0.407791,-0.181671


In [None]:
service_ohe = OneHotEncoder(handle_unknown="ignore")
proto_ohe = OneHotEncoder(handle_unknown="ignore")
state_ohe = OneHotEncoder(handle_unknown="ignore")

ohe_service_train = service_ohe.fit(X_train[['service']])
ohe_proto_train = proto_ohe.fit(X_train[['proto']])
ohe_state_train = state_ohe.fit(X_train[['state']])

X_train_service = pd.DataFrame(ohe_service_train.transform(X_train[['service']]).todense(), 
							   columns=ohe_service_train.get_feature_names_out(['service']))
X_train_proto = pd.DataFrame(ohe_proto_train.transform(X_train[['proto']]).todense(), 
							 columns=ohe_proto_train.get_feature_names_out(['proto']))
X_train_state = pd.DataFrame(ohe_state_train.transform(X_train[['state']]).todense(), 
							 columns=ohe_state_train.get_feature_names_out(['state']))

X_train = X_train.drop(['service', 'proto', 'state'], axis=1)
X_train = pd.concat([X_train, X_train_service, X_train_proto, X_train_state], axis=1)

X_test_service = pd.DataFrame(ohe_service_train.transform(X_test[['service']]).todense(), 
							  columns=ohe_service_train.get_feature_names_out(['service']))
X_test_proto = pd.DataFrame(ohe_proto_train.transform(X_test[['proto']]).todense(), 
							columns=ohe_proto_train.get_feature_names_out(['proto']))
X_test_state = pd.DataFrame(ohe_state_train.transform(X_test[['state']]).todense(), 
							columns=ohe_state_train.get_feature_names_out(['state']))

X_test = X_test.drop(['service', 'proto', 'state'], axis=1)
X_test = pd.concat([X_test, X_test_service, X_test_proto, X_test_state], axis=1)

In [27]:
X_train.head()

Unnamed: 0,id,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,...,proto_zero,state_CON,state_ECO,state_FIN,state_INT,state_PAR,state_REQ,state_RST,state_URN,state_no
0,-1.331634,-0.208277,-0.201094,-0.181775,-0.055994,-0.127131,-0.36637,1.049279,1.631621,-0.266845,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.33161,-0.1351,-0.121187,0.069376,-0.05182,0.106771,-0.366334,-0.657155,1.612563,-0.26687,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.331586,-0.000327,-0.181117,-0.093134,-0.055065,-0.054381,-0.366863,-0.657155,1.612563,-0.2669,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1.331562,0.007776,-0.141164,-0.122681,-0.052749,-0.123788,-0.366867,-0.657155,1.612563,-0.266895,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.331538,-0.162859,-0.161141,-0.167001,-0.053574,-0.126594,-0.366705,1.067242,1.612563,-0.266869,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
X_test.head()

Unnamed: 0,id,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,...,proto_zero,state_CON,state_ECO,state_FIN,state_INT,state_PAR,state_REQ,state_RST,state_URN,state_no
0,-1.331634,-0.225098,-0.241048,-0.211322,-0.053907,-0.128092,0.38145,1.067242,-0.788756,0.52045,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,-1.33161,-0.225098,-0.241048,-0.211322,-0.042804,-0.128092,0.662111,1.067242,-0.788756,3.578995,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,-1.331586,-0.225099,-0.241048,-0.211322,-0.048891,-0.128092,1.279566,1.067242,-0.788756,3.462876,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,-1.331562,-0.225099,-0.241048,-0.211322,-0.050364,-0.128092,1.005142,1.067242,-0.788756,2.352323,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,-1.331538,-0.225098,-0.241048,-0.211322,-0.039612,-0.128092,0.456293,1.067242,-0.788756,3.445415,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [None]:
X_train = X_train.dropna()
Y_train = Y_train.loc[X_train.index] 

X_test = X_test.dropna()
Y_test = Y_test.loc[X_test.index]

In [None]:
x_train = set(X_train.columns.tolist())
x_test = set(X_test.columns.tolist())

diff_train = x_train - x_test
diff_test = x_test - x_train

print("Columns in X_train but not in X_test:", diff_train)
print("Columns in X_test but not in X_train:", diff_test)

print(X_train.shape, X_test.shape) 

Columns in X_train but not in X_test: set()
Columns in X_test but not in X_train: set()
(64532, 195) (25928, 195)


In [None]:
smote = SMOTE(sampling_strategy={1: 50000}, random_state=42)
X_train, Y_train = smote.fit_resample(X_train, Y_train)

print("After SMOTE:")
print(pd.Series(Y_train).value_counts())

After SMOTE:
label
0    51010
1    50000
Name: count, dtype: int64


In [32]:
Y_train = Y_train.to_frame()
Y_test = Y_test.to_frame()

type(Y_test)

pandas.core.frame.DataFrame

In [34]:
print("Train Attack Category Counts:")
print(Y_train['label'].value_counts())
print("\nTest Attack Category Counts:")
print(Y_test['label'].value_counts())

Train Attack Category Counts:
label
0    51010
1    50000
Name: count, dtype: int64

Test Attack Category Counts:
label
0    20520
1     5408
Name: count, dtype: int64


In [35]:
Y_train['label'].unique()

array([0, 1])

In [36]:
Y_test['label'].unique()

array([0, 1])

In [37]:
X_train.to_csv('./data/processed/NB15/binary_comb/X_train.csv', index=False)
Y_train.to_csv('./data/processed/NB15/binary_comb/Y_train.csv', index=False)

In [38]:
X_test.to_csv('./data/processed/NB15/binary_comb/X_test.csv', index=False)
Y_test.to_csv('./data/processed/NB15/binary_comb/Y_test.csv', index=False)

In [39]:
print("Train Attack Category Counts:")
print(Y_train['label'].value_counts())
print("\nTest Attack Category Counts:")
print(Y_test['label'].value_counts())

Train Attack Category Counts:
label
0    51010
1    50000
Name: count, dtype: int64

Test Attack Category Counts:
label
0    20520
1     5408
Name: count, dtype: int64
