In [None]:
# Forcing autoreload of modules so restart is not required
%load_ext autoreload
%autoreload 2

import os

os.chdir("../../")


print("Current Working Directory:  ", os.getcwd())

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  # plotting
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [None]:
# Load the staged NB15 train and test splits (paths are relative to the
# working directory selected in the first cell).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/staging/NB15/train.csv", "./data/staging/NB15/test.csv")
)

In [4]:
# Preview the first five rows of the raw training split.
train.head(n=5)

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,,FIN,6,4,258,172,74.08749,...,1,1,0,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,,FIN,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,Normal,0
2,3,1.623129,tcp,,FIN,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,Normal,0
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,Normal,0
4,5,0.449454,tcp,,FIN,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,Normal,0


In [6]:
# Class distribution of the attack category in each split.
print("Train Attack Category Counts:", train['attack_cat'].value_counts(), sep="\n")
print("\nTest Attack Category Counts:", test['attack_cat'].value_counts(), sep="\n")


Train Attack Category Counts:
attack_cat
Normal            56000
Generic           40000
Exploits          33393
Fuzzers           18184
DoS               12264
Reconnaissance    10491
Analysis           2000
Backdoor           1746
Shellcode          1133
Worms               130
Name: count, dtype: int64

Test Attack Category Counts:
attack_cat
Normal            37000
Generic           18871
Exploits          11132
Fuzzers            6062
DoS                4089
Reconnaissance     3496
Analysis            677
Backdoor            583
Shellcode           378
Worms                44
Name: count, dtype: int64


In [5]:
# Distinct attack categories present in each split (train first, then test).
print(train['attack_cat'].unique(), test['attack_cat'].unique(), sep="\n")

['Normal' 'Backdoor' 'Analysis' 'Fuzzers' 'Shellcode' 'Reconnaissance'
 'Exploits' 'DoS' 'Worms' 'Generic']
['Normal' 'Reconnaissance' 'Backdoor' 'DoS' 'Exploits' 'Analysis'
 'Fuzzers' 'Worms' 'Shellcode' 'Generic']


In [None]:
# Row counts before filtering (train first, then test).
print(train.shape[0], test.shape[0], sep="\n")

In [None]:
# Keep only the 'Normal' and 'DoS' rows of each split.
#
# Boolean filtering keeps the original row labels, so the index is left with
# gaps.  The encoding cell further down concatenates column-wise with frames
# that carry a fresh 0..n-1 index, and pandas aligns that concat by label.
# Renumbering the rows here keeps label and position identical downstream.
train = train[train['attack_cat'].isin(['Normal', 'DoS'])].reset_index(drop=True)
test = test[test['attack_cat'].isin(['Normal', 'DoS'])].reset_index(drop=True)

In [None]:
# Preview the first five rows of the filtered training split.
train.head(n=5)

In [None]:
# Row counts after filtering (train first, then test).
print(train.shape[0], test.shape[0], sep="\n")

In [None]:
# Confirm which attack categories remain in each split after filtering.
print(train['attack_cat'].unique(), test['attack_cat'].unique(), sep="\n")

In [None]:
# Separate the 'label' target from the remaining columns of each split.
Y_train, Y_test = train['label'], test['label']
X_train, X_test = train.drop(columns='label'), test.drop(columns='label')

In [None]:
# Sanity check: report any column present in one feature frame but not the
# other, followed by both shapes.
x_train, x_test = set(X_train.columns), set(X_test.columns)

diff_train = x_train.difference(x_test)
diff_test = x_test.difference(x_train)

print("Columns in X_train but not in X_test:", diff_train)
print("Columns in X_test but not in X_train:", diff_test)

print(X_train.shape, X_test.shape)

In [None]:
# How many columns of each dtype the two splits contain (train, then test).
print(train.dtypes.value_counts(), test.dtypes.value_counts(), sep="\n")

In [None]:
# Names of the non-numeric columns in each split (train, then test).
print(
    train.select_dtypes(exclude=np.number).columns,
    test.select_dtypes(exclude=np.number).columns,
    sep="\n",
)


In [None]:
# Columns treated as categorical; everything else is scaled as numeric.
categories_col = ['proto', 'service', 'state', 'attack_cat']

# Build the numeric column list in DataFrame order rather than from a set:
# set iteration order is arbitrary and can change between kernel sessions,
# and two independently built sets are not guaranteed to agree with the order
# the scaler is fitted on.
num_col_train = [col for col in X_train.columns if col not in categories_col]

# The scaler is fitted on the training columns, so the test split must be
# selected with exactly the same columns in exactly the same order.
num_col_test = list(num_col_train)

# Fail loudly if the test split carries a numeric column the scaler never saw.
unexpected = set(X_test.columns) - set(categories_col) - set(num_col_train)
assert not unexpected, f"X_test has columns unseen in X_train: {sorted(unexpected)}"

# NOTE(review): 'Unnamed: 0' and 'id' are visible in train.head() and are not
# excluded here, so they are scaled and kept as features.  They look like row
# identifiers rather than traffic features -- confirm whether they should be
# dropped before modelling.

In [None]:
# Fit the standardiser on the training split only; the fitted estimator is
# left as the last expression so the cell still displays it.
scaler = StandardScaler().fit(X_train[num_col_train])
scaler

In [None]:
# Standardise the numeric columns of both splits with the scaler fitted on the
# training data.  The test split is selected with the same column list the
# scaler was fitted on, so each statistic is applied to the column it was
# computed from regardless of how any other column list was ordered.
#
# NOTE: this cell overwrites the columns in place; running it a second time
# would scale the already-scaled values again.
X_train[num_col_train] = scaler.transform(X_train[num_col_train])
X_test[num_col_train] = scaler.transform(X_test[num_col_train])

In [None]:
# Remove the textual attack category from the features; the target is the
# separate 'label' column.  A non-inplace drop with errors="ignore" keeps this
# cell safe to re-run (the in-place form raised KeyError the second time).
X_train = X_train.drop(columns=['attack_cat'], errors='ignore')
X_test = X_test.drop(columns=['attack_cat'], errors='ignore')

In [None]:
# Preview the scaled training features.
X_train.head(n=5)

In [None]:
# Preview the scaled test features.
X_test.head(n=5)

In [None]:
def _one_hot_frame(encoder, frame, column):
    """Return the one-hot encoding of ``frame[column]`` as a DataFrame.

    Parameters
    ----------
    encoder : fitted OneHotEncoder for ``column``.
    frame : DataFrame that contains ``column``.
    column : name of the categorical column to encode.

    Returns
    -------
    DataFrame with one column per encoder feature name and the SAME index as
    ``frame``.  Reusing the index matters: ``pd.concat(..., axis=1)`` aligns
    rows by index label, so a fresh 0..n-1 index would pair encodings with the
    wrong rows (and create NaN rows) whenever ``frame`` has a filtered index.
    """
    encoded = encoder.transform(frame[[column]]).toarray()
    return pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out([column]),
        index=frame.index,
    )


# One encoder per categorical column; categories unseen during fit are encoded
# as all zeros instead of raising.
service_ohe = OneHotEncoder(handle_unknown="ignore")
proto_ohe = OneHotEncoder(handle_unknown="ignore")
state_ohe = OneHotEncoder(handle_unknown="ignore")

# Fit on the training split only (fit returns the encoder itself).
ohe_service_train = service_ohe.fit(X_train[['service']])
ohe_proto_train = proto_ohe.fit(X_train[['proto']])
ohe_state_train = state_ohe.fit(X_train[['state']])

# Encode the training split and swap the raw columns for the encoded ones.
X_train_service = _one_hot_frame(ohe_service_train, X_train, 'service')
X_train_proto = _one_hot_frame(ohe_proto_train, X_train, 'proto')
X_train_state = _one_hot_frame(ohe_state_train, X_train, 'state')

X_train = X_train.drop(['service', 'proto', 'state'], axis=1)
X_train = pd.concat([X_train, X_train_service, X_train_proto, X_train_state], axis=1)

# Encode the test split with the encoders fitted on the training split.
X_test_service = _one_hot_frame(ohe_service_train, X_test, 'service')
X_test_proto = _one_hot_frame(ohe_proto_train, X_test, 'proto')
X_test_state = _one_hot_frame(ohe_state_train, X_test, 'state')

X_test = X_test.drop(['service', 'proto', 'state'], axis=1)
X_test = pd.concat([X_test, X_test_service, X_test_proto, X_test_state], axis=1)

In [None]:
# Preview the training features after one-hot encoding.
X_train.head(n=5)

In [None]:
# Preview the test features after one-hot encoding.
X_test.head(n=5)

In [None]:
# Discard every feature row that contains a missing value, then keep only the
# labels whose index survived so features and targets stay paired.
X_train, X_test = X_train.dropna(), X_test.dropna()
Y_train, Y_test = Y_train.loc[X_train.index], Y_test.loc[X_test.index]

In [None]:
# Repeat the column check after encoding: both feature frames should expose
# the same columns.  Shapes are printed last.
x_train, x_test = set(X_train.columns), set(X_test.columns)

diff_train = x_train.difference(x_test)
diff_test = x_test.difference(x_train)

print("Columns in X_train but not in X_test:", diff_train)
print("Columns in X_test but not in X_train:", diff_test)

print(X_train.shape, X_test.shape)

In [None]:
# Turn the label Series into single-column DataFrames before saving.
Y_train, Y_test = Y_train.to_frame(), Y_test.to_frame()

# Show the resulting type as the cell output.
type(Y_test)

In [None]:
# Distinct target values in the training labels.
Y_train.loc[:, 'label'].unique()

In [None]:
# Distinct target values in the test labels.
Y_test.loc[:, 'label'].unique()

In [None]:
# Persist the processed training split.  to_csv does not create missing parent
# directories, so make sure the output folder exists first.
os.makedirs('./data/processed/NB15/binary', exist_ok=True)

X_train.to_csv('./data/processed/NB15/binary/X_train.csv', index=False)
Y_train.to_csv('./data/processed/NB15/binary/Y_train.csv', index=False)

In [None]:
# Persist the processed test split.  to_csv does not create missing parent
# directories, so make sure the output folder exists first.
os.makedirs('./data/processed/NB15/binary', exist_ok=True)

X_test.to_csv('./data/processed/NB15/binary/X_test.csv', index=False)
Y_test.to_csv('./data/processed/NB15/binary/Y_test.csv', index=False)