In [1]:
import pandas as pd
import numpy as np
# import utility

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.base import clone
import time
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_curve, roc_auc_score, auc, recall_score, precision_score, f1_score

from sklearn.preprocessing import StandardScaler


In [2]:
import os
import  sys
# Resolve the working directory and its two ancestors; the ancestors are added
# to the import search path (presumably so `import utils.utility` can resolve
# from one of them — verify against the repository layout).
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
grandparent_dir = os.path.dirname(parent_dir)

# Keep the original order (grandparent first, then parent) and skip entries that
# are already present, so re-running this cell does not pile up duplicates.
for _extra_dir in (grandparent_dir, parent_dir):
    if _extra_dir not in sys.path:
        sys.path.append(_extra_dir)

import utils.utility as utility


In [None]:
# Placeholder location of the dataset folder — replace with the real path before running.
# The value ends with a trailing slash so a file name can be appended to it directly.
data_dir = 'path/to/HDMLFS/data_folder/'

In [4]:
# Build the train/test file locations with os.path.join so the paths stay valid
# whether or not data_dir ends with a path separator.
dataset_train_paths = os.path.join(data_dir, 'KDDTrain+.txt')
dataset_test_paths = os.path.join(data_dir, 'KDDTest+.txt')

In [5]:
# Read both raw files into DataFrames (training first, then test).
# The files carry no header row, so columns are numbered positionally for now.
df_train, df_test = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (dataset_train_paths, dataset_test_paths)
)

In [6]:
# Preview the first two raw training rows; columns are still unnamed (integer positions).
df_train.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15


In [7]:
# Preview the first two raw test rows; columns are still unnamed (integer positions).
df_test.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,0,tcp,private,REJ,0,0,0,0,0,0,...,0.04,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21
1,0,tcp,private,REJ,0,0,0,0,0,0,...,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21


In [8]:
# Show (rows, columns) for both splits; the column counts need to match because
# the test set later reuses the training set's column names.
df_train.shape, df_test.shape

((125973, 43), (22544, 43))

In [9]:
# Replace the 43 positional column labels of the training set with descriptive names.
# NOTE(review): in the row previews the second-to-last column ('attack') holds label
# strings such as 'normal', while the last column ('outcome') holds integers —
# confirm that these two names describe the columns as intended.
df_train.columns = [
    'duration', 'protocol_type', 'service', 'flag',
    'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
    'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
    'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
    'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login',
    'count', 'srv_count',
    'serror_rate', 'srv_serror_rate',
    'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    'attack', 'outcome',
]


In [10]:
# Preview the training rows again to check the new column names against the values.
df_train.head(2)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,outcome
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15


In [11]:
# Give the test set the same column names as the training set.
# The training frame's column Index is assigned as-is, so both frames must have
# the same number of columns for this to succeed.
df_test.columns = df_train.columns

In [12]:
# Preview the test rows again to check the shared column names against the values.
df_test.head(2)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,outcome
0,0,tcp,private,REJ,0,0,0,0,0,0,...,0.04,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21
1,0,tcp,private,REJ,0,0,0,0,0,0,...,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21


In [13]:
# Count the distinct values in the training set's 'outcome' column.
# NOTE(review): in the row previews 'outcome' holds integers (e.g. 20, 15) while
# 'attack' holds the label strings — confirm this is the column meant to be inspected.
df_train['outcome'].nunique()

22

In [14]:
# Count the distinct values in the test set's 'outcome' column.
# NOTE(review): in the row previews 'outcome' holds integers (e.g. 21) while
# 'attack' holds the label strings — confirm this is the column meant to be inspected.
df_test['outcome'].nunique()

22

In [15]:
# The two splits are about to be merged so they can be preprocessed together.
# Tag every row with its origin first, so the splits can be told apart afterwards.
df_train['set_type'] = 0  # 0 -> row comes from the training set
df_test['set_type'] = 1   # 1 -> row comes from the test set


In [16]:
# Stack the training rows on top of the test rows in a single frame.
# Each split keeps its own row labels here, so index values repeat in the result.
df = pd.concat([df_train, df_test])

In [17]:
# Reset the index of the combined frame to a fresh 0..n-1 range.
# drop=True discards the old row labels instead of keeping them as a new column.
df = df.reset_index(drop=True)

In [18]:
# Run the project-local preprocessing helper on the combined frame, passing 'attack'
# as the column name, and keep its return value in df_sf.
# NOTE(review): the helper's source is not visible here. Judging by the messages it
# prints, it reports zero-variance features, rows removed for uncommon attack types,
# and highly correlated columns — confirm whether it returns a new frame or also
# modifies `df` in place.
df_sf = utility.preprocess_dataset(df, 'attack')

The features that were removed for having zero variance are ['num_outbound_cmds']
Common attack types are: ['normal', 'neptune', 'satan', 'ipsweep', 'smurf', 'portsweep', 'nmap', 'back', 'guess_passwd', 'mscan', 'warezmaster', 'teardrop', 'warezclient', 'apache2', 'processtable', 'snmpguess', 'saint', 'mailbomb', 'pod']
The number of uncommon attacks types are 21. 
 And they are : ['perl', 'worm', 'udpstorm', 'multihop', 'ps', 'sendmail', 'httptunnel', 'ftp_write', 'loadmodule', 'sqlattack', 'xterm', 'land', 'imap', 'spy', 'snmpgetattack', 'buffer_overflow', 'xlock', 'rootkit', 'named', 'xsnoop', 'phf']
The number of rows removed because of uncommon attacks is 559
Correlated columns (threshold > 0.9): [{'num_root': ['num_compromised']}, {'srv_serror_rate': ['serror_rate']}, {'srv_rerror_rate': ['rerror_rate']}, {'dst_host_serror_rate': ['serror_rate', 'srv_serror_rate']}, {'dst_host_srv_serror_rate': ['serror_rate', 'srv_serror_rate', 'dst_host_serror_rate']}, {'dst_host_rerror_rate': 

In [19]:
# Save the combined data to a CSV file without the row index.
# The target sub-folder is created first: DataFrame.to_csv does not create missing
# directories and would otherwise fail on a fresh data folder.
# NOTE(review): this writes `df`, not the `df_sf` returned by
# utility.preprocess_dataset — confirm which frame is meant to be saved here.
output_dir = os.path.join(data_dir, 'preprocessed_featureselection')
os.makedirs(output_dir, exist_ok=True)
df.to_csv(os.path.join(output_dir, '01fulltraintest2.csv'), index=False)