In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.api.types import is_numeric_dtype
import warnings
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree  import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
import itertools
warnings.filterwarnings("ignore")
from xgboost import XGBClassifier
from tabulate import tabulate



In [2]:
df = pd.read_csv('./kddcup99_csv.csv')

In [3]:
df['label'].value_counts()

smurf              280790
neptune            107201
normal              97277
back                 2203
satan                1589
ipsweep              1247
portsweep            1040
warezclient          1020
teardrop              979
pod                   264
nmap                  231
guess_passwd           53
buffer_overflow        30
land                   21
warezmaster            20
imap                   12
rootkit                10
loadmodule              9
ftp_write               8
multihop                7
phf                     4
perl                    3
spy                     2
Name: label, dtype: int64

In [4]:
df = df[df.label.isin(['back','satan',  'ipsweep','portsweep', 'warezclient', 'teardrop' ])]

In [5]:
df

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
19286,0,udp,private,SF,28,0,0,1,0,0,...,1,0.01,0.05,0.01,0.0,0.0,0.0,0.0,0.0,teardrop
19287,0,udp,private,SF,28,0,0,3,0,0,...,2,0.03,0.05,0.03,0.0,0.0,0.0,0.0,0.0,teardrop
19288,0,udp,private,SF,28,0,0,3,0,0,...,3,0.04,0.05,0.04,0.0,0.0,0.0,0.0,0.0,teardrop
19289,0,udp,private,SF,28,0,0,3,0,0,...,4,0.05,0.05,0.05,0.0,0.0,0.0,0.0,0.0,teardrop
19290,0,udp,private,SF,28,0,0,3,0,0,...,5,0.06,0.05,0.06,0.0,0.0,0.0,0.0,0.0,teardrop
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490959,0,udp,private,SF,28,0,0,3,0,0,...,96,0.38,0.01,0.38,0.0,0.0,0.0,0.0,0.0,teardrop
490960,0,udp,private,SF,28,0,0,3,0,0,...,97,0.38,0.01,0.38,0.0,0.0,0.0,0.0,0.0,teardrop
490961,0,udp,private,SF,28,0,0,3,0,0,...,98,0.38,0.01,0.38,0.0,0.0,0.0,0.0,0.0,teardrop
490962,0,udp,private,SF,28,0,0,3,0,0,...,99,0.39,0.01,0.39,0.0,0.0,0.0,0.0,0.0,teardrop


In [6]:
label_encoder = LabelEncoder()
def labelEncode(df):
    for col in df.columns:
        if df[col].dtype == 'object':
                df[col] = label_encoder.fit_transform(df[col])

labelEncode(df)

In [7]:
df['label'].value_counts()

0    2203
3    1589
1    1247
2    1040
5    1020
4     979
Name: label, dtype: int64

In [8]:
test = []
train = []

for i in df['label'].unique():
    label_size = int(len(df[df['label']==i])*0.2)
    test_data = df[df['label']==i].sample(label_size)
    test.append(test_data) 
    
df_test = pd.concat(test)
df_test.reset_index(inplace=True)
df.reset_index(inplace=True)
df_train = df[~(df['index'].isin((df_test['index'].unique())))]

df_test.drop('index', axis=1, inplace=True)
df_train.drop('index', axis=1, inplace=True)

In [9]:
X_train = df_train.drop('label', axis=1)
y_train = df_train['label']
X_test = df_test.drop('label', axis=1)
y_test = df_test['label']

In [10]:
from sklearn.metrics import accuracy_score

model1 = DecisionTreeClassifier()
model1.fit(X_train, y_train)

accuracy_score(model1.predict(X_test), y_test)

0.9993800371977681

In [11]:
model = RandomForestClassifier()

rfe = RFE(model, n_features_to_select=15)
rfe = rfe.fit(X_train, y_train)

feature_map = [(i, v) for i, v in itertools.zip_longest(rfe.get_support(), X_train.columns)]
selected_features = [v for i, v in feature_map if i==True]

selected_features

['protocol_type',
 'service',
 'src_bytes',
 'dst_bytes',
 'wrong_fragment',
 'hot',
 'logged_in',
 'lnum_compromised',
 'count',
 'rerror_rate',
 'same_srv_rate',
 'dst_host_same_srv_rate',
 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate']

In [12]:
X_train = X_train[selected_features]
X_test = X_test[selected_features]

In [13]:
from sklearn.metrics import accuracy_score

model2 = DecisionTreeClassifier()
model2.fit(X_train, y_train)

pred2 = model2.predict(X_test)
accuracy_score(pred2, y_test)

0.9981401115933044

In [14]:
accuracy_score(model2.predict(X_train), y_train)

0.99984532095901

In [15]:
from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix(pred2, y_test)


array([[440,   0,   0,   0,   0,   2],
       [  0, 249,   0,   0,   0,   0],
       [  0,   0, 208,   1,   0,   0],
       [  0,   0,   0, 316,   0,   0],
       [  0,   0,   0,   0, 195,   0],
       [  0,   0,   0,   0,   0, 202]], dtype=int64)

In [16]:
print(classification_report(pred2, y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       442
           1       1.00      1.00      1.00       249
           2       1.00      1.00      1.00       209
           3       1.00      1.00      1.00       316
           4       1.00      1.00      1.00       195
           5       0.99      1.00      1.00       202

    accuracy                           1.00      1613
   macro avg       1.00      1.00      1.00      1613
weighted avg       1.00      1.00      1.00      1613

