# Analysis and Classification of Attacks using Realistic Botnet Dataset

In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier 
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report

%matplotlib inline

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Training Dataset
data = pd.read_csv("UNSW_2018_IoT_Botnet_Final_10_best_Training.csv")
data.head()

Unnamed: 0,pkSeqID,proto,saddr,sport,daddr,dport,seq,stddev,N_IN_Conn_P_SrcIP,min,state_number,mean,N_IN_Conn_P_DstIP,drate,srate,max,attack,category,subcategory
0,3142762,udp,192.168.100.150,6551,192.168.100.3,80,251984,1.900363,100,0.0,4,2.687519,100,0.0,0.494549,4.031619,1,DDoS,UDP
1,2432264,tcp,192.168.100.150,5532,192.168.100.3,80,256724,0.078003,38,3.85693,3,3.934927,100,0.0,0.256493,4.012924,1,DDoS,TCP
2,1976315,tcp,192.168.100.147,27165,192.168.100.3,80,62921,0.268666,100,2.9741,3,3.341429,100,0.0,0.29488,3.609205,1,DDoS,TCP
3,1240757,udp,192.168.100.150,48719,192.168.100.3,80,99168,1.823185,63,0.0,4,3.222832,63,0.0,0.461435,4.942302,1,DoS,UDP
4,3257991,udp,192.168.100.147,22461,192.168.100.3,80,105063,0.822418,100,2.979995,4,3.983222,100,0.0,1.002999,4.994452,1,DDoS,UDP


In [4]:
# General Information about the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2934817 entries, 0 to 2934816
Data columns (total 19 columns):
 #   Column             Dtype  
---  ------             -----  
 0   pkSeqID            int64  
 1   proto              object 
 2   saddr              object 
 3   sport              object 
 4   daddr              object 
 5   dport              object 
 6   seq                int64  
 7   stddev             float64
 8   N_IN_Conn_P_SrcIP  int64  
 9   min                float64
 10  state_number       int64  
 11  mean               float64
 12  N_IN_Conn_P_DstIP  int64  
 13  drate              float64
 14  srate              float64
 15  max                float64
 16  attack             int64  
 17  category           object 
 18  subcategory        object 
dtypes: float64(6), int64(6), object(7)
memory usage: 425.4+ MB


## Exploratory Data Analysis

In [5]:
data.describe()

Unnamed: 0,pkSeqID,seq,stddev,N_IN_Conn_P_SrcIP,min,state_number,mean,N_IN_Conn_P_DstIP,drate,srate,max,attack
count,2934817.0,2934817.0,2934817.0,2934817.0,2934817.0,2934817.0,2934817.0,2934817.0,2934817.0,2934817.0,2934817.0,2934817.0
mean,1834209.0,121297.3,0.8869639,82.54997,1.017208,3.134219,2.230471,92.45766,0.4303064,3.12829,3.019269,0.9998739
std,1059058.0,75787.0,0.8036391,24.39019,1.483551,1.187107,1.517766,18.16651,56.23304,784.5494,1.860915,0.0112275
min,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,917109.0,54847.0,0.029997,69.0,0.0,3.0,0.181934,100.0,0.0,0.155845,0.280417,1.0
50%,1834316.0,117737.0,0.792575,100.0,0.0,4.0,2.689973,100.0,0.0,0.28378,4.008429,1.0
75%,2751250.0,184870.0,1.74522,100.0,2.147949,4.0,3.565061,100.0,0.0,0.488,4.292426,1.0
max,3668522.0,262211.0,2.496763,100.0,4.980471,11.0,4.981882,100.0,58823.53,1000000.0,4.999999,1.0


In [6]:
data["proto"].value_counts()

udp          1596819
tcp          1330598
icmp            7228
arp              166
ipv6-icmp          6
Name: proto, dtype: int64

In [7]:
data["saddr"].value_counts()

192.168.100.147              761360
192.168.100.148              738642
192.168.100.150              712260
192.168.100.149              711466
192.168.100.3                  6609
192.168.100.5                  4107
192.168.100.6                   272
192.168.100.7                    34
192.168.100.4                    17
192.168.100.1                    14
192.168.100.27                    9
192.168.100.46                    8
fe80::250:56ff:febe:254           5
192.168.100.55                    3
fe80::c0c0:aa20:45b9:bdd9         2
fe80::250:56ff:febe:89ee          2
fe80::250:56ff:febe:26db          2
fe80::2c6a:ff9b:7e14:166a         2
fe80::250:56ff:febe:c038          2
fe80::250:56ff:febe:e9d9          1
Name: saddr, dtype: int64

In [8]:
data["sport"].value_counts()

0x0303    7156
80        3220
1822       878
60541      869
1216       868
          ... 
27738       31
18992       30
39305       30
0x000d      10
0x0011       8
Name: sport, Length: 65541, dtype: int64

In [9]:
data["daddr"].value_counts()

192.168.100.3      1900562
192.168.100.5       361192
192.168.100.7       332161
192.168.100.6       329679
192.168.100.150       3040
                    ...   
192.33.14.30             1
52.35.35.13              1
216.239.38.10            1
205.251.194.167          1
205.251.194.154          1
Name: daddr, Length: 81, dtype: int64

In [10]:
data["dport"].value_counts()

80      2858794
1          5379
3306       3757
53          275
-1          166
         ...   
4520          1
7751          1
8942          1
8739          1
3764          1
Name: dport, Length: 6906, dtype: int64

In [11]:
data["category"].value_counts()

DDoS              1541315
DoS               1320148
Reconnaissance      72919
Normal                370
Theft                  65
Name: category, dtype: int64

In [12]:
data["attack"].value_counts()

1    2934447
0        370
Name: attack, dtype: int64

In [13]:
data["subcategory"].value_counts()

UDP                  1584650
TCP                  1274843
Service_Scan           58626
OS_Fingerprint         14293
HTTP                    1970
Normal                   370
Keylogging                59
Data_Exfiltration          6
Name: subcategory, dtype: int64

### Extracting 10 best features

In [14]:
ten_best_features = data[['seq','stddev','N_IN_Conn_P_SrcIP', 'min', 'state_number', 'mean', 'N_IN_Conn_P_DstIP',
       'drate', 'srate', 'max']]
target_features = data[['attack','category','subcategory']]

## Data Preprocessing

In [15]:
# Label Encoding the target columns
le = LabelEncoder()
target_features['category'] = le.fit_transform(target_features['category'])
target_features['subcategory'] = le.fit_transform(target_features['subcategory'])

target_features.head()

Unnamed: 0,attack,category,subcategory
0,1,0,7
1,1,0,6
2,1,0,6
3,1,1,7
4,1,0,7


In [16]:
# Train-test Split
X_train, X_test, y_train, y_test = train_test_split(ten_best_features,target_features)

In [17]:
# Scaling the data
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Defining the ML Model Classes

In [18]:
class RandomForest:
    def __init__(self,max_depth):
        self.rfc_attack = RandomForestClassifier(max_depth=max_depth)
        self.rfc_category = RandomForestClassifier(max_depth=max_depth)
        self.rfc_subcategory = RandomForestClassifier(max_depth=max_depth)
    
    def fit(self,X_train,y_train):
        self.rfc_attack.fit(X_train,y_train['attack'])
        
        features_category = np.concatenate((X_train,np.array(y_train['attack']).reshape(-1,1)),axis=1)
        self.rfc_category.fit(features_category,y_train['category'])
        
        features_subcategory = np.concatenate((features_category,np.array(y_train['category']).reshape(-1,1)),axis=1)
        self.rfc_subcategory.fit(features_subcategory,y_train['subcategory'])
        
    def predict(self,X_test):
        predict_attack = self.rfc_attack.predict(X_test)
        
        test_category = np.concatenate((X_test,predict_attack.reshape(-1,1)),axis=1)
        predict_category = self.rfc_category.predict(test_category)
        
        test_subcategory = np.concatenate((test_category,predict_category.reshape(-1,1)),axis=1)
        predict_subcategory = self.rfc_subcategory.predict(test_subcategory)
        
        return pd.DataFrame({'attack':predict_attack,'category':predict_category,'subcategory':predict_subcategory})

In [19]:
class NaiveBayes:
    def __init__(self):
        self.nb_attack = GaussianNB()
        self.nb_category = GaussianNB()
        self.nb_subcategory = GaussianNB()
    
    def fit(self,X_train,y_train):
        self.nb_attack.fit(X_train,y_train['attack'])
        
        features_category = np.concatenate((X_train,np.array(y_train['attack']).reshape(-1,1)),axis=1)
        self.nb_category.fit(features_category,y_train['category'])
        
        features_subcategory = np.concatenate((features_category,np.array(y_train['category']).reshape(-1,1)),axis=1)
        self.nb_subcategory.fit(features_subcategory,y_train['subcategory'])
        
    def predict(self,X_test):
        predict_attack = self.nb_attack.predict(X_test)
        
        test_category = np.concatenate((X_test,predict_attack.reshape(-1,1)),axis=1)
        predict_category = self.nb_category.predict(test_category)
        
        test_subcategory = np.concatenate((test_category,predict_category.reshape(-1,1)),axis=1)
        predict_subcategory = self.nb_subcategory.predict(test_subcategory)
        
        return pd.DataFrame({'attack':predict_attack,'category':predict_category,'subcategory':predict_subcategory})

In [20]:
class DecisionTree:
    def __init__(self,criterion,max_depth=5):
        self.dtree_attack = DecisionTreeClassifier(criterion=criterion,max_depth=max_depth)
        self.dtree_category = DecisionTreeClassifier(criterion=criterion,max_depth=max_depth)
        self.dtree_subcategory = DecisionTreeClassifier(criterion=criterion,max_depth=max_depth)
    
    def fit(self,X_train,y_train):
        self.dtree_attack.fit(X_train,y_train['attack'])
        
        features_category = np.concatenate((X_train,np.array(y_train['attack']).reshape(-1,1)),axis=1)
        self.dtree_category.fit(features_category,y_train['category'])
        
        features_subcategory = np.concatenate((features_category,np.array(y_train['category']).reshape(-1,1)),axis=1)
        self.dtree_subcategory.fit(features_subcategory,y_train['subcategory'])
        
    def predict(self,X_test):
        predict_attack = self.dtree_attack.predict(X_test)
        
        test_category = np.concatenate((X_test,predict_attack.reshape(-1,1)),axis=1)
        predict_category = self.dtree_category.predict(test_category)
        
        test_subcategory = np.concatenate((test_category,predict_category.reshape(-1,1)),axis=1)
        predict_subcategory = self.dtree_subcategory.predict(test_subcategory)
        
        return pd.DataFrame({'attack':predict_attack,'category':predict_category,'subcategory':predict_subcategory})

In [21]:
class GradientBoost:
    def __init__(self):
        self.xgb_attack = XGBClassifier()
        self.xgb_category = XGBClassifier()
        self.xgb_subcategory = XGBClassifier()
    
    def fit(self,X_train,y_train):
        self.xgb_attack.fit(X_train,y_train['attack'])
        
        features_category = np.concatenate((X_train,np.array(y_train['attack']).reshape(-1,1)),axis=1)
        self.xgb_category.fit(features_category,y_train['category'])
        
        features_subcategory = np.concatenate((features_category,np.array(y_train['category']).reshape(-1,1)),axis=1)
        self.xgb_subcategory.fit(features_subcategory,y_train['subcategory'])
        
    def predict(self,X_test):
        predict_attack = self.xgb_attack.predict(X_test)
        
        test_category = np.concatenate((X_test,predict_attack.reshape(-1,1)),axis=1)
        predict_category = self.xgb_category.predict(test_category)
        
        test_subcategory = np.concatenate((test_category,predict_category.reshape(-1,1)),axis=1)
        predict_subcategory = self.xgb_subcategory.predict(test_subcategory)
        
        return pd.DataFrame({'attack':predict_attack,'category':predict_category,'subcategory':predict_subcategory})

### Validation of ML Models on Training Dataset

**Random Forest**

In [22]:
rf = RandomForest(max_depth=3)
rf.fit(X_train,y_train)

predictions_rfc = rf.predict(X_test)

In [23]:
print(confusion_matrix(y_test['attack'],predictions_rfc['attack']))
print(classification_report(y_test['attack'],predictions_rfc['attack']))

[[     6     94]
 [     0 733605]]
              precision    recall  f1-score   support

           0       1.00      0.06      0.11       100
           1       1.00      1.00      1.00    733605

    accuracy                           1.00    733705
   macro avg       1.00      0.53      0.56    733705
weighted avg       1.00      1.00      1.00    733705



In [24]:
print(confusion_matrix(y_test['category'],predictions_rfc['category']))
print(classification_report(y_test['category'],predictions_rfc['category']))

[[371011  14612      0      0      0]
 [ 38278 291276      0      1      0]
 [     0     42      0     58      0]
 [ 11372    882      0   6157      0]
 [     0      2      0     14      0]]
              precision    recall  f1-score   support

           0       0.88      0.96      0.92    385623
           1       0.95      0.88      0.92    329555
           2       0.00      0.00      0.00       100
           3       0.99      0.33      0.50     18411
           4       0.00      0.00      0.00        16

    accuracy                           0.91    733705
   macro avg       0.56      0.44      0.47    733705
weighted avg       0.91      0.91      0.91    733705



In [25]:
print(confusion_matrix(y_test['subcategory'],predictions_rfc['subcategory']))
print(classification_report(y_test['subcategory'],predictions_rfc['subcategory']))

[[     0      0      0      0      1    504      0]
 [     0      0      0      0     14      1      1]
 [     0      0      0      0     60     36      4]
 [     0      0      0      0    200   3427      3]
 [     0      0      0      0   7135   7641      5]
 [     0      0      0      0      0 318543    188]
 [     0      0      0      0      0      7 395935]]
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       505
           2       0.00      0.00      0.00        16
           3       0.00      0.00      0.00       100
           4       0.00      0.00      0.00      3630
           5       0.96      0.48      0.64     14781
           6       0.96      1.00      0.98    318731
           7       1.00      1.00      1.00    395942

    accuracy                           0.98    733705
   macro avg       0.42      0.35      0.37    733705
weighted avg       0.98      0.98      0.98    733705



In [26]:
rf.rfc_attack.score(X_test,y_test['attack'])

0.9998718831137855

In [27]:
rf.rfc_category.score(np.concatenate((X_test,np.array(predictions_rfc['attack']).reshape(-1,1)),axis=1),y_test['category'])

0.9110528073271955

In [28]:
rf.rfc_subcategory.score(np.concatenate((X_test,np.array(predictions_rfc['attack']).reshape(-1,1),
                                    np.array(predictions_rfc['category']).reshape(-1,1)),axis=1),y_test['subcategory'])

0.9835192618286641

**Naive Bayes**

In [29]:
nb = NaiveBayes()
nb.fit(X_train,y_train)

predictions_nb = nb.predict(X_test)

In [30]:
print(confusion_matrix(y_test['attack'],predictions_nb['attack']))
print(classification_report(y_test['attack'],predictions_nb['attack']))

[[    93      7]
 [  2774 730831]]
              precision    recall  f1-score   support

           0       0.03      0.93      0.06       100
           1       1.00      1.00      1.00    733605

    accuracy                           1.00    733705
   macro avg       0.52      0.96      0.53    733705
weighted avg       1.00      1.00      1.00    733705



In [31]:
print(confusion_matrix(y_test['category'],predictions_nb['category']))
print(classification_report(y_test['category'],predictions_nb['category']))

[[368899  16551     35    138      0]
 [183002 145492    658    403      0]
 [     0      6     93      1      0]
 [ 10850   1389   2077   4095      0]
 [     0      0      4      0     12]]
              precision    recall  f1-score   support

           0       0.66      0.96      0.78    385623
           1       0.89      0.44      0.59    329555
           2       0.03      0.93      0.06       100
           3       0.88      0.22      0.36     18411
           4       1.00      0.75      0.86        16

    accuracy                           0.71    733705
   macro avg       0.69      0.66      0.53    733705
weighted avg       0.77      0.71      0.68    733705



In [32]:
print(confusion_matrix(y_test['subcategory'],predictions_nb['subcategory']))
print(classification_report(y_test['subcategory'],predictions_nb['subcategory']))

[[   325      0      8     34    122     16      0]
 [     0     12      4      0      0      0      0]
 [     1      0     93      1      0      5      0]
 [   229      0    121      1    137   3142      0]
 [   579      0   1956    761   3196   8289      0]
 [   818      0    676     18    367 316852      0]
 [     0      0      9      0      0     24 395909]]
              precision    recall  f1-score   support

           1       0.17      0.64      0.26       505
           2       1.00      0.75      0.86        16
           3       0.03      0.93      0.06       100
           4       0.00      0.00      0.00      3630
           5       0.84      0.22      0.34     14781
           6       0.97      0.99      0.98    318731
           7       1.00      1.00      1.00    395942

    accuracy                           0.98    733705
   macro avg       0.57      0.65      0.50    733705
weighted avg       0.98      0.98      0.97    733705



**Decision Tree (Information Gain)**

In [33]:
dtree_ig = DecisionTree('entropy')
dtree_ig.fit(X_train,y_train)
predictions_dtree_ig = dtree_ig.predict(X_test)

In [34]:
print(confusion_matrix(y_test['attack'],predictions_dtree_ig['attack']))
print(classification_report(y_test['attack'],predictions_dtree_ig['attack']))

[[    40     60]
 [     4 733601]]
              precision    recall  f1-score   support

           0       0.91      0.40      0.56       100
           1       1.00      1.00      1.00    733605

    accuracy                           1.00    733705
   macro avg       0.95      0.70      0.78    733705
weighted avg       1.00      1.00      1.00    733705



In [35]:
print(confusion_matrix(y_test['category'],predictions_dtree_ig['category']))
print(classification_report(y_test['category'],predictions_dtree_ig['category']))

[[339221  45872      0    530      0]
 [ 11603 317911      0     41      0]
 [     7     13     33     47      0]
 [   355   7565      1  10490      0]
 [     0      1      0     15      0]]
              precision    recall  f1-score   support

           0       0.97      0.88      0.92    385623
           1       0.86      0.96      0.91    329555
           2       0.97      0.33      0.49       100
           3       0.94      0.57      0.71     18411
           4       0.00      0.00      0.00        16

    accuracy                           0.91    733705
   macro avg       0.75      0.55      0.61    733705
weighted avg       0.92      0.91      0.91    733705



In [36]:
print(confusion_matrix(y_test['subcategory'],predictions_dtree_ig['subcategory']))
print(classification_report(y_test['subcategory'],predictions_dtree_ig['subcategory']))

[[   387      0      0      0     79     39      0]
 [     1      1      0      0     14      0      0]
 [     0      0     33      8     37     10     12]
 [     0      0      2    949    506   2154     19]
 [    22      0      0    298   8738   5712     11]
 [    14      0      0      0    478 318237      2]
 [     0      0      0      0      0      4 395938]]
              precision    recall  f1-score   support

           1       0.91      0.77      0.83       505
           2       1.00      0.06      0.12        16
           3       0.94      0.33      0.49       100
           4       0.76      0.26      0.39      3630
           5       0.89      0.59      0.71     14781
           6       0.98      1.00      0.99    318731
           7       1.00      1.00      1.00    395942

    accuracy                           0.99    733705
   macro avg       0.92      0.57      0.65    733705
weighted avg       0.99      0.99      0.99    733705



In [37]:
dtree_ig.dtree_attack.score(X_test,y_test['attack'])

0.9999127714817263

In [38]:
dtree_ig.dtree_category.score(np.concatenate((X_test,np.array(predictions_dtree_ig['attack']).reshape(-1,1)),axis=1),y_test['category'])

0.9099774432503527

In [39]:
dtree_ig.dtree_subcategory.score(np.concatenate((X_test,np.array(predictions_dtree_ig['attack']).reshape(-1,1),
                                        np.array(predictions_dtree_ig['category']).reshape(-1,1)),axis=1),y_test['subcategory'])

0.9871583265753947

**Decision Tree (Gini Index)**

In [40]:
dtree_gini = DecisionTree('gini')
dtree_gini.fit(X_train,y_train)
predictions_dtree_gini = dtree_gini.predict(X_test)

In [41]:
print(confusion_matrix(y_test['attack'],predictions_dtree_gini['attack']))
print(classification_report(y_test['attack'],predictions_dtree_gini['attack']))

[[    51     49]
 [    10 733595]]
              precision    recall  f1-score   support

           0       0.84      0.51      0.63       100
           1       1.00      1.00      1.00    733605

    accuracy                           1.00    733705
   macro avg       0.92      0.75      0.82    733705
weighted avg       1.00      1.00      1.00    733705



In [42]:
print(confusion_matrix(y_test['category'],predictions_dtree_gini['category']))
print(classification_report(y_test['category'],predictions_dtree_gini['category']))

[[338156  46990      0    477      0]
 [  8479 321072      1      3      0]
 [     0     19     41     40      0]
 [    78   7496      4  10833      0]
 [     0      1      1     14      0]]
              precision    recall  f1-score   support

           0       0.98      0.88      0.92    385623
           1       0.85      0.97      0.91    329555
           2       0.87      0.41      0.56       100
           3       0.95      0.59      0.73     18411
           4       0.00      0.00      0.00        16

    accuracy                           0.91    733705
   macro avg       0.73      0.57      0.62    733705
weighted avg       0.92      0.91      0.91    733705



In [43]:
print(confusion_matrix(y_test['subcategory'],predictions_dtree_gini['subcategory']))
print(classification_report(y_test['subcategory'],predictions_dtree_gini['subcategory']))

[[   413      0      0      0      4     88      0]
 [     0      1      0      0     15      0      0]
 [    10      0      4      9     71      4      2]
 [     7      0      0    819    694   2110      0]
 [    16      0      1     71   9253   5440      0]
 [    67      0      0      0    477 318187      0]
 [     0      0      0      0      0      7 395935]]
              precision    recall  f1-score   support

           1       0.81      0.82      0.81       505
           2       1.00      0.06      0.12        16
           3       0.80      0.04      0.08       100
           4       0.91      0.23      0.36      3630
           5       0.88      0.63      0.73     14781
           6       0.98      1.00      0.99    318731
           7       1.00      1.00      1.00    395942

    accuracy                           0.99    733705
   macro avg       0.91      0.54      0.58    733705
weighted avg       0.99      0.99      0.99    733705



In [44]:
dtree_gini.dtree_attack.score(X_test,y_test['attack'])

0.9999195862097164

In [45]:
dtree_gini.dtree_category.score(np.concatenate((X_test,np.array(predictions_dtree_gini['attack']).reshape(-1,1)),axis=1),y_test['category'])

0.9133125711287234

In [46]:
dtree_gini.dtree_subcategory.score(np.concatenate((X_test,np.array(predictions_dtree_gini['attack']).reshape(-1,1),
                                        np.array(predictions_dtree_gini['category']).reshape(-1,1)),axis=1),y_test['subcategory'])

0.9876067356771454

**Gradient Boost**

In [47]:
xgb = GradientBoost()
xgb.fit(X_train,y_train)
predictions_xgb = xgb.predict(X_test)



In [48]:
print(confusion_matrix(y_test['attack'],predictions_xgb['attack']))
print(classification_report(y_test['attack'],predictions_xgb['attack']))

[[    99      1]
 [     1 733604]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       100
           1       1.00      1.00      1.00    733605

    accuracy                           1.00    733705
   macro avg       0.99      0.99      0.99    733705
weighted avg       1.00      1.00      1.00    733705



In [49]:
print(confusion_matrix(y_test['category'],predictions_xgb['category']))
print(classification_report(y_test['category'],predictions_xgb['category']))

[[385608     15      0      0      0]
 [    14 329541      0      0      0]
 [     0      1     99      0      0]
 [     1      0      1  18409      0]
 [     0      1      0      1     14]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    385623
           1       1.00      1.00      1.00    329555
           2       0.99      0.99      0.99       100
           3       1.00      1.00      1.00     18411
           4       1.00      0.88      0.93        16

    accuracy                           1.00    733705
   macro avg       1.00      0.97      0.98    733705
weighted avg       1.00      1.00      1.00    733705



In [50]:
print(confusion_matrix(y_test['subcategory'],predictions_xgb['subcategory']))
print(classification_report(y_test['subcategory'],predictions_xgb['subcategory']))

[[   504      0      0      0      0      1      0]
 [     1     14      0      0      1      0      0]
 [     0      0     99      0      0      1      0]
 [     0      0      0   3377    252      1      0]
 [     0      0      1    212  14568      0      0]
 [     0      0      0      0      0 318730      1]
 [     0      0      0      0      0      4 395938]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       505
           2       1.00      0.88      0.93        16
           3       0.99      0.99      0.99       100
           4       0.94      0.93      0.94      3630
           5       0.98      0.99      0.98     14781
           6       1.00      1.00      1.00    318731
           7       1.00      1.00      1.00    395942

    accuracy                           1.00    733705
   macro avg       0.99      0.97      0.98    733705
weighted avg       1.00      1.00      1.00    733705



In [51]:
xgb.xgb_attack.score(X_test,y_test['attack'])

0.9999972741088039

In [52]:
xgb.xgb_category.score(np.concatenate((X_test,np.array(predictions_xgb['attack']).reshape(-1,1)),axis=1),y_test['category'])

0.9999536598496671

In [53]:
xgb.xgb_subcategory.score(np.concatenate((X_test,np.array(predictions_xgb['attack']).reshape(-1,1),
                                        np.array(predictions_xgb['category']).reshape(-1,1)),axis=1),y_test['subcategory'])

0.9993526008409375

### Test Data

In [55]:
test_data = pd.read_csv("UNSW_2018_IoT_Botnet_Final_10_best_Testing.csv")
test_data.head()

Unnamed: 0,pkSeqID,proto,saddr,sport,daddr,dport,seq,stddev,N_IN_Conn_P_SrcIP,min,state_number,mean,N_IN_Conn_P_DstIP,drate,srate,max,attack,category,subcategory
0,792371,udp,192.168.100.150,48516,192.168.100.3,80,175094,0.226784,100,4.100436,4,4.457383,100,0.0,0.404711,4.719438,1,DoS,UDP
1,2056418,tcp,192.168.100.148,22267,192.168.100.3,80,143024,0.451998,100,3.439257,1,3.806172,100,0.225077,0.401397,4.44293,1,DDoS,TCP
2,2795650,udp,192.168.100.149,28629,192.168.100.3,80,167033,1.931553,73,0.0,4,2.731204,100,0.0,0.407287,4.138455,1,DDoS,UDP
3,2118009,tcp,192.168.100.148,42142,192.168.100.3,80,204615,0.428798,56,3.271411,1,3.626428,100,0.0,0.343654,4.2297,1,DDoS,TCP
4,303688,tcp,192.168.100.149,1645,192.168.100.5,80,40058,2.058381,100,0.0,3,1.188407,100,0.0,0.135842,4.753628,1,DoS,TCP


In [56]:
# Extracting the ten-best features from test set
test_ten_best = test_data[ten_best_features.columns]
test_labels = test_data[['attack','category','subcategory']]
test_ten_best.head()

Unnamed: 0,seq,stddev,N_IN_Conn_P_SrcIP,min,state_number,mean,N_IN_Conn_P_DstIP,drate,srate,max
0,175094,0.226784,100,4.100436,4,4.457383,100,0.0,0.404711,4.719438
1,143024,0.451998,100,3.439257,1,3.806172,100,0.225077,0.401397,4.44293
2,167033,1.931553,73,0.0,4,2.731204,100,0.0,0.407287,4.138455
3,204615,0.428798,56,3.271411,1,3.626428,100,0.0,0.343654,4.2297
4,40058,2.058381,100,0.0,3,1.188407,100,0.0,0.135842,4.753628


**Using complete training data and test data**

In [57]:
# Standard Scaling
train_data = sc.fit_transform(ten_best_features)
test = sc.transform(test_ten_best)

In [72]:
ten_best_features = data[['seq','stddev','N_IN_Conn_P_SrcIP', 'min', 'state_number', 'mean', 'N_IN_Conn_P_DstIP',
       'drate', 'srate', 'max']]
target_features = data[['attack','category','subcategory']]

In [73]:
# Label Encoding

target_features['category'] = le.fit_transform(target_features['category'])
test_labels['category'] = le.transform(test_labels['category'])

target_features['subcategory'] = le.fit_transform(target_features['subcategory'])
test_labels['subcategory'] = le.transform(test_labels['subcategory'])

### Training ML Models on Complete Data and Testing on test set

**Random Forest**

In [74]:
rf_clf = RandomForest(max_depth=5)
rf_clf.fit(train_data,target_features)
predictions_rf = rf_clf.predict(test)

In [75]:
print("Random Forest: Attack\n")
print(confusion_matrix(test_labels['attack'],predictions_rf['attack']),"\n")
print(classification_report(test_labels['attack'],predictions_rf['attack']))

Random Forest: Attack

[[    38     69]
 [     0 733598]] 

              precision    recall  f1-score   support

           0       1.00      0.36      0.52       107
           1       1.00      1.00      1.00    733598

    accuracy                           1.00    733705
   macro avg       1.00      0.68      0.76    733705
weighted avg       1.00      1.00      1.00    733705



In [76]:
print("Random Forest: Category\n")
print(confusion_matrix(test_labels['category'],predictions_rf['category']))
print(classification_report(test_labels['category'],predictions_rf['category']))

Random Forest: Category

[[372668  12639      0      2      0]
 [ 10976 319135      0      1      0]
 [     1     25     26     55      0]
 [   498     33      0  17632      0]
 [     0      5      0      9      0]]
              precision    recall  f1-score   support

           0       0.97      0.97      0.97    385309
           1       0.96      0.97      0.96    330112
           2       1.00      0.24      0.39       107
           3       1.00      0.97      0.98     18163
           4       0.00      0.00      0.00        14

    accuracy                           0.97    733705
   macro avg       0.79      0.63      0.66    733705
weighted avg       0.97      0.97      0.97    733705



In [77]:
print("Random Forest: Subcategory\n")
print(confusion_matrix(test_labels['subcategory'],predictions_rf['subcategory']))
print(classification_report(test_labels['subcategory'],predictions_rf['subcategory']))

Random Forest: Subcategory

[[    77      0      0      0      2    425      0]
 [     0      0      0      0      9      0      5]
 [     1      0     26      0     55     23      2]
 [     0      0      0      0   3486    134      1]
 [     0      0      0      0  14146    395      1]
 [     0      0      0      0      1 318214    122]
 [     0      0      0      0      0     12 396568]]
              precision    recall  f1-score   support

           1       0.99      0.15      0.26       504
           2       0.00      0.00      0.00        14
           3       1.00      0.24      0.39       107
           4       0.00      0.00      0.00      3621
           5       0.80      0.97      0.88     14542
           6       1.00      1.00      1.00    318337
           7       1.00      1.00      1.00    396580

    accuracy                           0.99    733705
   macro avg       0.68      0.48      0.50    733705
weighted avg       0.99      0.99      0.99    733705



**Naive Bayes**

In [78]:
nb_clf = NaiveBayes()
nb_clf.fit(train_data,target_features)
predictions_nb = nb_clf.predict(test)

In [79]:
print("Naive Bayes: Attack\n")
print(confusion_matrix(test_labels['attack'],predictions_nb['attack']))
print(classification_report(test_labels['attack'],predictions_nb['attack']))

Naive Bayes: Attack

[[    96     11]
 [  2775 730823]]
              precision    recall  f1-score   support

           0       0.03      0.90      0.06       107
           1       1.00      1.00      1.00    733598

    accuracy                           1.00    733705
   macro avg       0.52      0.95      0.53    733705
weighted avg       1.00      1.00      1.00    733705



In [80]:
print("Naive Bayes: Category\n")
print(confusion_matrix(test_labels['category'],predictions_nb['category']),"\n")
print(classification_report(test_labels['category'],predictions_nb['category']))

Naive Bayes: Category

[[368894  16264     31    120      0]
 [182703 146381    699    329      0]
 [     0      9     96      2      0]
 [ 10751   1630   2040   3742      0]
 [     0      1      5      0      8]] 

              precision    recall  f1-score   support

           0       0.66      0.96      0.78    385309
           1       0.89      0.44      0.59    330112
           2       0.03      0.90      0.06       107
           3       0.89      0.21      0.33     18163
           4       1.00      0.57      0.73        14

    accuracy                           0.71    733705
   macro avg       0.69      0.62      0.50    733705
weighted avg       0.77      0.71      0.68    733705



In [81]:
print("Naive Bayes: Subcategory\n")
print(confusion_matrix(test_labels['subcategory'],predictions_nb['subcategory']))
print(classification_report(test_labels['subcategory'],predictions_nb['subcategory']))

Naive Bayes: Subcategory

[[     0      0      0      0      0      0      0      0]
 [     0    337      0      7      0    146     14      0]
 [     1      0      7      5      0      0      1      0]
 [     0      0      0     96      1      1      9      0]
 [     0    199      0    128      0    151   3143      0]
 [     0    578      0   1912    612   2979   8461      0]
 [     0    815      0    715      5    296 316506      0]
 [     0      0      0      8      2      0     23 396547]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.17      0.67      0.28       504
           2       1.00      0.50      0.67        14
           3       0.03      0.90      0.06       107
           4       0.00      0.00      0.00      3621
           5       0.83      0.20      0.33     14542
           6       0.96      0.99      0.98    318337
           7       1.00      1.00      1.00    396580

    accuracy 

**Decision Tree (Information Gain)**

In [82]:
dtree_ig_clf = DecisionTree('entropy')
dtree_ig_clf.fit(train_data,target_features)
predictions_ig = dtree_ig_clf.predict(test)

In [83]:
print("Decision Tree (Information Gain): Attack\n")
print(confusion_matrix(test_labels['attack'],predictions_ig['attack']),"\n")
print(classification_report(test_labels['attack'],predictions_ig['attack']))

Decision Tree (Information Gain): Attack

[[    40     67]
 [     2 733596]] 

              precision    recall  f1-score   support

           0       0.95      0.37      0.54       107
           1       1.00      1.00      1.00    733598

    accuracy                           1.00    733705
   macro avg       0.98      0.69      0.77    733705
weighted avg       1.00      1.00      1.00    733705



In [84]:
print("Decision Tree (Information Gain): Category\n")
print(confusion_matrix(test_labels['category'],predictions_ig['category']))
print(classification_report(test_labels['category'],predictions_ig['category']))

Decision Tree (Information Gain): Category

[[339497  45290      0    522      0]
 [ 11703 318367      0     42      0]
 [     9     15     33     50      0]
 [   351   7450      2  10360      0]
 [     0      0      0     14      0]]
              precision    recall  f1-score   support

           0       0.97      0.88      0.92    385309
           1       0.86      0.96      0.91    330112
           2       0.94      0.31      0.46       107
           3       0.94      0.57      0.71     18163
           4       0.00      0.00      0.00        14

    accuracy                           0.91    733705
   macro avg       0.74      0.54      0.60    733705
weighted avg       0.92      0.91      0.91    733705



In [85]:
print("Decision Tree (Information Gain): Subcategory\n")
print(confusion_matrix(test_labels['subcategory'],predictions_ig['subcategory']),"\n")
print(classification_report(test_labels['subcategory'],predictions_ig['subcategory']))

Decision Tree (Information Gain): Subcategory

[[   384      0      0      0     87     33      0]
 [     0      5      0      0      9      0      0]
 [     0      0     48      4     44      9      2]
 [     0      0      2    941    574   2090     14]
 [    27      0      5    222   8623   5654     11]
 [    23      0      0      0    453 317861      0]
 [     0      0      0      0      0      8 396572]] 

              precision    recall  f1-score   support

           1       0.88      0.76      0.82       504
           2       1.00      0.36      0.53        14
           3       0.87      0.45      0.59       107
           4       0.81      0.26      0.39      3621
           5       0.88      0.59      0.71     14542
           6       0.98      1.00      0.99    318337
           7       1.00      1.00      1.00    396580

    accuracy                           0.99    733705
   macro avg       0.92      0.63      0.72    733705
weighted avg       0.99      0.99      0.99 

**Decision Tree (Gini Index)**

In [86]:
dtree_gi_clf = DecisionTree('gini')
dtree_gi_clf.fit(train_data,target_features)
predictions_gi = dtree_gi_clf.predict(test)

In [87]:
print("Decision Tree (Gini Index): Attack\n")
print(confusion_matrix(test_labels['attack'],predictions_gi['attack']),"\n")
print(classification_report(test_labels['attack'],predictions_gi['attack']))

Decision Tree (Gini Index): Attack

[[    38     69]
 [     3 733595]] 

              precision    recall  f1-score   support

           0       0.93      0.36      0.51       107
           1       1.00      1.00      1.00    733598

    accuracy                           1.00    733705
   macro avg       0.96      0.68      0.76    733705
weighted avg       1.00      1.00      1.00    733705



In [88]:
print("Decision Tree (Gini Index): Category\n")
print(confusion_matrix(test_labels['category'],predictions_gi['category']))
print(classification_report(test_labels['category'],predictions_gi['category']))

Decision Tree (Gini Index): Category

[[338396  46453      0    460      0]
 [  8631 321472      1      8      0]
 [     0     18     32     57      0]
 [    68   7383      2  10710      0]
 [     0      5      0      9      0]]
              precision    recall  f1-score   support

           0       0.97      0.88      0.92    385309
           1       0.86      0.97      0.91    330112
           2       0.91      0.30      0.45       107
           3       0.95      0.59      0.73     18163
           4       0.00      0.00      0.00        14

    accuracy                           0.91    733705
   macro avg       0.74      0.55      0.60    733705
weighted avg       0.92      0.91      0.91    733705



In [89]:
print("Decision Tree (Gini Index): Subcategory\n")
print(confusion_matrix(test_labels['subcategory'],predictions_gi['subcategory']),"\n")
print(classification_report(test_labels['subcategory'],predictions_gi['subcategory']))

Decision Tree (Gini Index): Subcategory

[[   416      0      0      0     10     78      0]
 [     0      5      0      0      9      0      0]
 [     1      0      5      5     82     14      0]
 [     0      0      1    860    710   2050      0]
 [     4      3      0     71   9069   5394      1]
 [    63      0      0      0    459 317814      1]
 [     0      0      0      0      0      9 396571]] 

              precision    recall  f1-score   support

           1       0.86      0.83      0.84       504
           2       0.62      0.36      0.45        14
           3       0.83      0.05      0.09       107
           4       0.92      0.24      0.38      3621
           5       0.88      0.62      0.73     14542
           6       0.98      1.00      0.99    318337
           7       1.00      1.00      1.00    396580

    accuracy                           0.99    733705
   macro avg       0.87      0.58      0.64    733705
weighted avg       0.99      0.99      0.99    733

**Gradient Boost**

In [90]:
xgb_clf = GradientBoost()
xgb_clf.fit(train_data,target_features)
predictions_gb = xgb_clf.predict(test)



In [91]:
print("Gradient Boost: Attack\n")
print(confusion_matrix(test_labels['attack'],predictions_gb['attack']),"\n")
print(classification_report(test_labels['attack'],predictions_gb['attack']))

Gradient Boost: Attack

[[   100      7]
 [     1 733597]] 

              precision    recall  f1-score   support

           0       0.99      0.93      0.96       107
           1       1.00      1.00      1.00    733598

    accuracy                           1.00    733705
   macro avg       1.00      0.97      0.98    733705
weighted avg       1.00      1.00      1.00    733705



In [92]:
print("Gradient Boost: Category\n")
print(confusion_matrix(test_labels['category'],predictions_gb['category']),"\n")
print(classification_report(test_labels['category'],predictions_gb['category']))

Gradient Boost: Category

[[385296     12      0      1      0]
 [    17 330094      1      0      0]
 [     0      0    100      7      0]
 [     0      1      0  18162      0]
 [     0      0      0      0     14]] 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    385309
           1       1.00      1.00      1.00    330112
           2       0.99      0.93      0.96       107
           3       1.00      1.00      1.00     18163
           4       1.00      1.00      1.00        14

    accuracy                           1.00    733705
   macro avg       1.00      0.99      0.99    733705
weighted avg       1.00      1.00      1.00    733705



In [93]:
print("Gradient Boost: Subcategory\n")
print(confusion_matrix(test_labels['subcategory'],predictions_gb['subcategory']),"\n")
print(classification_report(test_labels['subcategory'],predictions_gb['subcategory']))

Gradient Boost: Subcategory

[[   496      0      1      0      1      6      0]
 [     0     14      0      0      0      0      0]
 [     0      0    100      0      7      0      0]
 [     0      0      0   3350    271      0      0]
 [     1      0      0    227  14314      0      0]
 [     0      0      0      0      0 318337      0]
 [     1      0      0      0      0      5 396574]] 

              precision    recall  f1-score   support

           1       1.00      0.98      0.99       504
           2       1.00      1.00      1.00        14
           3       0.99      0.93      0.96       107
           4       0.94      0.93      0.93      3621
           5       0.98      0.98      0.98     14542
           6       1.00      1.00      1.00    318337
           7       1.00      1.00      1.00    396580

    accuracy                           1.00    733705
   macro avg       0.99      0.98      0.98    733705
weighted avg       1.00      1.00      1.00    733705



# END OF NOTEBOOK