# Imports

In [1]:
import numpy as np
import pandas as pd
import sqlalchemy

import os
from datetime import datetime


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier 

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix


import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns 

List cleaned files

In [2]:
for file in os.listdir('Datasets/Cleaned/'):
    print(file)

clean_03-11_Syn.csv
clean_01-12_Syn.csv
clean_01-12_DrDoS_MSSQL.csv
.DS_Store
clean_benign.csv
clean_01-12_DrDoS_SSDP.csv
clean_03-11_MSSQL.csv
clean_01-12_DrDoS_UDP.csv
clean_01-12_DrDoS_NetBIOS.csv
clean_03-11_UDP.csv
clean_01-12_TFTP.csv
clean_03-11_NetBIOS.csv
clean_01-12_DrDoS_SNMP.csv
clean_01-12_DrDoS_LDAP.csv
clean_01-12_DrDoS_DNS.csv
clean_01-12_UDPLag.csv
clean_03-11_Portmap.csv
clean_01-12_DrDoS_NTP.csv
clean_03-11_LDAP.csv
clean_03-11_UDPLag.csv


Load and combine files

In [11]:
df = pd.read_csv('Datasets/Cleaned/clean_benign.csv', index_col=0, nrows=1800000)
print(datetime.now().time(), 'Benign Database Shape: ', df.shape, '\n')

for file in os.listdir('Datasets/Cleaned/'):
    if file[0] == '.':
        print('Dot File\n')
        pass
    elif file == 'clean_benign.csv':
        print(file, '\n')
        pass
    else:
        temp_df = pd.read_csv(f'Datasets/Cleaned/{file}', index_col=0, nrows=100000)
        print(datetime.now().time(), '-', file)
        print('Shape: ', temp_df.shape, '\n')
        df = pd.concat([df, temp_df])
        del temp_df
        
df.reset_index(drop=True, inplace=True)
print('Combined Database Shape: ', df.shape)

  mask |= (ar1 == a)


14:29:46.313734 Benign Database Shape:  (1800000, 83) 

14:29:47.077533 - clean_03-11_Syn.csv
Shape:  (100000, 83) 

14:29:49.484264 - clean_01-12_Syn.csv
Shape:  (100000, 83) 

14:29:53.350438 - clean_01-12_DrDoS_MSSQL.csv
Shape:  (100000, 83) 

Dot File

clean_benign.csv 

14:29:57.504298 - clean_01-12_DrDoS_SSDP.csv
Shape:  (100000, 83) 

14:30:01.817835 - clean_03-11_MSSQL.csv
Shape:  (100000, 83) 

14:30:06.446096 - clean_01-12_DrDoS_UDP.csv
Shape:  (100000, 83) 

14:30:11.165834 - clean_01-12_DrDoS_NetBIOS.csv
Shape:  (100000, 83) 

14:30:16.226467 - clean_03-11_UDP.csv
Shape:  (100000, 83) 

14:30:21.482704 - clean_01-12_TFTP.csv
Shape:  (100000, 83) 

14:30:26.905551 - clean_03-11_NetBIOS.csv
Shape:  (100000, 83) 

14:30:32.501768 - clean_01-12_DrDoS_SNMP.csv
Shape:  (100000, 83) 

14:30:38.301051 - clean_01-12_DrDoS_LDAP.csv
Shape:  (100000, 83) 

14:30:44.102087 - clean_01-12_DrDoS_DNS.csv
Shape:  (100000, 83) 

14:30:50.389650 - clean_01-12_UDPLag.csv
Shape:  (100000, 83) 



In [12]:
df.columns

Index(['Flow_ID', 'Flow_Duration', 'Fwd_Total_Pkts', 'Bwd_Total_Pkts',
       'Fwd_Total_Bytes', 'Bwd_Total_Bytes', 'Fwd_Pkt_Length_Max',
       'Fwd_Pkt_Length_Min', 'Fwd_Pkt_Length_Mean', 'Fwd_Pkt_Length_Std',
       'Bwd_Pkt_Length_Max', 'Bwd_Pkt_Length_Min', 'Bwd_Pkt_Length_Mean',
       'Bwd_Pkt_Length_Std', 'Flow_Bytes_Sec', 'Flow_Pkts_Sec',
       'Flow_IAT_Mean', 'Flow_IAT_Std', 'Flow_IAT_Max', 'Flow_IAT_Min',
       'Fwd_IAT_Total', 'Fwd_IAT_Mean', 'Fwd_IAT_Std', 'Fwd_IAT_Max',
       'Fwd_IAT_Min', 'Bwd_IAT_Total', 'Bwd_IAT_Mean', 'Bwd_IAT_Std',
       'Bwd_IAT_Max', 'Bwd_IAT_Min', 'Fwd_PSH_Flags', 'Bwd_PSH_Flags',
       'Fwd_URG_Flags', 'Bwd_URG_Flags', 'Fwd_Header_Length',
       'Bwd_Header_Length', 'Fwd_Pkts_Sec', 'Bwd_Pkts_Sec', 'Pkt_Length_Min',
       'Pkt_Length_Max', 'Pkt_Length_Mean', 'Pkt_Length_Std', 'Pkt_Length_Var',
       'FIN_Flag_Count', 'SYN_Flag_Count', 'RST_Flag_Count', 'PSH_Flag_Count',
       'ACK_Flag_Count', 'URG_Flag_Count', 'CWE_Flag_Count', 'ECE_Fl

# Train Test Split

In [16]:
X = df.drop(['Flow_ID', 'Label', 'HOPOPT', 'Malicious'], axis=1)
y = df['Malicious']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Standard Scalar

In [22]:
for col in df.columns:
    print(col)
    print(max(df[col]))

Flow_ID
8.5.1.51-10.200.7.218-80-53741-6
Flow_Duration
120000000
Fwd_Total_Pkts
453190
Bwd_Total_Pkts
542196
Fwd_Total_Bytes
678023588.0
Bwd_Total_Bytes
1345795830.0
Fwd_Pkt_Length_Max
32832.0
Fwd_Pkt_Length_Min
16060.0
Fwd_Pkt_Length_Mean
16060.0
Fwd_Pkt_Length_Std
6225.4871946384
Bwd_Pkt_Length_Max
37648.0
Bwd_Pkt_Length_Min
13032.0
Bwd_Pkt_Length_Mean
13032.0
Bwd_Pkt_Length_Std
8434.8040331073
Flow_Bytes_Sec
inf
Flow_Pkts_Sec
inf
Flow_IAT_Mean
119999998.0
Flow_IAT_Std
84851741.76850541
Flow_IAT_Max
119999998.0
Flow_IAT_Min
119999998.0
Fwd_IAT_Total
120000000.0
Fwd_IAT_Mean
119999998.0
Fwd_IAT_Std
84851969.456889
Fwd_IAT_Max
119999998.0
Fwd_IAT_Min
119999998.0
Bwd_IAT_Total
119999995.0
Bwd_IAT_Mean
119998626.0
Bwd_IAT_Std
84852748.6885618
Bwd_IAT_Max
119999910.0
Bwd_IAT_Min
119998626.0
Fwd_PSH_Flags
1
Bwd_PSH_Flags
0
Fwd_URG_Flags
0
Bwd_URG_Flags
0
Fwd_Header_Length
15439500
Bwd_Header_Length
12844400
Fwd_Pkts_Sec
6000000.0
Bwd_Pkts_Sec
4000000.0
Pkt_Length_Min
7063.0
Pkt_Length_Max


In [24]:
df['Flow_Bytes_Sec'].describe()

count    3.581411e+06
mean              inf
std               NaN
min      0.000000e+00
25%      1.020358e+02
50%      2.464066e+04
75%      4.490000e+08
max               inf
Name: Flow_Bytes_Sec, dtype: float64

In [27]:
df[df['Flow_Bytes_Sec'] == float('inf')]

Unnamed: 0,Flow_ID,Flow_Duration,Fwd_Total_Pkts,Bwd_Total_Pkts,Fwd_Total_Bytes,Bwd_Total_Bytes,Fwd_Pkt_Length_Max,Fwd_Pkt_Length_Min,Fwd_Pkt_Length_Mean,Fwd_Pkt_Length_Std,...,Time_Active_Min,Time_Idle_Mean,Time_Idle_Std,Time_Idle_Max,Time_Idle_Min,Label,HOPOPT,TCP,UDP,Malicious
1800003,172.16.0.5-192.168.50.4-33828-1431-6,0,2,0,12.0,0.0,6.0,6.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,Syn,0,1,0,1
1800010,172.16.0.5-192.168.50.4-33834-22356-6,0,2,0,12.0,0.0,6.0,6.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,Syn,0,1,0,1
1800029,172.16.0.5-192.168.50.4-33846-55855-6,0,2,0,12.0,0.0,6.0,6.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,Syn,0,1,0,1
1800030,172.16.0.5-192.168.50.4-33847-52855-6,0,2,0,12.0,0.0,6.0,6.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,Syn,0,1,0,1
1800037,172.16.0.5-192.168.50.4-33851-35500-6,0,2,0,12.0,0.0,6.0,6.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,Syn,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3593283,172.16.0.5-192.168.50.4-37400-28749-17,0,2,0,766.0,0.0,383.0,383.0,383.0,0.0,...,0.0,0.0,0.0,0.0,0.0,UDP,0,0,1,1
3593287,172.16.0.5-192.168.50.4-37401-35984-17,0,2,0,766.0,0.0,383.0,383.0,383.0,0.0,...,0.0,0.0,0.0,0.0,0.0,UDP,0,0,1,1
3593346,172.16.0.5-192.168.50.4-33402-14777-17,0,2,0,802.0,0.0,401.0,401.0,401.0,0.0,...,0.0,0.0,0.0,0.0,0.0,UDP,0,0,1,1
3594025,172.16.0.5-192.168.50.4-40518-64952-17,0,2,0,802.0,0.0,401.0,401.0,401.0,0.0,...,0.0,0.0,0.0,0.0,0.0,UDP,0,0,1,1


In [51]:
df[df['Flow_Bytes_Sec'] == float('inf')]['Flow_Bytes_Sec'][1800003]

inf

In [54]:
max(df[df['Flow_Pkts_Sec'] != float('inf')]['Flow_Pkts_Sec'].value_counts())

954322

In [53]:
max(df[df['Flow_Bytes_Sec'] != float('inf')]['Flow_Bytes_Sec'].value_counts())

522546

In [55]:
high_pps = 100000000
high_bps = 1000000000000

In [63]:
1000000000000

1000000000000

In [57]:
df['Flow_Pkts_Sec'].replace(float('inf'), high_pps, inplace=True)
df['Flow_Bytes_Sec'].replace(float('inf'), high_bps, inplace=True)

In [59]:
max(df['Flow_Pkts_Sec']), max(df['Flow_Bytes_Sec'])

(100000000.0, 14396000000.0)

In [52]:
df[df['Flow_Bytes_Sec'] == float('inf')]['Label'].value_counts()

NetBIOS          10184
Syn               5977
Portmap           5369
MSSQL             4774
DrDoS_DNS         4220
DrDoS_NetBIOS     3596
DrDoS_LDAP        2500
DrDoS_MSSQL       1856
DrDoS_SNMP        1535
DrDoS_UDP         1196
DrDoS_SSDP        1109
UDP                999
DrDoS_NTP          816
UDP-lag            247
LDAP                 7
TFTP                 1
Name: Label, dtype: int64

In [30]:
df[df['Malicious'] != 1]

Unnamed: 0,Flow_ID,Flow_Duration,Fwd_Total_Pkts,Bwd_Total_Pkts,Fwd_Total_Bytes,Bwd_Total_Bytes,Fwd_Pkt_Length_Max,Fwd_Pkt_Length_Min,Fwd_Pkt_Length_Mean,Fwd_Pkt_Length_Std,...,Time_Active_Min,Time_Idle_Mean,Time_Idle_Std,Time_Idle_Max,Time_Idle_Min,Label,HOPOPT,TCP,UDP,Malicious
0,172.19.1.46-10.200.7.7-52422-3128-6,45523,22,55,132.0,110414.0,6.0,6.0,6.000000,0.000000,...,0.0,0.0,0.00000,0.0,0.0,Benign_HTTP_PROXY,0,1,0,0
1,172.19.1.46-10.200.7.7-52422-3128-6,1,2,0,12.0,0.0,6.0,6.0,6.000000,0.000000,...,0.0,0.0,0.00000,0.0,0.0,Benign_HTTP_PROXY,0,1,0,0
2,10.200.7.217-50.31.185.39-38848-80-6,1,3,0,674.0,0.0,337.0,0.0,224.666667,194.567041,...,0.0,0.0,0.00000,0.0,0.0,Benign_HTTP,0,1,0,0
3,10.200.7.217-50.31.185.39-38848-80-6,217,1,3,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.00000,0.0,0.0,Benign_HTTP,0,1,0,0
4,192.168.72.43-10.200.7.7-55961-3128-6,78068,5,0,1076.0,0.0,529.0,6.0,215.200000,286.458898,...,0.0,0.0,0.00000,0.0,0.0,Benign_HTTP_PROXY,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1799995,192.168.32.40-10.200.7.7-56499-3128-6,89947650,13,13,8514.0,11494.0,1589.0,6.0,654.923077,750.702389,...,9.0,29894489.0,40991.84281,29941476.0,29866046.0,Benign_HTTP,0,1,0,0
1799996,172.16.141.247-10.200.7.7-2221-59877-6,217,1,2,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.00000,0.0,0.0,Benign_HTTP,0,1,0,0
1799997,10.200.7.7-10.20.2.114-56493-443-6,169,1,2,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.00000,0.0,0.0,Benign_SSL,0,1,0,0
1799998,10.200.7.7-10.20.2.114-37977-443-6,149761,14,12,2664.0,2682.0,652.0,0.0,190.285714,271.877073,...,0.0,0.0,0.00000,0.0,0.0,Benign_SSL,0,1,0,0


In [42]:
for col in df.columns:
    if df[col].isin([float('inf')]).any():
        print(col)

Flow_Bytes_Sec
Flow_Pkts_Sec


In [43]:
type(float('inf'))

float

In [20]:
std_scaler = StandardScaler()

X_train_scl = std_scaler.fit_transform(X_train)
X_test_scl = std_scaler.transform(X_test)

ValueError: Input contains infinity or a value too large for dtype('float64').

# Dummy Classifier

In [19]:
dummy = DummyClassifier()
dummy.fit(X)

In [None]:
dclf.fit(X_train, y_train) 
    score = dclf.score(X_test, y_test) 
    test_scores.append(score) 