# Imports

In [119]:
import numpy as np
import pandas as pd
import sqlalchemy

import os
from datetime import datetime


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix


import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns 

In [71]:
from functions import load_balanced_df

Load and combine files for balanced database.

In [74]:
balanced_df = load_balanced_df(sample_size=1000)
balanced_df.shape

(36000, 83)

In [78]:
##########
### DROP NA (Temporary)
##########
balanced_df.dropna(how='any', axis=0, inplace=True)
balanced_df.shape

(35799, 83)

In [79]:
balanced_df.columns

Index(['Flow_ID', 'Flow_Duration', 'Fwd_Total_Pkts', 'Bwd_Total_Pkts',
       'Fwd_Total_Bytes', 'Bwd_Total_Bytes', 'Fwd_Pkt_Length_Max',
       'Fwd_Pkt_Length_Min', 'Fwd_Pkt_Length_Mean', 'Fwd_Pkt_Length_Std',
       'Bwd_Pkt_Length_Max', 'Bwd_Pkt_Length_Min', 'Bwd_Pkt_Length_Mean',
       'Bwd_Pkt_Length_Std', 'Flow_Bytes_Sec', 'Flow_Pkts_Sec',
       'Flow_IAT_Mean', 'Flow_IAT_Std', 'Flow_IAT_Max', 'Flow_IAT_Min',
       'Fwd_IAT_Total', 'Fwd_IAT_Mean', 'Fwd_IAT_Std', 'Fwd_IAT_Max',
       'Fwd_IAT_Min', 'Bwd_IAT_Total', 'Bwd_IAT_Mean', 'Bwd_IAT_Std',
       'Bwd_IAT_Max', 'Bwd_IAT_Min', 'Fwd_PSH_Flags', 'Bwd_PSH_Flags',
       'Fwd_URG_Flags', 'Bwd_URG_Flags', 'Fwd_Header_Length',
       'Bwd_Header_Length', 'Fwd_Pkts_Sec', 'Bwd_Pkts_Sec', 'Pkt_Length_Min',
       'Pkt_Length_Max', 'Pkt_Length_Mean', 'Pkt_Length_Std', 'Pkt_Length_Var',
       'FIN_Flag_Count', 'SYN_Flag_Count', 'RST_Flag_Count', 'PSH_Flag_Count',
       'ACK_Flag_Count', 'URG_Flag_Count', 'CWE_Flag_Count', 'ECE_Fl

Check to make sure dataframe is balanced

In [80]:
balanced_df['Malicious'].value_counts()

0    18000
1    17799
Name: Malicious, dtype: int64

# Balanced, Binary Classification

## Train Test Split

In [120]:
X = balanced_df.drop(['Flow_ID', 'Label', 'HOPOPT', 'Malicious'], axis=1)
y = balanced_df['Malicious']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

## Standard Scalar

In [121]:
std_scaler = StandardScaler()

X_train_scl = std_scaler.fit_transform(X_train)
X_test_scl = std_scaler.transform(X_test)

## Dummy Classifier

In [84]:
dummy = DummyClassifier()
dummy.fit(X_train_scl, y_train)
dummy_preds = dummy.predict(X_test_scl)



In [85]:
print('Accuracy:', accuracy_score(dummy_preds, y_test), '   F1:', f1_score(dummy_preds, y_test))

Accuracy: 0.4912849162011173    F1: 0.48905846706318035


## Gaussian Naive Bayes 

In [86]:
gnb = GaussianNB()
gnb.fit(X_train_scl, y_train)
gnb_preds = gnb.predict(X_test_scl)

In [87]:
print('Accuracy:', accuracy_score(gnb_preds, y_test), '   F1:', f1_score(gnb_preds, y_test))

Accuracy: 0.8220111731843576    F1: 0.8442053789731052


## Decision Tree

In [88]:
dt = DecisionTreeClassifier()
dt.fit(X_train_scl, y_train)
dt_preds = dt.predict(X_test_scl)

In [89]:
print('Accuracy:', accuracy_score(dt_preds, y_test), '   F1:', f1_score(dt_preds, y_test))

Accuracy: 0.998659217877095    F1: 0.9986568166554735


## K Nearest Neighbors

In [102]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scl, y_train)
knn_preds = knn.predict(X_test_scl)

In [103]:
print('Accuracy:', accuracy_score(knn_preds, y_test), '   F1:', f1_score(knn_preds, y_test))

Accuracy: 0.9969832402234637    F1: 0.9969829031176667


## Random Forest

In [112]:
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(X_train_scl, y_train)
rf_preds = rf.predict(X_test_scl)

In [113]:
print('Accuracy:', accuracy_score(rf_preds, y_test), '   F1:', f1_score(rf_preds, y_test))

Accuracy: 0.9992178770949721    F1: 0.9992156862745099


## XGBoost

In [112]:
xgb = XGBClassifier()
rf.fit(X_train_scl, y_train)
rf_preds = rf.predict(X_test_scl)

In [113]:
print('Accuracy:', accuracy_score(rf_preds, y_test), '   F1:', f1_score(rf_preds, y_test))

Accuracy: 0.9992178770949721    F1: 0.9992156862745099


# Balanced, Multiclass Classification

## Train Test Split

In [114]:
X = balanced_df.drop(['Flow_ID', 'Label', 'HOPOPT', 'Malicious'], axis=1)
y = balanced_df['Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

## Standard Scalar

In [115]:
std_scaler = StandardScaler()

X_train_scl = std_scaler.fit_transform(X_train)
X_test_scl = std_scaler.transform(X_test)

## Dummy Classifier

In [92]:
dummy = DummyClassifier()
dummy.fit(X_train_scl, y_train)
dummy_preds = dummy.predict(X_test_scl)



In [93]:
print('Accuracy:', accuracy_score(dummy_preds, y_test), '   F1:', f1_score(dummy_preds, y_test, average='weighted'))

Accuracy: 0.06603351955307263    F1: 0.06675845541559995


## Gaussian Naive Bayes

In [94]:
gnb = GaussianNB()
gnb.fit(X_train_scl, y_train)
gnb_preds = gnb.predict(X_test_scl)

In [95]:
print('Accuracy:', accuracy_score(gnb_preds, y_test), '   F1:', f1_score(gnb_preds, y_test, average='weighted'))

Accuracy: 0.2417877094972067    F1: 0.2698570761677164


## Decision Tree

In [96]:
dt = DecisionTreeClassifier()
dt.fit(X_train_scl, y_train)
dt_preds = dt.predict(X_test_scl)

In [97]:
print('Accuracy:', accuracy_score(dt_preds, y_test), '   F1:', f1_score(dt_preds, y_test, average='weighted'))

Accuracy: 0.6963128491620112    F1: 0.7109997383557819


## K Nearest Neighbors

In [106]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scl, y_train)
knn_preds = knn.predict(X_test_scl)

In [108]:
print('Accuracy:', accuracy_score(knn_preds, y_test), '   F1:', f1_score(knn_preds, y_test, average='weighted'))

Accuracy: 0.6735195530726257    F1: 0.6865562266340534


## Random Forest

In [116]:
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(X_train_scl, y_train)
rf_preds = rf.predict(X_test_scl)

In [118]:
print('Accuracy:', accuracy_score(rf_preds, y_test), '   F1:', f1_score(rf_preds, y_test, average='weighted'))

Accuracy: 0.720782122905028    F1: 0.7399412179951796
