This is the final project material for the "Deep Learning" class I took.  

## Download the dataset from https://www.unb.ca/cic/datasets/ids-2017.html
We only need the CSV files that are preprocessed and labeled for ML.

In [None]:
import os
from os.path import join
import glob
import pandas as pd
import numpy as np
import time
import seaborn as sns
import matplotlib.pyplot as plt
%load_ext autoreload

In [None]:
# Global settings for the notebook.
# SEED is the random seed reused by the later cells.
SEED = 2
# dataroot is the directory handed to the data loader and used as the base
# path for the per-run output directories.
dataroot = '/home/jovyan/ikt590'

In [None]:
from preprocessing import load_data
# Load the dataset stored under `dataroot`.
# NOTE(review): presumably X is the feature matrix and y the label vector —
# confirm against preprocessing.load_data.
X,y = load_data(dataroot)

In [None]:
from preprocessing import balance_data, normalize
# Replace X with its normalized version; the scaling scheme itself is defined
# in preprocessing.normalize.
X = normalize(X)

In [None]:
# Notebook output: show the dimensions of X as a quick sanity check.
X.shape

In [None]:
from models import Classifier

def ensure_dir(dir_path):
    """Create ``dir_path`` (including missing parents) if it does not exist.

    Calling this on a directory that already exists is a no-op.

    Args:
        dir_path: Path of the directory to create.

    Raises:
        FileExistsError: If ``dir_path`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first: another process may create the directory
    # between the check and the makedirs call.
    os.makedirs(dir_path, exist_ok=True)

def getClassifier(args, runs_dir=None):
    """Build a ``Classifier`` from a packed tuple of settings.

    Args:
        args: 8-tuple ``(method, optim, lr, reg, batch_size, input_dim,
            num_class, num_epochs)``.
        runs_dir: Optional output directory for the run. When given, it is
            created if missing and forwarded to the classifier.

    Returns:
        The newly constructed ``Classifier`` instance.
    """
    # NOTE(review): the optimizer name is part of the tuple but is not
    # forwarded to Classifier — confirm whether that is intentional.
    method, _optim, lr, reg, batch_size, input_dim, num_class, num_epochs = args

    if runs_dir is not None:
        ensure_dir(runs_dir)

    return Classifier(
        method,
        input_dim,
        num_class,
        lr=lr,
        reg=reg,
        num_epochs=num_epochs,
        batch_size=batch_size,
        runs_dir=runs_dir,
    )

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import metrics

# --- Experiment configuration -------------------------------------------

# Model / run identification (architecture and run_number go into the
# output directory name; method is handed to the classifier factory).
method = 'cnn2'
architecture = '1DCNNconv_2_fc_1'
run_number = 4

# Optimisation settings and the hyperparameter grid to search.
optim = 'Adam'
num_epochs = 100
batch_size = 5 * 4096  # increasing batch size with more gpu added
learning_rates = [1e-3]
regularizations = [1e-3]

# Bookkeeping filled in while the grid is evaluated.
best_acc = -1
best_model = None
accuracies = {}

# Stratified K-fold cross-validation over the (lr, reg) hyperparameter grid.
# For every fold each grid point is trained on a class-balanced copy of the
# training split and scored with balanced accuracy on the held-out split.
K = 5
skf = StratifiedKFold(n_splits=K, random_state=SEED, shuffle=True)

# Class names for the report and the heat-map axes (loop-invariant, so built
# once up front).
# NOTE(review): assumes y is integer-encoded 0..14 in exactly this order —
# confirm against preprocessing.load_data.
target_names = ['Benign', 'Botnet', 'DDoS', 'DoS GoldenEye', 'DoS Hulk',
                'DoS Slowhttptest', 'DoS slowloris', 'FTP-Patator', 'Heartbleed',
                'Infiltration', 'PortScan', 'SSH-Patator', 'Web Attack - Brute Force',
                'Web Attack - Sql Injection', 'Web Attack - XSS']
# Pinning the label set keeps the report rows and the confusion matrix
# aligned with target_names even if a class is absent from a fold.
class_labels = np.arange(len(target_names))

# Dedicated seeded generator so the pre-balancing shuffle is reproducible.
shuffle_rng = np.random.RandomState(SEED)

for fold_index, (train_index, test_index) in enumerate(skf.split(X, y)):
    print('---------------------------------------------')
    print('Fold #{}'.format(fold_index))

    # Raw (unbalanced) split of this fold. It is kept separate from
    # X_train / y_train so that every grid point balances the original
    # split instead of the already-balanced output of the previous one.
    X_fold_train = X[train_index].astype(float)
    y_fold_train = y[train_index].astype(int)
    X_test = X[test_index]
    y_test = y[test_index]
    input_dim = X_fold_train.shape[1]
    num_class = len(np.unique(y_fold_train))

    for lr in learning_rates:
        for reg in regularizations:

            classifier_args = (method, optim, lr, reg, batch_size, input_dim, num_class, num_epochs)
            config = '{}/{}th_run/optim_{}_lr_{}_reg_{}_bs_{}'.format(architecture, run_number, optim, lr, reg, batch_size)
            runs_dir = join(dataroot, 'runs', config)

            # Shuffle, then balance the class distribution of the training split.
            p = shuffle_rng.permutation(len(y_fold_train))
            X_train, y_train = balance_data(X_fold_train[p], y_fold_train[p], seed=SEED)

            tick = time.time()
            clf = getClassifier(classifier_args, runs_dir)

            clf.fit(X_train, y_train)
            pred = clf.predict(X_test, eval_mode=True)

            acc = metrics.balanced_accuracy_score(y_test, pred) * 100
            if acc > best_acc:
                best_model = clf
                best_acc = acc
            # NOTE(review): keyed by (lr, reg) only, so each fold overwrites
            # the previous fold's value; after the loop this holds the last
            # fold's accuracy per grid point.
            accuracies[(lr, reg)] = acc
            # The measured interval covers fit, predict and scoring.
            tock = time.time()

            print("Model is trained in {} sec".format(tock-tick))

            # Cross validation results
            print("Balanced test accuracy: ", acc)

        # Everything below runs once per learning rate and reports on the
        # predictions of the last regularization value tried.

        # Classification report (per-class precision / recall / F1)
        print(classification_report(y_test, pred, labels=class_labels,
                                    target_names=target_names))

        # Confusion matrix heat map
        conf_matrix = confusion_matrix(y_test, pred, labels=class_labels)
        cm_df = pd.DataFrame(conf_matrix)
        plt.figure(figsize=(20, 15))
        sns.set(font_scale=1.4)
        sns.heatmap(cm_df, annot=True, annot_kws={"size": 12}, fmt='g', xticklabels=target_names,
                    yticklabels=target_names)
        plt.ylabel('Actual Class')
        plt.xlabel('Predicted Class')

        plt.show()

        # Per-class one-vs-rest counts derived from the confusion matrix
        # (rows = actual class, columns = predicted class).
        # FP: False Positive
        # FN: False Negative
        # TP: True Positive
        # TN: True Negative
        TP = np.diag(conf_matrix).astype(float)
        FP = conf_matrix.sum(axis=0).astype(float) - TP
        FN = conf_matrix.sum(axis=1).astype(float) - TP
        TN = float(conf_matrix.sum()) - (FP + FN + TP)

        # A class with no samples or no predictions gives a 0/0 ratio; the
        # result stays NaN as before, only the RuntimeWarning is silenced.
        with np.errstate(divide='ignore', invalid='ignore'):
            # True Positive Rate (TPR) / Recall / Detection Rate (DR)
            TPR = TP/(TP+FN)
            # True Negative Rate (TNR) / Specificity
            TNR = TN/(TN+FP)
            # Positive Predictive Value (PPV) / Precision
            PPV = TP/(TP+FP)
            # Negative Predictive Value (NPV)
            NPV = TN/(TN+FN)
            # False Positive Rate (FPR)
            FPR = FP/(FP+TN)
            # False Negative Rate (FNR)
            FNR = FN/(TP+FN)
            # False Discovery Rate (FDR)
            FDR = FP/(TP+FP)
            # Overall accuracy. Not needed due to the imbalance of the dataset
            ACC = (TP+TN)/(TP+FP+FN+TN)