In [1]:
import os
from os.path import join
import glob
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt
%load_ext autoreload

In [2]:
# Notebook configuration.
# dataroot: directory that holds the dataset CSV files. It defaults to the
#           original location but can be overridden with the IKT590_DATAROOT
#           environment variable so the notebook runs on other machines
#           without editing this cell.
dataroot = os.environ.get('IKT590_DATAROOT', '/home/jovyan/ikt590')
# SEED: passed as random_state / seed to the CV splitters, the class balancer
#       and the classifier so that every run produces the same folds and model.
SEED=20

In [3]:
# Load the dataset located under `dataroot` into the feature matrix X and the labels y.
# The recorded output of this cell shows load_data globbing `*.pcap_ISCX.csv`
# under dataroot and reporting 2830743 flow records with 79 feature dimensions.
# NOTE(review): the original comment states y has shape (N,1), but the array
# printed in this cell's output looks 1-D; if that printed array is y, the
# (N,1) claim is stale -- confirm against preprocessing.load_data.
from preprocessing import load_data
X,y = load_data(dataroot) # per the original note: reads the csv files and returns numpy arrays X of shape (N,D) and y of shape (N,1)

/home/jovyan/ikt590/*.pcap_ISCX.csv
there are 2830743 flow records with 79 feature dimension
stripped column names
dropped bad columns
There are 0 nan entries
converted to numeric
[0 0 0 ... 0 0 0]


In [4]:
# Normalize the feature matrix (presumably feature scaling -- see
# preprocessing.normalize). balance_data is imported here and used inside the
# cross-validation loop further down.
# NOTE: X is rebound to the normalized result, so re-running this cell on its
# own applies normalize to already-normalized data; re-run the load cell first.
# NOTE(review): normalization is applied to the full dataset before any
# train/test split. If normalize derives statistics from X, those statistics
# include the rows later used as test folds -- confirm this is intended.
from preprocessing import balance_data, normalize
X = normalize(X)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import metrics

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Cross Validation
# Stratified K-fold evaluation of a random forest classifier. For every fold:
#   1. split the data into a development part and a held-out test part,
#   2. carve a stratified 20% validation split out of the development part,
#   3. balance the remaining training data and fit the classifier,
#   4. report balanced accuracy, the per-class classification report and a
#      confusion-matrix heat map for the test part,
#   5. derive per-class rates (TPR, TNR, PPV, ...) from the confusion matrix.
K=5

# Labels: display names for the classes, matched positionally to the sorted
# distinct label values found in y.
target_names = ['Benign', 'Botnet', 'DDoS', 'DoS GoldenEye', 'DoS Hulk',
                'DoS Slowhttptest', 'DoS slowloris', 'FTP-Patator', 'Heartbleed',
                'Infiltration', 'PortScan', 'SSH-Patator', 'Web Attack - Brute Force',
                'Web Attack - Sql Injection', 'Web Attack - XSS']

# Fix the label set once from the full label vector. Passing it explicitly to
# classification_report / confusion_matrix keeps every fold's report and
# matrix at the same size, even if a rare class is missing from a fold's
# y_test and predictions (without it classification_report raises on the
# target_names length mismatch and the heat-map tick labels become misaligned).
class_labels = np.unique(y)
if len(class_labels) != len(target_names):
    raise ValueError('expected {} classes but found {} distinct labels in y'.format(
        len(target_names), len(class_labels)))

# Plot styling does not change between folds, so set it once up front.
sns.set(font_scale=1.4)

# One record per fold, summarised after the loop.
fold_scores = []

skf = StratifiedKFold(n_splits=K, random_state=SEED, shuffle=True)
for fold_index, (dev_index, test_index) in enumerate(skf.split(X, y)):
    print('---------------------------------------------')
    print('Fold #{}'.format(fold_index))
    X_dev, y_dev = X[dev_index], y[dev_index]
    X_test, y_test = X[test_index], y[test_index]

    # The splitter is configured with n_splits=1, so take its only split.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)
    train_index, val_index = next(sss.split(X_dev, y_dev))
    X_train, y_train = X_dev[train_index], y_dev[train_index]
    X_val, y_val = X_dev[val_index], y_dev[val_index]

    # Only the training portion is balanced; validation and test data keep
    # their original class distribution.
    X_train, y_train = balance_data(X_train, y_train, seed=SEED)
    clf = RandomForestClassifier(random_state=SEED, n_estimators=20,
                                 min_samples_split=2, min_samples_leaf=1)
    clf.fit(X_train, y_train)

    val_pred = clf.predict(X_val)
    val_acc = metrics.balanced_accuracy_score(y_val, val_pred)*100

    pred = clf.predict(X_test)
    test_acc = metrics.balanced_accuracy_score(y_test, pred)*100

    # Cross validation results
    print('balanced validation set accuracy: ', val_acc)
    print('balanced test set accuracy: ',test_acc)
    fold_scores.append({'fold': fold_index, 'val_acc': val_acc, 'test_acc': test_acc})

    # Per-class precision / recall / F1 report
    print(classification_report(y_test, pred, labels=class_labels, target_names=target_names))

    # Confusion matrix heat map (rows: actual class, columns: predicted class)
    conf_matrix = confusion_matrix(y_test, pred, labels=class_labels)
    cm_df = pd.DataFrame(conf_matrix, index=target_names, columns=target_names)
    fig, ax = plt.subplots(figsize=(20,15))
    sns.heatmap(cm_df, annot=True, annot_kws={"size":12}, fmt='g', xticklabels=target_names,
                yticklabels=target_names, ax=ax)
    ax.set_ylabel('Actual Class')
    ax.set_xlabel('Predicted Class')

    plt.show()
    # Release the figure so that K large figures do not accumulate in memory.
    plt.close(fig)

    # Metrics that might be needed (one value per class, one-vs-rest)
    # FP: False Positive
    # FN: False Negative
    # TP: True Positive
    # TN: True Negative
    TP = np.diag(conf_matrix)
    FP = conf_matrix.sum(axis=0) - TP
    FN = conf_matrix.sum(axis=1) - TP
    TN = conf_matrix.sum() - (FP + FN + TP)

    FP = FP.astype(float)
    FN = FN.astype(float)
    TP = TP.astype(float)
    TN = TN.astype(float)

    # A class with no actual or no predicted samples in this fold gives a 0/0
    # division. The resulting NaN is kept as the "undefined" marker; only the
    # RuntimeWarning noise is suppressed.
    with np.errstate(divide='ignore', invalid='ignore'):
        # True Positive Rate (TPR) / Recall / Detection Rate (DR)
        TPR = TP/(TP+FN)
        # True Negative Rate (TNR) / Specificity
        TNR = TN/(TN+FP)
        # Positive Predictive Value (PPV) / Precision
        PPV = TP/(TP+FP)
        # Negative Predictive Value (NPV)
        NPV = TN/(TN+FN)
        # False Positive Rate (FPR)
        FPR = FP/(FP+TN)
        # False Negative Rate (FNR)
        FNR = FN/(TP+FN)
        # False Discovery Rate (FDR)
        FDR = FP/(TP+FP)
        # Overall accuracy. Not needed due to the imbalance of the dataset
        ACC = (TP+TN)/(TP+FP+FN+TN)

# Summary over all folds
cv_summary = pd.DataFrame(fold_scores)
print('---------------------------------------------')
print('mean balanced test set accuracy over {} folds: {:.4f} (std {:.4f})'.format(
    K, cv_summary['test_acc'].mean(), cv_summary['test_acc'].std()))