## Read in the Data and initialize the Dask Client 

In [3]:
# Intro data set and problem 
# Each go through model 
# Do diagram 
# Do 4 Vs, 1 V each 


#imports
import dask, joblib
import dask.dataframe as dd
from dask.distributed import Client, progress
import dask.array as da
from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler
from dask_ml import decomposition, linear_model, metrics
from dask_ml.impute import SimpleImputer
from dask_ml.model_selection import train_test_split, KFold
from dask_ml.preprocessing import OneHotEncoder, StandardScaler
from sklearn import tree, ensemble, svm, naive_bayes, neighbors, cluster
from sklearn.metrics import (
    accuracy_score, precision_score, 
    confusion_matrix, recall_score, 
    plot_confusion_matrix
)
import matplotlib.pyplot as plt 
import xgboost as xgb
import pandas as pd
import numpy as np

def cat_features(dataframe):
    """Lazily select the names of the non-numeric columns of ``dataframe``.

    A column is treated as numeric when its dtype equals the dtype pandas
    infers for a column of plain Python ints or plain Python floats; the name
    of every other column is kept.  The result is a ``filter`` iterator, so
    wrap it in ``list(...)`` to materialise the names.
    """
    # Probe frame whose two columns carry pandas' default int and float dtypes.
    probe = pd.DataFrame({'a': [1, 2, 3], 'b': [1.0, 2.0, 3.0]})
    numeric_dtypes = [probe['a'].dtype, probe['b'].dtype]

    def is_categorical(column):
        return dataframe[column].dtype not in numeric_dtypes

    return filter(is_categorical, list(dataframe))

# Close any Dask client left over from an earlier run of this cell so that a
# fresh connection can be opened afterwards.  On a fresh kernel `client` does
# not exist yet; the resulting NameError is caught below and only printed, as
# is any error raised while closing.
try:
    if client is not None:
        client.close()
        print("closed existing connection, ", client)
except Exception as err:
    print(err)

name 'client' is not defined


In [4]:
# Start a fresh local Dask cluster and connect a client to it, then show the
# client summary (scheduler address, dashboard link, workers, memory).
client = Client(
    n_workers=2,
    threads_per_worker=2,
    memory_limit='8GB',
)
display(client)

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


0,1
Client  Scheduler: tcp://127.0.0.1:36091  Dashboard: http://127.0.0.1:35657/status,Cluster  Workers: 2  Cores: 4  Memory: 16.00 GB


## Read in the data and concatenate the dask dataframes of multiple data files 

In [5]:
# Expected Python type of each column, keyed by column name.
# NOTE(review): this mapping is not referenced anywhere else in the visible
# notebook (the CSVs are read without `dtype=`), and its keys do not match
# `feature_cols` exactly: it lists 'Flow Pkts/s', 'Fwd Pkts/s' and
# 'Bwd Pkts/s', and has no entry for 'Flow IAT Mean'.
feat_types_check = {
    'Dst Port': float,
    'Protocol': int,
    'Timestamp': str,
    'Flow Duration': int,
    'Tot Fwd Pkts': int,
    'Tot Bwd Pkts': int,
    'TotLen Fwd Pkts': int,
    'TotLen Bwd Pkts': int,
    'Flow Pkts/s': float,
    'Fwd PSH Flags': int,
    'Bwd PSH Flags': int,
    'Fwd URG Flags': int,
    'Bwd URG Flags': int,
    'Fwd Pkts/s': float,
    'Bwd Pkts/s': float,
    'FIN Flag Cnt': int,
    'SYN Flag Cnt': int,
    'RST Flag Cnt': int,
    'PSH Flag Cnt': int,
    'ACK Flag Cnt': int,
    'URG Flag Cnt': int,
    'CWE Flag Count': int,
    'ECE Flag Cnt': int,
    'Subflow Fwd Pkts': int,
    'Subflow Fwd Byts': int,
    'Subflow Bwd Pkts': int,
    'Subflow Bwd Byts': int,
    'Label': str,
}

# Columns loaded from every CSV file (passed to read_csv as `usecols`).
# 'Label' is the raw class label; 'Timestamp' is loaded but dropped again
# before modelling.
feature_cols = [
    'Dst Port',
    'Protocol',
    'Timestamp',
    'Flow Duration',
    'Tot Fwd Pkts',
    'Tot Bwd Pkts',
    'TotLen Fwd Pkts',
    'TotLen Bwd Pkts',
    'Flow IAT Mean',
    'Fwd PSH Flags',
    'Bwd PSH Flags',
    'Fwd URG Flags',
    'Bwd URG Flags',
    'FIN Flag Cnt',
    'SYN Flag Cnt',
    'RST Flag Cnt',
    'PSH Flag Cnt',
    'ACK Flag Cnt',
    'URG Flag Cnt',
    'CWE Flag Count',
    'ECE Flag Cnt',
    'Subflow Fwd Pkts',
    'Subflow Fwd Byts',
    'Subflow Bwd Pkts',
    'Subflow Bwd Byts',
    'Label',
]

# Multiple tests were run on the different capture files available to us.
# Two days of traffic (14 and 15 Feb 2018) form the training data; a third
# day (16 Feb 2018) is loaded separately as the test set.
df_1 = dd.read_csv('cic_data/14_02_2018.csv', usecols=feature_cols, low_memory=False)
df_2 = dd.read_csv('cic_data/15_02_2018.csv', usecols=feature_cols, low_memory=False)
test_df = dd.read_csv('cic_data/16_02_2018_fixeddata.csv', usecols=feature_cols, low_memory=False)

# Alternative test days that were also tried:
# test_df = dd.read_csv('cic_data/23_02_2018.csv', usecols=feature_cols, low_memory=False)
# test_df = dd.read_csv('cic_data/22_02_2018.csv', usecols=feature_cols, low_memory=False)

# Stack the two training days row-wise.  dd.concat is used instead of
# DataFrame.append, which is deprecated and no longer available in recent
# Dask releases.
df = dd.concat([df_1, df_2])
# df = df[df['Label'].isin(['Brute Force -Web', 'Brute Force -XSS', 'Benign', 'FTP-BruteForce', 'SSH-BruteForce'])]

# test_df = dd.concat([test_df, test_df_2])
# test_df = test_df[test_df['Label'].isin(['Brute Force -Web', 'Brute Force -XSS', 'Benign', 'FTP-BruteForce', 'SSH-BruteForce'])]

# Report the number of rows and columns of the training and test datasets.
# NOTE: len() on a Dask dataframe has to count rows, so each len(<frame>)
# call below triggers a computation over the underlying CSV data.
print("Length of Dataset: ", len(df))
print("Number of Columns: ", len(df.columns))
print('\n')
print("Length of Test Dataset:", len(test_df))
print("Number of Columns: ", len(test_df.columns))
# print("Categorical Features: ", list(cat_features(df)))
# Files in the zipped folder:
# 23_02_2018.csv, 20_02_2018.csv, 16_02_2018.csv, 02_03_2018.csv
# with ZipFile('cic_data.zip') as zipped:
#     df = dd.from_pandas(pd.read_csv(zipped.open('23_02_2018.csv'), usecols=feature_cols), npartitions=2)
#     df = df.append(pd.read_csv(zipped.open('16_02_2018.csv'), usecols=feature_cols))

Length of Dataset:  2097150
Number of Columns:  26


Length of Test Dataset: 1048574
Number of Columns:  26


## Create the target variable: a new column set to 1 when the Label column is not 'Benign', and 0 otherwise

In [6]:
def _flag_attacks(frame):
    """Add a binary 'Target' column to ``frame`` in place.

    The column starts at 0 everywhere and is set to 1 on every row whose
    'Label' value differs from 'Benign'.
    """
    frame['Target'] = 0
    frame['Target'] = frame['Target'].mask(frame['Label'] != 'Benign', 1)


# Same labelling rule for the training data and the separate test day.
_flag_attacks(df)
_flag_attacks(test_df)

In [7]:
# Columns that are not model inputs: the raw label, the target derived from
# it, and the timestamp.
NON_FEATURE_COLUMNS = ['Label', 'Target', 'Timestamp']

# Training data (the two concatenated days).
x = df.drop(NON_FEATURE_COLUMNS, axis=1)
y = df['Target']

# Separately loaded test day.
test_x = test_df.drop(NON_FEATURE_COLUMNS, axis=1)
test_y = test_df['Target']

## Functions for the train/test split and cross-validation

In [8]:
def doTrainTestSplit(X, Y, test_size=.3, random_state=None):
    """Split features and target into train and test subsets.

    Thin wrapper around ``train_test_split`` (imported from
    ``dask_ml.model_selection`` at the top of the notebook).

    Parameters
    ----------
    X
        Feature collection to split.
    Y
        Target collection aligned with ``X``.
    test_size : float, default 0.3
        Fraction of the rows placed in the test subset.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``.  Pass an integer to make the
        split reproducible across runs; ``None`` keeps the split unseeded.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [9]:
def doKFolds(X, Y, clf=None):
    """Walk through a K-fold split of ``X`` / ``Y``, optionally fitting a model per fold.

    Parameters
    ----------
    X
        Dask dataframe of features.
    Y
        Dask series holding the target, aligned with ``X``.
    clf : estimator with a ``fit(features, target)`` method, optional
        When given, it is fitted on the training rows of every fold.  When
        omitted, the folds are only printed.

    Returns
    -------
    None
        The fold indices are printed; nothing is returned.
    """
    # Preview the inputs.
    display(X.head())
    display(Y.head())

    # KFold.split works on Dask arrays, not on dataframes, so convert first.
    # lengths=True computes the chunk sizes the splitter needs.
    X_arr = X.to_dask_array(lengths=True)
    Y_arr = Y.to_dask_array(lengths=True)

    for train, test in KFold().split(X_arr, Y_arr):
        train_idx = train.compute()
        test_idx = test.compute()
        print("train = ", train_idx)
        print("test = ", test_idx)

        if clf is not None:
            # The fold indices are row positions, so select rows by position
            # on the arrays (``.loc`` on the dataframe would select by index
            # label instead), and fit features against the target.
            clf.fit(X_arr[train_idx].compute(), Y_arr[train_idx].compute())


In [10]:
# Split the training days into a fitting subset (x_train / y_train) and a
# validation subset (x_test / y_test) drawn from the same data.
x_train, x_test, y_train, y_test = doTrainTestSplit(x,y)

In [11]:
# Preprocessing helpers: standard scaling, and mean imputation of NaN values.
# NOTE(review): neither object is applied to the data anywhere in the visible
# part of this notebook.
scaler = StandardScaler()
imp = SimpleImputer(strategy='mean', missing_values=np.nan)

In [12]:
def error_metrics(mod, feats, labels):
    """Print precision, accuracy, recall and the confusion matrix for ``mod``.

    Parameters
    ----------
    mod
        Fitted estimator exposing ``predict``.
    feats
        Features to predict on.  A Dask collection is materialised with
        ``.compute()``; in-memory data (no ``compute`` attribute) is used as is.
    labels
        True target values, handled the same way as ``feats``.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    # Materialise Dask inputs; leave already in-memory inputs untouched.
    if hasattr(feats, "compute"):
        feats = feats.compute()
    if hasattr(labels, "compute"):
        labels = labels.compute()

    preds = mod.predict(feats)
    print("Precision: ", precision_score(labels, preds))
    print("Accuracy: ", accuracy_score(labels, preds))
    print("Recall: ", recall_score(labels, preds))
    print("Confusion Matrix: ", confusion_matrix(labels, preds))

# def cluster_eval(mod, feats, labels):
#     preds = mod.predict(feats.compute())
    

## Declare the classifiers (several were tested first, but in my code I settled on XGBoost)

In [22]:
# Main XGBoost model used in my code
xgb_clf = xgb.XGBClassifier(
    booster='dart',
    tree_method='hist',
    sampling_method='uniform',
    objective='binary:logistic',
    n_estimators=5,
    max_depth=6,
)
# print(np.sum(x_train.compute().memory_usage())/1e6)

# Alternate models originally tested
bayes_clf = naive_bayes.GaussianNB()
multibayes_clf = naive_bayes.MultinomialNB()
# NOTE: despite the `rf_` prefix this is a single decision tree, not a random forest.
rf_clf = tree.DecisionTreeClassifier()
knn_clf = neighbors.KNeighborsClassifier(weights='distance', n_neighbors=5)
cluster_clf = cluster.KMeans(n_clusters=2)

In [23]:
# Train on the in-memory version of the training split.
# The evaluation metric is configured on the estimator instead of being
# passed to fit(): the `eval_metric` keyword of fit() is deprecated since
# XGBoost 1.6 and rejected by newer releases, whereas set_params() is
# accepted by old and new versions alike.
xgb_clf.set_params(eval_metric="auc")
xgb_clf.fit(x_train.compute(), y_train.compute())
# bayes_clf.fit(x_train.compute(), y_train.compute())
# rf_clf.fit(x_train.compute(), y_train.compute())
# cluster_clf.fit(x_train.compute())
# knn_clf.fit(x_train.compute(), y_train.compute())

XGBClassifier(booster='dart', max_depth=6, n_estimators=5,
              sampling_method='uniform', tree_method='hist')

In [24]:
# Score the model on the validation rows that were split off from the
# training days (x_test / y_test).
error_metrics(xgb_clf, x_test, y_test)
# error_metrics(bayes_clf, x_test, y_test)
# error_metrics(rf_clf, x_test, y_test)
# error_metrics(knn_clf, x_test, y_test)

Precision:  0.994872314394146
Accuracy:  0.9956057502715547
Recall:  0.983809318311422
Confusion Matrix:  [[497442    658]
 [  2101 127665]]


## Display relevant error metrics

In [25]:
# Score the model on the separately loaded test day (test_x / test_y).
# NOTE(review): the recorded output for this cell shows accuracy of about 0.55
# and recall of about 0.23, against about 0.996 and 0.98 on the validation
# split, so the model appears to generalise poorly to this day. Worth
# confirming whether the test day contains attack labels absent from training.
error_metrics(xgb_clf, test_x, test_y)
# error_metrics(bayes_clf, test_x, test_y)
# error_metrics(rf_clf, test_x, test_y)
# error_metrics(knn_clf, test_x, test_y)

Precision:  0.9472793130661774
Accuracy:  0.5525532771173041
Recall:  0.23336080637817755
Confusion Matrix:  [[438956   7816]
 [461365 140437]]


In [None]:
import matplotlib.pyplot as plt
# Imported here because the notebook's import cell does not provide it.
from sklearn.metrics import ConfusionMatrixDisplay

# Plot the confusion matrix of the XGBoost model on the separate test day.
# The matrix is built from explicit predictions and drawn with
# ConfusionMatrixDisplay, because the plot_confusion_matrix helper has been
# removed from recent scikit-learn releases.  plt.show() is called without
# arguments: its only parameter is `block`, not a figure or display object.
test_features = test_x.compute()
test_labels = test_y.compute()
test_cm = confusion_matrix(test_labels, xgb_clf.predict(test_features))
ConfusionMatrixDisplay(test_cm, display_labels=xgb_clf.classes_).plot()
plt.show()

In [None]:
# plt.show(plot_confusion_matrix(bayes_clf, test_x.compute(), test_y.compute()))

In [None]:
# accuracy_score(test_y.compute(), bayes_clf.predict(test_x.compute()))

## Timing comparison between Pandas (joblib threading backend) and Dask (joblib dask backend)

In [None]:
# %%timeit -n 10
# #time with just pandas
# with joblib.parallel_backend('threading'):
#      xgb_clf.fit(x_train.compute(), y_train.compute(), eval_metric=precision_score)

In [None]:
# %%timeit -n 10
# #time with the dask backend
# with joblib.parallel_backend('dask'):
#      xgb_clf.fit(x_train.compute(), y_train.compute(), eval_metric=precision_score)