# FastAI Experiments

In [5]:
from fastai.tabular import *
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
import os
import sys
import glob
from sklearn.utils import shuffle

## Download dataset if not done already

In [6]:
! curl -O https://iscxdownloads.cs.unb.ca/iscxdownloads/ISCX-Tor-NonTor-2017/TorCSV.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 17.5M  100 17.5M    0     0  1770k      0  0:00:10  0:00:10 --:--:-- 2209k


## check for TorCSV.zip file

In [7]:
! ls

CICDataSet-TOR	environment.yml		  LICENSE  notebooks  scripts
CSV		FastAI-Experiments.ipynb  media    README.md  TorCSV.zip


## unzip TorCSV.zip file

In [8]:
! unzip TorCSV.zip

Archive:  TorCSV.zip
replace CSV/Scenario-A/SelectedFeatures-10s-TOR-NonTOR.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [20]:
! ls CSV
# you'll see CSV root folder two subfolders Scenario-A and Scenario-B inside it

Scenario-A  Scenario-B


## Clean Data
- drop columns 'Source IP' and 'Destination IP'
    - these features do not provide packet properties that aid in traffic classification
- drop rows with values with NaN and Infinity
    - can't properly represent these values

In [21]:
def loadData(csvFile):
    pickleDump = '{}.pickle'.format(csvFile)
    if os.path.exists(pickleDump):
        df = pd.read_pickle(pickleDump)
    else:
        df = pd.read_csv(csvFile, low_memory=False)
        # clean data
        # Note: there's a leading space for each column name except for the first one
        # let's strip the whitspaces from column names
        df = df.rename(str.strip, axis='columns')
        df.drop(columns=['Source IP', 'Destination IP'], inplace=True)
        # drop missing values/NaN etc.
        df.dropna(inplace=True)
        # drop Infinity rows and NaN string from each column
        for col in df.columns:
            indexNames = df[df[col]=='Infinity'].index
            if not indexNames.empty:
                print('deleting {} rows with Infinity in column {}'.format(len(indexNames), col))
                df.drop(indexNames, inplace=True)
            indexNames = df[df[col]=='NaN'].index
            if not indexNames.empty:
                print('deleting {} rows with NaN in column {}'.format(len(indexNames), col))
                df.drop(indexNames, inplace=True)
        
        # convert  Flow Bytes/s object & Flow Packets/s object into float type
        df['Flow Bytes/s'] = df['Flow Bytes/s'].astype('float64')
        df['Flow Packets/s'] = df['Flow Packets/s'].astype('float64')
        print(df.tail())
        df.to_pickle(pickleDump)
    
    return df

## Experimenting with Scenario-A Dataset

In [22]:
dataPath = 'CSV/Scenario-A'
csvFile = os.path.join(dataPath, 'merged_5s.csv')

In [23]:
df = loadData(csvFile)

In [24]:
# check datatypes of each features
df.dtypes

Source Port           int64
Destination Port      int64
Protocol              int64
Flow Duration         int64
Flow Bytes/s        float64
Flow Packets/s      float64
Flow IAT Mean       float64
Flow IAT Std        float64
Flow IAT Max          int64
Flow IAT Min          int64
Fwd IAT Mean        float64
Fwd IAT Std         float64
Fwd IAT Max           int64
Fwd IAT Min           int64
Bwd IAT Mean        float64
Bwd IAT Std         float64
Bwd IAT Max           int64
Bwd IAT Min           int64
Active Mean           int64
Active Std            int64
Active Max            int64
Active Min            int64
Idle Mean             int64
Idle Std              int64
Idle Max              int64
Idle Min              int64
label                object
dtype: object

In [27]:
df.shape

(84192, 27)

In [26]:
# total Tor samples
print('total TOR:', len(df[df['label'] == 'TOR'].index))
print('total nonTOR: ', len(df[df['label'] == 'nonTOR'].index))

total TOR: 14508
total nonTOR:  69684


In [16]:
dep_var = 'label'
cat_names = ['Source Port', 'Destination Port', 'Protocol']
cont_names = list(set(df.columns) - set(cat_names) - set([dep_var]))

In [17]:
cont_names

['Idle Std',
 'Bwd IAT Max',
 'Fwd IAT Max',
 'Flow IAT Mean',
 'Active Std',
 'Fwd IAT Mean',
 'Flow Duration',
 'Idle Max',
 'Flow IAT Std',
 'Bwd IAT Mean',
 'Bwd IAT Std',
 'Flow Packets/s',
 'Fwd IAT Std',
 'Active Min',
 'Flow Bytes/s',
 'Bwd IAT Min',
 'Active Max',
 'Active Mean',
 'Idle Min',
 'Idle Mean',
 'Flow IAT Min',
 'Fwd IAT Min',
 'Flow IAT Max']

In [18]:
procs = [FillMissing, Categorify, Normalize]
sss = StratifiedShuffleSplit(n_splits = 1, test_size=0.2, random_state=0)
print(sss)

StratifiedShuffleSplit(n_splits=1, random_state=0, test_size=0.2,
            train_size=None)


In [28]:
for train_idx, test_idx in sss.split(df.index, df[dep_var]):
    data_fold = (TabularList.from_df(df, path=dataPath, cat_names=cat_names, cont_names=cont_names, procs=procs)
                     .split_by_idxs(train_idx, test_idx)
                     .label_from_df(cols=dep_var)
                     .databunch())
    # create model and learn
    model = tabular_learner(data_fold, layers=[50, 20], metrics=accuracy, callback_fns=ShowGraph)
    model.fit_one_cycle(cyc_len=10) #
    model.save('{}.model'.format(os.path.basename(csvFile)))

epoch,train_loss,valid_loss,accuracy,time


KeyboardInterrupt: 

In [None]:
loss, acc = model.validate()
print('loss {}: accuracy: {:.2f}%'.format(loss, acc*100))

In [None]:
preds, y, losses = model.get_preds(with_loss=True)
interp = ClassificationInterpretation(model, preds, y, losses)
interp.plot_confusion_matrix()

## Experiment with Scenario-B Dataset

In [None]:
dataPath = 'CSV/Scenario-B'
csvFile = os.path.join(dataPath, 'merged_5s.csv')
df = loadData(csvFile)

In [None]:
df.dtypes

In [None]:
# see all the labels
labels = set(df['label'])
print('all the labels:', labels)
# data districution for each label
for label in labels:
    print('total {} = {}'.format(label, len(df[df['label'] == label].index)))


In [None]:
df.shape

In [None]:
def experiment(df):
    procs = [FillMissing, Categorify, Normalize]
    sss = StratifiedShuffleSplit(n_splits = 1, test_size=0.2, random_state=0)
    for train_idx, test_idx in sss.split(df.index, df[dep_var]):
        data_fold = (TabularList.from_df(df, path=dataPath, cat_names=cat_names, cont_names=cont_names, procs=procs)
                         .split_by_idxs(train_idx, test_idx)
                         .label_from_df(cols=dep_var)
                         .databunch())
        # create model and learn
        model = tabular_learner(data_fold, layers=[50, 20], metrics=accuracy, callback_fns=ShowGraph)
        model.fit_one_cycle(cyc_len=10) # learn for 10 epochs
        model.save('{}.model'.format(os.path.basename(csvFile)))
    return model

In [None]:
model = experiment(df)

In [None]:
loss, acc = model.validate()
print('loss {}: accuracy: {:.2f}%'.format(loss, acc*100))

In [None]:
def drawConfusionMatrix(model):
    preds, y, losses = model.get_preds(with_loss=True)
    interp = ClassificationInterpretation(model, preds, y, losses)
    interp.plot_confusion_matrix()

In [None]:
drawConfusionMatrix(model)