# Keras - Deep Learning on Scenario A

Here, we will apply deep neural networks in an attempt to differentiate between Tor and nonTor data from the ISCXTor2016 dataset.

In [1]:
# DataFrame handling
import pandas as pd

# keras Models
from keras.models import Sequential
from keras.layers import Dense

# sklearn Models
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Split data with stratified cv
from sklearn.model_selection import StratifiedKFold

# Encoding of classifications
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

print('Imports complete.')

Using TensorFlow backend.


Imports complete.


In [2]:
# Set up a few constants to keep track of
# Seed passed to StratifiedKFold (fold shuffling) and DecisionTreeClassifier in the training loop
random_state=1
# Directory prefix that is concatenated with each CSV filename before loading
path='../../tor_dataset/Scenario-A/'
# Name of the label column that get_Xy separates from the feature columns
dep_var = 'class'
# Number of label classes.
# NOTE(review): initialised to 0 before any data is read -- confirm which cell is expected to update it
num_classes=0

In [3]:
def get_Xy(filename='', verbose=False):
    """
        Load a labelled csv file and separate the feature columns from the classification column.

        The column named by the module-level dep_var is integer-encoded with a LabelEncoder;
        every other column is returned unchanged as the feature data.

        args:
            filename => str, path to csv file to be loaded
            verbose  => bool, when True print the head of the raw frame, the
                        label-to-integer mapping and a preview of X and y

        returns:
            tuple(X, y) => X: DataFrame holding every column except dep_var
                           y: array of integer-encoded classifications, one per row

        side effects:
            sets the module-level num_classes to the number of distinct labels
            found in this file
    """
    # Without this declaration the assignment below would only bind a local
    # name and the module-level num_classes would never change.
    global num_classes

    df = pd.read_csv(filename)

    if verbose:
        print('Before encoding and splitting:')
        print(df.head())

    # Actual data
    X = df.loc[:, df.columns != dep_var]

    # Classifications
    encoder = LabelEncoder()
    y = encoder.fit_transform(df[dep_var])

    # Set number of classes we see; taken from the fitted encoder so it always
    # agrees with the integer codes stored in y
    num_classes = len(encoder.classes_)

    if verbose:
        print('Classification encoding:')
        for i, label in enumerate(encoder.classes_):
            print('\t{} => {}'.format(i, label))

        print('After encoding and splitting:')
        print('X = ')
        print(X.head())
        print('\ny = ')
        print(y[:5])

    # X holds the data while y holds the classifications
    return X, y

In [4]:
# All of the data files
files=['TimeBasedFeatures-15s-TOR-NonTOR.csv', 
       'TimeBasedFeatures-30s-TOR-NonTOR.csv', 
       'TimeBasedFeatures-60s-TOR-NonTOR.csv', 
       'TimeBasedFeatures-120s-TOR-NonTOR.csv']

# Mean cross-validated accuracy per model, one entry per file (same order as `files`).
# Re-created on every run of this cell so re-running does not accumulate stale entries.
list_dummy = []
list_dt = []
list_knn = []
list_dnn = []

# Number of folds for the Stratified Cross-Validation
n_splits = 10


def _build_dnn(n_features, n_outputs):
    """
        Build and compile a fresh, untrained feed-forward classifier.

        args:
            n_features => int, number of input columns
            n_outputs  => int, width of the softmax output layer (one unit per class)

        returns:
            compiled keras Sequential model
    """
    model = Sequential([
        Dense(64, input_shape=(n_features,)),
        Dense(32, activation='relu'),
        Dense(n_outputs, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


for file in files:
    print('Training for {}...'.format(file), end='')
    
    # Load in the data
    X, y = get_Xy(path + file)
    
    # Size the network from the data that was actually loaded.
    # y holds integer codes starting at 0, so the largest code + 1 is the class count.
    n_features = X.shape[1]
    n_classes = int(y.max()) + 1
    
    # Per-fold accuracies for each model
    scores_dummy = []
    scores_dt = []
    scores_knn = []
    scores_dnn = []
    
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for train_idxs, test_idxs in skf.split(X, y):
        # Define the training and testing sets
        X_train, X_test = X.iloc[train_idxs], X.iloc[test_idxs]
        y_train, y_test = y[train_idxs], y[test_idxs]
        
        # One-hot versions of the labels for the Deep Neural Network.
        # The width is given explicitly so it always matches the softmax layer.
        y_train_dnn = to_categorical(y_train, num_classes=n_classes)
        y_test_dnn = to_categorical(y_test, num_classes=n_classes)
        
        # Initialize the models; new instances per fold so nothing carries over
        dummy = DummyClassifier(strategy='most_frequent')
        dt = DecisionTreeClassifier(random_state=random_state)
        knn = KNeighborsClassifier()
        dnn = _build_dnn(n_features, n_classes)
        
        # Train the models.
        # No validation_data is passed to the network: the held-out fold is
        # scored exactly once by evaluate() below.
        dummy.fit(X_train, y_train)
        dt.fit(X_train, y_train)
        knn.fit(X_train, y_train)
        dnn.fit(x=X_train, y=y_train_dnn, epochs=1, batch_size=20, verbose=0)
        
        # Evaluate the models on the held-out fold
        scores_dummy.append(dummy.score(X_test, y_test))
        scores_dt.append(dt.score(X_test, y_test))
        scores_knn.append(knn.score(X_test, y_test))
        # evaluate() returns [loss, accuracy] because 'accuracy' is the only compiled metric
        scores_dnn.append(dnn.evaluate(X_test, y_test_dnn, verbose=0)[1])
    
    # Push the mean results from all of the splits to the lists
    list_dummy.append(sum(scores_dummy) / len(scores_dummy))
    list_dt.append(sum(scores_dt) / len(scores_dt))
    list_knn.append(sum(scores_knn) / len(scores_knn))
    list_dnn.append(sum(scores_dnn) / len(scores_dnn))
    
    print('done')
    
print('All trainings complete!')

Training for TimeBasedFeatures-15s-TOR-NonTOR.csv...done
Training for TimeBasedFeatures-30s-TOR-NonTOR.csv...done
Training for TimeBasedFeatures-60s-TOR-NonTOR.csv...done
Training for TimeBasedFeatures-120s-TOR-NonTOR.csv...done
All trainings complete!


In [5]:
# Print one row per data file with the mean accuracy of every model as a percentage
header = 'File\t\t\t\t\tDummy\tDecision Tree\tk-Nearest Neighbor\tDeep Neural Network'
row_template = '{}\t{:.2f}%\t{:.2f}%\t\t{:.2f}%\t\t\t{:.2f}%'

print(header)
print('-'*82)
for idx, name in enumerate(files):
    # Accuracies are stored as fractions; scale to percentages for display
    percentages = [100*scores[idx] for scores in (list_dummy, list_dt, list_knn, list_dnn)]
    print(row_template.format(name, *percentages))

File					Dummy	Decision Tree	k-Nearest Neighbor	Deep Neural Network
----------------------------------------------------------------------------------
TimeBasedFeatures-15s-TOR-NonTOR.csv	84.99%	99.91%		99.88%			99.90%
TimeBasedFeatures-30s-TOR-NonTOR.csv	89.22%	99.90%		99.93%			98.50%
TimeBasedFeatures-60s-TOR-NonTOR.csv	94.44%	99.94%		99.91%			99.85%
TimeBasedFeatures-120s-TOR-NonTOR.csv	95.82%	99.96%		99.92%			99.85%
