In [1]:
%%capture
from os import path, getcwd, system, mkdir
from datetime import datetime
from shutil import rmtree
import csv
import numpy as np
from numpy import array
import pandas as pd
import tensorflow.compat.v1 as tf
import sklearn as sk
import keras
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers import LSTM, Masking
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from sklearn.model_selection import GridSearchCV, cross_validate, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler

# Start from a clean TensorFlow/Keras state so that any model built later in
# this notebook is brand new rather than added to a graph left over in the kernel.
tf.keras.backend.clear_session()
tf.reset_default_graph()
# Only emit TensorFlow log messages at ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)

In [2]:
# Read the feature table from ./training_data (relative to the working directory)
features_csv = path.join(getcwd(), "training_data/features.csv")
df = pd.read_csv(features_csv)

# Replace each TLD value with the integer code of its pandas category
tld_as_category = df["tld"].astype("category")
df["tld"] = tld_as_category.cat.codes

# Scaler used below to squeeze every selected feature into the range 0..1
scaler = MinMaxScaler()

# All features: every column except the label and the two grouping keys
features_to_scale = df.drop(columns=['classification', 'sample', 'redir_no'])

# Redirect features
#features_to_scale = df.copy().drop(['classification', 'sample', 'redir_no', 'requests_no', 'port_80', 'domain_is_ip', 'domain_len_avg', 'domain_entropy_avg', 'uri_len_avg', 'uri_entropy_avg', 'uri_ch_slash_total', 'uri_ch_slash_avg', 'uri_ch_amp_total', 'uri_ch_amp_avg','uri_ch_dash_total', 'uri_ch_dash_avg', 'uri_ch_plus_total', 'uri_ch_plus_avg', 'response_len_total', 'response_len_avg', 'bytes_shockwave_total', 'bytes_shockwave_avg', 'bytes_x-dosexec_total', 'bytes_x-dosexec_avg', 'bytes_java_total', 'bytes_java_avg', 'bytes_silverlight_total', 'bytes_silverlight_avg', 'bytes_javascript_total', 'bytes_javascript_avg', 'bytes_xml_total', 'bytes_xml_avg', 'bytes_zip_total', 'bytes_zip_avg', 'bytes_image_total', 'bytes_image_avg', 'bytes_html_total', 'bytes_html_avg', 'tld'], axis=1)

# URL features
#features_to_scale = df.copy().drop(['classification', 'sample', 'redir_no', 'redir_time', 'node_depth', 'requests_no', 'response_len_total', 'response_len_avg', 'bytes_shockwave_total', 'bytes_shockwave_avg', 'bytes_x-dosexec_total', 'bytes_x-dosexec_avg', 'bytes_java_total', 'bytes_java_avg', 'bytes_silverlight_total', 'bytes_silverlight_avg', 'bytes_javascript_total', 'bytes_javascript_avg', 'bytes_xml_total', 'bytes_xml_avg', 'bytes_zip_total', 'bytes_zip_avg', 'bytes_image_total', 'bytes_image_avg', 'bytes_html_total', 'bytes_html_avg', 'redir_referrer', 'redir_location', 'redir_html', 'redir_js', 'redir_iframe','redir_subdomain', 'redir_concat', 'redir_base64', 'redir_unknown'], axis=1)

# Content features
#features_to_scale = df.copy().drop(['classification', 'sample', 'redir_no', 'redir_time', 'node_depth', 'port_80', 'domain_is_ip', 'domain_len_avg', 'domain_entropy_avg', 'uri_len_avg', 'uri_entropy_avg', 'uri_ch_slash_total', 'uri_ch_slash_avg', 'uri_ch_amp_total', 'uri_ch_amp_avg','uri_ch_dash_total', 'uri_ch_dash_avg', 'uri_ch_plus_total', 'uri_ch_plus_avg', 'redir_referrer', 'redir_location', 'redir_html', 'redir_js', 'redir_iframe','redir_subdomain', 'redir_concat', 'redir_base64', 'redir_unknown', 'tld'], axis=1)

# Content features - without totals (only averages)
#features_to_scale = df.copy().drop(['classification', 'sample', 'redir_no', 'redir_time', 'node_depth', 'port_80', 'domain_is_ip', 'domain_len_avg', 'domain_entropy_avg', 'uri_len_avg', 'uri_entropy_avg', 'uri_ch_slash_total', 'uri_ch_slash_avg', 'uri_ch_amp_total', 'uri_ch_amp_avg','uri_ch_dash_total', 'uri_ch_dash_avg', 'uri_ch_plus_total', 'uri_ch_plus_avg', 'redir_referrer', 'redir_location', 'redir_html', 'redir_js', 'redir_iframe','redir_subdomain', 'redir_concat', 'redir_base64', 'redir_unknown', 'tld', 'response_len_total', 'bytes_shockwave_total', 'bytes_x-dosexec_total', 'bytes_java_total', 'bytes_silverlight_total',  'bytes_javascript_total', 'bytes_xml_total', 'bytes_zip_total', 'bytes_image_total', 'bytes_html_total'], axis=1)

# Fit the scaler on the selected features, keeping the original row and column labels
scaled_values = scaler.fit_transform(features_to_scale)
normalised = pd.DataFrame(
    scaled_values,
    index=features_to_scale.index,
    columns=features_to_scale.columns,
)

# Put the unscaled label/key columns back in front of the scaled features
key_columns = df[['classification', 'sample', 'redir_no']]
df = pd.concat([key_columns, normalised], axis='columns')

In [3]:
# Give every sample the same number of rows: pivot redir_no into columns (absent
# nodes are filled with 0), then stack it back so each sample has one row per
# redir_no value, e.g. if the longest chain has 50 nodes every sample gets 50 rows.
df_padded = (
    df.set_index(['sample', 'redir_no'])
    .unstack(fill_value=0)
    .stack(dropna=False)
    .reset_index('sample')
)

# How many distinct samples there are
sample_groups = df_padded.groupby('sample')
num_of_samples = len(sample_groups)
# Rows per sample after padding, i.e. the node count of the longest chain
max_nodes = len(df_padded) // num_of_samples
# Feature columns per node (classification, sample and redir_no are not features and get dropped later)
features_per_node = df.shape[1] - 3

# y: one (classification, sample) row per sample, read from the first row of each padded group
# NOTE(review): that first row is zero-filled when a sample has no node at the lowest redir_no,
# which would silently label it 0 - confirm every chain contains that redir_no.
first_row_of_each_sample = df_padded[0::max_nodes]
y = first_row_of_each_sample[['classification', 'sample']].copy()
# X: all remaining (feature) columns
X = df_padded.drop(columns=['classification', 'sample'])

In [4]:
# Turn the flat feature table into a 3-D array: (samples, time steps, features)
tensor_shape = (num_of_samples, max_nodes, features_per_node)
X = np.array(X).reshape(tensor_shape)

# Keep a renumbered copy of y (label + sample name) so a sample name can still be looked up
classifications = y.reset_index(drop=True).copy()

# Strip the sample names so that only the classification is left in y
y = y.drop(columns='sample')
# GridSearchCV needs a numpy array: one row per sample, a single label column
y = np.array(y).reshape((num_of_samples, 1))

In [5]:
def build_model(neurons=1, layers=1, dropout=0.2):
    """Build and compile a stacked-LSTM model for binary classification.

    Args:
        neurons: Number of units in every LSTM layer.
        layers: Number of LSTM layers. All but the last return full
            sequences; one (final) LSTM layer is always added.
        dropout: Rate given to the Dropout layer placed after each LSTM layer.

    Returns:
        A compiled ``Sequential`` model (binary cross-entropy loss, Adam
        optimizer, accuracy metric) ending in a single sigmoid output.

    The input shape is read from the notebook-level ``max_nodes`` and
    ``features_per_node`` variables.
    """
    input_dims = (max_nodes, features_per_node)

    model = Sequential()

    # Skip every timestep whose values are all 0
    model.add(Masking(mask_value=0., input_shape=input_dims))

    # The closing LSTM is always present, even when fewer than one layer is requested
    lstm_count = max(layers, 1)
    for position in range(lstm_count):
        # Only the last LSTM collapses the sequence; earlier ones must pass the
        # whole sequence on to the next LSTM. LSTM uses tanh activation by default.
        emits_sequence = position < lstm_count - 1
        model.add(LSTM(neurons, return_sequences=emits_sequence, input_shape=input_dims))
        # Dropout after each LSTM to limit overfitting
        model.add(Dropout(dropout))

    # Single output neuron followed by a sigmoid, giving a 0/1 prediction
    # (see https://www.quora.com/Why-is-it-better-to-use-Softmax-function-than-sigmoid-function)
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    # Cross-entropy loss for a classification problem -
    # https://machinelearningmastery.com/how-to-choose-loss-functions-when-training-deep-learning-neural-networks/
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [6]:
# Log the ML training result to CSV
def log_result(layers, neurons, epochs, dropout, accuracy, precicion, recall, f1, rank, fit_time, test_time):
    """Append one training result as a row of ``results/results.csv``.

    The ``results`` directory is created if it is missing, and the heading
    row is written first whenever the file does not exist yet or is empty.

    Args:
        layers, neurons, epochs, dropout: Hyper-parameters of the run,
            written as given.
        accuracy, precicion, recall, f1: Scores of the run, rounded to 3
            decimals. (``precicion`` keeps its original spelling so existing
            keyword callers continue to work; it is the precision score.)
        rank: Rank of this parameter combination, written as given.
        fit_time, test_time: Timings, rounded to 3 decimals.

    Also reads the notebook-level ``X`` (its length is logged as
    ``data_size``), ``test_split`` and ``val_split``.
    """
    results_dir = 'results'
    results_file = 'results/results.csv'

    # Make sure the output directory exists, otherwise open() below fails
    if not path.isdir(results_dir):
        mkdir(results_dir)

    # A missing or empty file still needs its heading row
    needs_header = not path.isfile(results_file) or path.getsize(results_file) == 0

    row = [
        datetime.now().strftime("%Y-%m-%d_%H:%M:%S"),
        round(accuracy, 3),
        round(precicion, 3),
        round(recall, 3),
        round(f1, 3),
        len(X),
        test_split,
        val_split,
        neurons,
        layers,
        dropout,
        epochs,
        rank,
        round(fit_time, 3),
        round(test_time, 3),
    ]

    # newline='' lets the csv module control line endings (avoids blank rows on Windows)
    with open(results_file, 'a', newline='') as f:
        writer = csv.writer(f, delimiter=',')
        if needs_header:
            writer.writerow(['date_time', 'accuracy', 'precision', 'recall', 'f1', 'data_size', 'test_folds', 'val_folds', 'neurons', 'layers', 'dropout', 'epochs', 'rank', 'fit_time', 'test_time'])
        writer.writerow(row)

In [7]:
# Fold counts: outer (test) cross-validation and inner (validation) grid search
test_split = 5
val_split = 5

# LSTM layer counts to try, given as an inclusive [first, last] range
num_of_layers = [1, 1]
layers = [*range(num_of_layers[0], num_of_layers[1] + 1)]
# layers = layers[5:] # Only want the upper part of the layer range?

# Hidden neuron counts to try, given as an inclusive [first, last] range
num_of_nodes = [1, 1]
neurons = [*range(num_of_nodes[0], num_of_nodes[1] + 1)]
# neurons = neurons[20:] # Only want the upper part of the neuron range?

# Dropout rates to try - typically in range 0.2-0.5 (0.x probability that each feature will be dropped).
# Helps prevent overfitting because the model can't lean on any single feature too much (they get randomly dropped).
dropout = [0.2]

# Epoch counts to try, given as an inclusive [first, last] range
# (https://machinelearningmastery.com/difference-between-a-batch-and-an-epoch/)
num_of_epochs = [267, 267]
epochs = [*range(num_of_epochs[0], num_of_epochs[1] + 1)]

# Every combination of these values is evaluated by the grid search
param_grid = {
    'neurons': neurons,
    'layers': layers,
    'epochs': epochs,
    'dropout': dropout,
}

# Metrics collected for every fold
scoring = ['accuracy', 'precision', 'recall', 'f1']

# scikit-learn wrapper that builds a fresh Keras model through build_model
model = KerasClassifier(build_fn=build_model)

# Grid search over param_grid with 'val_split' stratified folds; the best model is chosen by F1
inner_cv = StratifiedKFold(n_splits=val_split, shuffle=True, random_state=23)
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=inner_cv,
    n_jobs=-1,
    scoring=scoring,
    refit='f1',
    verbose=10,
)

# # We don't specify batch size for sequences - https://keras.io/models/model/#fit
# grid.fit(X, y, shuffle=True, verbose=1)

# # Print best score
# print('\nBest parameters: ' + str(grid.best_params_))
# print('Best score: %0.3f' % (grid.best_score_ * 100) + '%')

# # Print results of grid search to CSV file
# for i in range(len(grid.cv_results_['params'])):
#     log_result(grid.cv_results_['params'][i]['layers'], grid.cv_results_['params'][i]['neurons'], grid.cv_results_['params'][i]['epochs'], grid.cv_results_['params'][i]['dropout'], grid.cv_results_['mean_test_accuracy'][i], grid.cv_results_['mean_test_precision'][i], grid.cv_results_['mean_test_recall'][i], grid.cv_results_['mean_test_f1'][i], grid.cv_results_['rank_test_f1'][i], grid.cv_results_['mean_fit_time'][i], grid.cv_results_['mean_score_time'][i])

# Outer cross-validation with 'test_split' stratified folds: the grid search is
# the estimator, so it is re-fitted on the training part of every outer fold.
outer_cv = StratifiedKFold(n_splits=test_split, shuffle=True, random_state=23)
results = cross_validate(estimator=grid, X=X, y=y, cv=outer_cv, n_jobs=-1, scoring=scoring, verbose=1)

# Display name and cross_validate result key of every reported metric
metric_labels = [
    ('Accuracy', 'test_accuracy'),
    ('Precision', 'test_precision'),
    ('Recall', 'test_recall'),
    ('F1', 'test_f1'),
]

# Per-fold scores
print()
for label, key in metric_labels:
    print(label + " " + str(results[key]))

# Mean score over the folds, as a percentage
print()
for label, key in metric_labels:
    print("Average " + label + ": " + str(round(np.average(results[key] * 100), 3)))

# Mean fit and scoring time per fold
print("\nAverage Fit Time: " + str(round(np.average(results['fit_time']), 3)) + " secs")
print("Average Test Time: " + str(round(np.average(results['score_time']), 3)) + " secs\n")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.

Accuracy [0.99442897 0.99373259 0.99233983 0.99651811 0.99512535]
Precision [0.98809524 0.98804781 0.98406375 0.99209486 0.98814229]
Recall [0.98031496 0.97637795 0.97244094 0.98818898 0.98425197]
F1 [0.98418972 0.98217822 0.97821782 0.99013807 0.98619329]

Average Acurracy: 99.443
Average Precision: 98.809
Average Recall: 98.031
Average F1: 98.418

Average Fit Time: 2764.119 secs
Average Test Time: 0.242 secs

[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 46.2min finished
