In [None]:
%%capture
import csv
import time
from datetime import datetime
from os import getcwd, listdir, makedirs, mkdir, path, remove, system
from shutil import move, rmtree

import scipy
import numpy as np
from numpy import array
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow.compat.v1 as tf
import sklearn as sk
import keras
from keras.models import Sequential, load_model
from keras.layers import LSTM, Masking
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Will clear tensorflow graph (so that brand new model is created)
# NOTE(review): presumably these two calls discard state left behind by an earlier
# run of the notebook in the same kernel - confirm against the TF version in use.
tf.keras.backend.clear_session()
tf.reset_default_graph()
# Ask TensorFlow's logger for ERROR-level verbosity only
tf.logging.set_verbosity(tf.logging.ERROR)

# Remove last round's output files/stats
def rotate_test_outputs(current_dir=path.join('tests', 'current'),
                        previous_dir=path.join('tests', 'previous')):
    """Replace the contents of `previous_dir` with the contents of `current_dir`.

    Pure-Python equivalent of
    ``rm -rf previous_dir/*; mv current_dir/* previous_dir`` that also creates
    both directories when they are missing, so later code can rely on
    `current_dir` existing. Like the shell glob ``*``, entries whose name starts
    with a dot (e.g. ``.gitkeep``) are left untouched.

    Args:
        current_dir: Directory holding the outputs of the most recent run.
        previous_dir: Directory that receives those outputs.
    """
    makedirs(previous_dir, exist_ok=True)
    makedirs(current_dir, exist_ok=True)

    # Empty the archive of the run before last
    for name in listdir(previous_dir):
        if name.startswith('.'):
            continue
        stale = path.join(previous_dir, name)
        # Symlinks are unlinked, never followed into
        if path.isdir(stale) and not path.islink(stale):
            rmtree(stale)
        else:
            remove(stale)

    # Archive whatever the last run left behind
    for name in listdir(current_dir):
        if name.startswith('.'):
            continue
        move(path.join(current_dir, name), path.join(previous_dir, name))


rotate_test_outputs()

In [None]:
# Read the feature table (path is resolved against the current working directory)
features_csv = path.join(getcwd(), "training_data/features.csv")
df = pd.read_csv(features_csv)

# Swap each TLD value for its integer category code
tld_as_category = df["tld"].astype("category")
df["tld"] = tld_as_category.cat.codes

# Columns that label or identify a row and therefore must not be rescaled
id_columns = ['classification', 'sample', 'redir_no']

# Min-max scale every remaining column (scale data between 0 and 1)
scaler = MinMaxScaler()
features_to_scale = df.drop(columns=id_columns)
normalised = pd.DataFrame(
    scaler.fit_transform(features_to_scale),
    columns=features_to_scale.columns,
    index=features_to_scale.index,
)

# Rebuild the dataframe: untouched id columns first, scaled features after
df = pd.concat([df[id_columns], normalised], axis=1)

In [None]:
# Pad out the groups e.g. if max number of nodes is 50, pad out each group until it has 50 rows.
# Missing (sample, redir_no) combinations become rows filled with 0 in every column,
# including 'classification'.
df_padded = df.set_index(['sample','redir_no']).unstack(fill_value=0).stack(dropna=False).reset_index('sample')

# Number of samples
num_of_samples = df_padded['sample'].nunique()
# After padding every sample owns the same number of rows; the reshape below relies on it
assert len(df_padded) % num_of_samples == 0, "padded groups are not all the same length"
# Find the max number of nodes in any chain
max_nodes = int(len(df_padded) / num_of_samples)
# Number of features per chain
features_per_node = len(df.columns) - 3 # -3 as classification + sample + redir_no will be dropped later

# Sample names in the order their blocks of 'max_nodes' rows appear in df_padded
sample_order = df_padded['sample'].iloc[0::max_nodes].values
# Assign Y to equal classification column (0/1), one label per sample.
# The label is read from the un-padded frame: the first row of a padded group is a
# zero-filled padding row (classification 0) whenever that chain lacks the lowest
# redir_no, so reading it from df_padded could silently mislabel the sample.
labels = df.groupby('sample')['classification'].first().reindex(sample_order)
y = pd.DataFrame({'classification': labels.values, 'sample': sample_order})

# Assign X to equal the remaining columns (features)
X = df_padded.drop(['classification', 'sample'], axis=1)

# Reshape the rows: samples/time_steps/features
X = array(X).reshape(num_of_samples, max_nodes, features_per_node)

# Backup y as we may want to access sample name
classifications = y.copy()
# Drop the sample names from y (we only want classification)
y = y.drop(['sample'], axis=1)

In [None]:
def split_data(seed):
    """Split the global ``X``/``y`` into train, validation and test partitions.

    The test partition (``test_set_size`` of the data) is held out first; the
    validation partition (``val_set_size`` of what is left) is then taken from
    the remainder. Both shuffled splits use the same ``seed``.

    Args:
        seed: Value passed as ``random_state`` to both splits.

    Returns:
        tuple: ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    # First cut: everything that is not test data
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=test_set_size, random_state=seed, shuffle=True
    )

    # Second cut: divide the remainder into training and validation data
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=val_set_size, random_state=seed, shuffle=True
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

In [None]:
def create_model(halve_nodes):
    """Build, train and evaluate one stacked-LSTM binary classifier.

    Besides ``halve_nodes`` the function reads these notebook globals, which
    must be set before calling: ``model_name``, ``test_dir``, ``max_nodes``,
    ``features_per_node``, ``hidden_nodes``, ``num_of_layers``, ``dropout``,
    ``patience_no``, ``epochs_no``, ``X_train``/``y_train``, ``X_val``/``y_val``
    and ``X_test``/``y_test``.

    Args:
        halve_nodes: When equal to 1, each LSTM layer after the first gets half
            the units of the layer before it (never fewer than 1); any other
            value gives every layer ``hidden_nodes`` units.

    Returns:
        tuple: ``(stats, model, results, history)``. ``stats`` is a dict of
        metrics (multiplied by 100 and rounded to 3 decimals) plus
        ``neurons_per_layer``, ``best_epoch`` and ``stopped_epoch`` (both
        0-based indices into the training history); ``model`` is the model that
        was evaluated; ``results`` is the output of ``model.evaluate`` on the
        test split; ``history`` is the object returned by ``model.fit``.
    """
    # Create a sequential model
    model = Sequential(name=model_name)

    # Add masking layer to ignore all timesteps where every value equals 0
    model.add(Masking(mask_value=0., input_shape=(max_nodes, features_per_node)))

    # Used to divide number of nodes per layer if required
    temp_nodes = hidden_nodes
    neurons = [] # Store the number of hidden nodes used in each layer

    # Every LSTM layer except the last one returns the full sequence so that the
    # next LSTM layer still receives one input per timestep.
    # (input_shape is only given on the first layer of the model, the Masking layer.)
    for _ in range(num_of_layers-1):
        neurons.append(temp_nodes)
        # Uses 'Tanh' activation function by default
        model.add(LSTM(temp_nodes, return_sequences=True))
        if halve_nodes == 1:
            # If we can still halve the temp nodes, do so
            if int(temp_nodes / 2) >= 1:
                temp_nodes = int(temp_nodes / 2)
        # Add dropout to prevent overfitting
        model.add(Dropout(dropout))

    # Final layer (don't return sequences)
    # Uses 'Tanh' activation function by default
    model.add(LSTM(temp_nodes))
    # Add the final neurons value
    neurons.append(temp_nodes)
    # Add dropout to prevent overfitting
    model.add(Dropout(dropout))

    # Classification problem, Dense output layer with a single neuron and sigmoid activation function to make 0/1 predictions
    model.add(Dense(1))

    # Add activation layer - 'sigmoid' for binary classification (backed up by: https://www.quora.com/Why-is-it-better-to-use-Softmax-function-than-sigmoid-function)
    model.add(Activation('sigmoid'))

    # Classification problem, cross entropy - https://machinelearningmastery.com/how-to-choose-loss-functions-when-training-deep-learning-neural-networks/
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Early stopping can be used to interrupt training when the best validation loss has not improved for 'patience_no' epochs
    early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience_no, restore_best_weights=True)

    # Model checkpoint checks after every epoch and only overwrites the saved model when val_loss improved
    model_checkpoint = keras.callbacks.ModelCheckpoint(filepath=test_dir + 'model.h5', monitor='val_loss', save_best_only=True, save_weights_only=False)

    # Print the model summary
    # model.summary()

    # We don't specify batch size for sequences - https://keras.io/models/model/#fit
    # validation_data is passed as a tuple, the form documented by Keras
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs_no, callbacks=[early_stopping, model_checkpoint], shuffle=True, verbose=0)

    # If early stopping didn't occur..
    if early_stopping.stopped_epoch == 0:
        # The checkpoint on disk is the epoch with the lowest validation loss
        model = load_model(test_dir + 'model.h5')
        # Report the epoch the loaded model actually comes from, not simply the
        # last epoch, so that the '*_best' statistics describe the evaluated model
        best_epoch = int(np.nanargmin(history.history['val_loss']))
        stopped_epoch = epochs_no - 1
    else:
        # Set the model to equal the best model we found during training
        model = early_stopping.model
        best_epoch = early_stopping.stopped_epoch - patience_no
        stopped_epoch = early_stopping.stopped_epoch

    # Final evaluation of the model: results[0] is the loss, results[1] the accuracy
    results = model.evaluate(X_test, y_test, verbose=0)

    # Lets get some stats
    stats = {
        'test_acc' : round(results[1]*100,3),
        'test_loss' : round(results[0]*100,3),
        'train_acc_best' : round(history.history['accuracy'][best_epoch]*100,3),
        'train_acc_avg' : round(np.average(history.history['accuracy'])*100,3),
        'train_loss_best' : round(history.history['loss'][best_epoch]*100,3),
        'train_loss_avg' : round(np.average(history.history['loss'])*100,3),
        'val_acc_best' : round(history.history['val_accuracy'][best_epoch]*100,3),
        'val_acc_avg' : round(np.average(history.history['val_accuracy'])*100,3),
        'val_loss_best' : round(history.history['val_loss'][best_epoch]*100,3),
        'val_loss_avg' : round(np.average(history.history['val_loss'])*100,3),
        'neurons_per_layer': neurons,
        'best_epoch': best_epoch,
        'stopped_epoch': stopped_epoch
    }

    # Print test stats
    print("=================================\n" + str(num_of_layers) + " Layers, " + str(stats['neurons_per_layer']) + " Nodes")
    print("Best Epoch: " + str(stats['best_epoch']))
    print("Validation Accuracy: %.2f%%" % (stats['val_acc_best']))
    print("Validation Loss: %.2f%%" % (stats['val_loss_best']))
    print("Test Accuracy: %.2f%%" % (stats['test_acc']))
    print("Test Loss: %.2f%%\n" % (stats['test_loss']))

    return stats, model, results, history

In [None]:
# Log the ML training result to CSV
def log_result():
    """Append the statistics of the current run as one row of ``results/results.csv``.

    Reads the notebook globals ``stats``, ``y_train``, ``y_val``, ``y_test``,
    ``hidden_nodes``, ``num_of_layers``, ``patience_no``, ``dropout`` and
    ``epochs_no``. ``stats`` must already contain ``'train_time'``.

    The ``results`` directory is created when missing, and the heading row is
    written first whenever the file does not exist yet or is empty.
    """
    results_file = 'results/results.csv'

    headings = ['date_time', 'test_acc', 'test_loss', 'val_acc_best', 'val_acc_avg', 'val_loss_best', 'val_loss_avg', 'train_acc_best', 'train_acc_avg', 'train_loss_best', 'train_loss_avg', 'train_size', 'val_size' , 'test_size', 'max_neurons', 'neurons_per_layer', 'layers', 'patience', 'dropout', 'max_epochs', 'stopped_epoch', 'best_epoch', 'train_time']

    # One value per heading, in the same order
    row = [datetime.now().strftime("%Y-%m-%d_%H:%M:%S"), stats['test_acc'], stats['test_loss'], stats['val_acc_best'], stats['val_acc_avg'], stats['val_loss_best'], stats['val_loss_avg'], stats['train_acc_best'], stats['train_acc_avg'], stats['train_loss_best'], stats['train_loss_avg'], len(y_train), len(y_val), len(y_test), hidden_nodes, stats['neurons_per_layer'], num_of_layers, patience_no, dropout, epochs_no, stats['stopped_epoch'], stats['best_epoch'], stats['train_time']]

    # Without the directory open() would fail on the very first run
    makedirs(path.dirname(results_file), exist_ok=True)

    # A missing or empty file still needs its heading row
    needs_headings = not path.isfile(results_file) or path.getsize(results_file) == 0

    # newline='' hands line-ending control to the csv module (otherwise blank
    # rows appear between records on platforms that translate '\n')
    with open(results_file, 'a', newline='') as f:
        writer = csv.writer(f, delimiter=',')
        if needs_headings:
            writer.writerow(headings)
        # Write results as a new row
        writer.writerow(row)

In [None]:
# Size of test set
test_set_size = 0.15
# This is actually 0.15 of overall data but because the test set has already been removed, must increase to 0.1765
val_set_size = 0.1765 # 0.1764705882352941 x 0.85 = 0.15

# Number of units - https://ai.stackexchange.com/questions/3156/how-to-select-number-of-hidden-layers-and-number-of-memory-cells-in-an-lstm
# https://www.researchgate.net/post/How_should_I_choose_the_optimum_number_for_the_neurons_in_the_input_hidden_layer_for_a_recurrent_neural_network
# https://towardsdatascience.com/choosing-the-right-hyperparameters-for-a-simple-lstm-using-keras-f8e9ed76f046
# https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw
# TODO: Tried this formula as per one of links above had better results using max_nodes
# hidden_nodes = int(2/3 * (max_nodes * features_per_node))
hidden_nodes_loop = [0,50] # [0,100] == 1-100, [49,100] == 50-100, [1,2] == 2

# Helps prevent overfitting - typically in range 0.2-0.5 (0.x probability that each feature will be dropped)
# Works well because model can't rely on any single feature too much (they get randomly dropped)
dropout = 0.2

# How many LSTM layers?
num_of_layers_loop = [0,5] # [0,1] == 1 layer, [1,2] == 2 layer, [0,2] == 1+2 layer etc.

# Epoch - https://machinelearningmastery.com/difference-between-a-batch-and-an-epoch/
epochs_no = 500

# Patience is used in earlystopping - Has the model improved in last 'patience_no' epochs?
patience_no = 100

# Can use this to divide hidden nodes as number of layers increase (HALF-NODES)
# e.g. for 3 layers, 40 nodes: first layer has 40 nodes, second layer has 20, third layer has 10
half_nodes_loop = [0,1] # change between [0,1], [1,2] and [0,2] for division of layers (dont divide, divide, both)

# How many iterations of the same config
iterations = 3

# Change this to range of whatever variable we want to iterate over
# Or add embedded loops to iterate over multiple variables
# NOTE: X_train..y_test, num_of_layers, hidden_nodes, model_name, test_dir and stats are
# deliberately module-level names: create_model() and log_result() read them as globals.
for i in range(0, iterations):
    # Create a new training/validation/test distribution
    X_train, X_val, X_test, y_train, y_val, y_test = split_data(i + 1)
    for j in range(num_of_layers_loop[0], num_of_layers_loop[1]):
        num_of_layers = j + 1
        for k in range(hidden_nodes_loop[0], hidden_nodes_loop[1]): # change the 0 if only testing 1 layer
            hidden_nodes = k + 1
            for l in range(half_nodes_loop[0], half_nodes_loop[1]):
                # If using halve_nodes, we only want to process if we can divide the nodes in half
                # And, if there is more than 1 layer
                if not (l == 0 or (l == 1 and int(hidden_nodes / 2) > 1 and num_of_layers > 1)):
                    continue

                # Create the test directory
                model_name = str(num_of_layers) + '_layers_' + str(hidden_nodes) + '_nodes_' + str(l) + '_halve_' + str(i)
                test_dir = 'tests/current/' + model_name + "/"
                # Start from an empty directory: a leftover from an interrupted run must
                # not make this fail, and its old model.h5 must never be loaded by mistake.
                # makedirs also creates 'tests/current' itself when it is missing.
                rmtree(test_dir, ignore_errors=True)
                makedirs(test_dir)

                try:
                    t_start = time.time() # Track time of modelling
                    stats, model, results, history = create_model(l) # Create and run the model
                    stats['train_time'] = round((time.time() - t_start),3) # Store time taken

                    log_result() # Log statistics of this result to CSV

                    del stats, model, results, history # Clear variables
                finally:
                    # Runs even when training or logging raised, so a failed
                    # configuration leaves neither session state nor files behind
                    tf.keras.backend.clear_session() # Clear any ML stuff
                    rmtree(test_dir, ignore_errors=True) # Remove the test directory