# Experiment 3: Additional Data

In [None]:
from os import chdir, getcwd

# Move up one directory unless the current working directory already ends with "gb-birp"
# (compared case-insensitively). Presumably this makes the project root the working directory so
# that the `src.*` imports and the relative log paths used below resolve — confirm against the
# repository layout.
if not getcwd().lower().endswith("gb-birp"):
    chdir("..")

In [None]:
# Settings shared by every training run in this notebook.
run_id = "additional_data"  # Suffix of each run's TensorBoard log directory.
batch_size = 64  # Mini-batch size handed to model.fit.
epochs = 50  # Upper bound on training epochs; early stopping may end a run sooner.
initial_lr = 1e-3  # Learning rate at the start of the polynomial decay schedule.
end_lr = 1e-5  # Learning rate the polynomial decay schedule ends at.

In [None]:
import math
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
%load_ext tensorboard
from tensorflow.keras import Model
from tensorflow.keras.layers import LSTM, Dropout, Dense, Concatenate, Conv1D, Flatten, Conv2D
import src.data.utils as data_utils
import src.prediction.eval_tools as eval_tools


# Fix TensorFlow's global random seed.
tf.random.set_seed(17)

# Report how many GPU devices TensorFlow can see.
print("Available GPUs: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
def get_windowified_dataset(scale_weights: bool) -> tuple:
    """
    Builds the windowed train/test inputs, one-hot labels and training sample weights.

    Training data spans 2016-01-01 through 2018-12-31 and test data spans 2019-01-01 through
    2019-12-31. Both grids are normalized via `normalize_dataset`, cut into windows of 7 input
    timesteps by `data_utils.generate_data_windows` and their labels are one-hot encoded.

    Args:
        scale_weights: Forwarded as `scale` to `data_utils.calculate_sample_weights`.

    Returns:
        The tuple `(train_inputs, train_labels, test_inputs, test_labels, sample_weights)`.
    """
    # ISO 8601 dates are unambiguous. The previous "01/01/2016" / "31/12/2018" strings relied on
    # pandas guessing month-first for one bound and day-first for the other.
    train_dates = pd.date_range("2016-01-01", "2018-12-31")
    test_dates = pd.date_range("2019-01-01", "2019-12-31")
    train_grid = get_dataset(train_dates)
    test_grid = get_dataset(test_dates)

    norm_train_grid, norm_test_grid = normalize_dataset(train_grid, test_grid)

    # Both the normalized and the raw grid are handed to the window generator; presumably the
    # inputs come from the former and the labels from the latter — confirm in data_utils.
    train_inputs, train_labels = data_utils.generate_data_windows(
        norm_train_grid, train_grid, input_timesteps=7)
    test_inputs, test_labels = data_utils.generate_data_windows(
        norm_test_grid, test_grid, input_timesteps=7)

    # One-hot encode labels.
    train_labels = one_hot_encode_labels(train_labels)
    test_labels = one_hot_encode_labels(test_labels)

    # Sample weights are computed for the training data only.
    sample_weights = data_utils.calculate_sample_weights(
        data=(train_inputs, train_labels), scale=scale_weights)

    return train_inputs, train_labels, test_inputs, test_labels, sample_weights


def get_dataset(dates: pd.DatetimeIndex) -> tuple:
    """
    Loads the dataset for `dates` through `data_utils.get_dataset`, requesting the "weather" and
    "events" auxiliary data with event encoding switched on.

    NOTE(review): the return value is annotated as a tuple, but `normalize_dataset` treats it
    like a DataFrame (`.copy(deep=True)`) — confirm the actual type in data_utils.
    """
    return data_utils.get_dataset(
        date_range=dates,
        encode_event_data=True,
        auxiliary_data=["weather", "events"],
    )


def one_hot_encode_labels(raw_labels: np.ndarray) -> np.ndarray:
    """
    Converts raw labels into two-class one-hot rows.

    A label equal to 0 becomes `[1, 0]`; any other value becomes `[0, 1]`.

    Args:
        raw_labels: One scalar label per sample (1-D, or a single-column 2-D array).

    Returns:
        An `int8` array of shape `(len(raw_labels), 2)`.
    """
    # Vectorized replacement for the per-row Python loop: column 0 flags "label == 0",
    # column 1 flags "label != 0".
    is_nonzero = np.asarray(raw_labels).ravel() != 0
    return np.column_stack((~is_nonzero, is_nonzero)).astype(np.int8)


def normalize_dataset(train_grid: pd.DataFrame,
                      test_grid: pd.DataFrame) -> tuple:
    """
    Returns normalized copies of the training and test grids as `(train, test)`.

    Break-in values of both grids are scaled with the maximum determined on the training grid
    only; weather values are then scaled by `data_utils.scale_weather_values`. The scaling
    helpers receive deep copies, not the grids passed in.
    """
    # The test grid is scaled with the training maximum as well.
    train_max = data_utils.determine_global_max(train_grid)

    scaled_train = data_utils.scale_breakin_values(
        train_grid.copy(deep=True), train_max)
    scaled_test = data_utils.scale_breakin_values(
        test_grid.copy(deep=True), train_max)

    # Weather columns are normalized for both grids in a single call.
    scaled_train, scaled_test = data_utils.scale_weather_values(
        scaled_train, scaled_test)

    return scaled_train, scaled_test

In [None]:
# Build the train/test windows, one-hot labels and training sample weights once; the training
# runs below all reuse these variables.
input_train, labels_train, input_test, labels_test, sample_weights = get_windowified_dataset(
    scale_weights=True)


In [None]:
def run_through_training_pipeline(
    log_base_directory: str,
    run_id: str,
    model: Model,
    input_train: np.ndarray,
    labels_train: np.ndarray,
    input_test: np.ndarray,
    labels_test: np.ndarray,
    sample_weights: np.ndarray,
    batch_size: int,
    epochs: int,
    initial_lr: float,
    end_lr: float,
):
    """
    Compiles, trains and evaluates `model`.

    The model is trained with binary cross-entropy and Adam on a polynomially decaying learning
    rate. The test data doubles as validation data, and training stops early once `val_loss`
    has not improved for 10 epochs. TensorBoard logs are written to
    `logs/binary_classification/<log_base_directory>/run_<run_id>`. After training, the test
    predictions are handed to `eval_tools.calculate_metrics`.

    Args:
        log_base_directory: Sub-directory of the log root, typically one per model type.
        run_id: Identifier appended to the log directory name.
        model: Uncompiled Keras model to train.
        input_train: Sequence of input arrays; the first axis of `input_train[0]` is the
            sample axis.
        labels_train: Training labels.
        input_test: Test inputs, structured like `input_train`.
        labels_test: Test labels.
        sample_weights: Per-sample weights for the training data, or None for no weighting.
        batch_size: Mini-batch size.
        epochs: Maximum number of epochs.
        initial_lr: Learning rate at step 0.
        end_lr: Learning rate once the decay has finished.
    """
    log_dir = f"logs/binary_classification/{log_base_directory}/run_{run_id}"

    # Keras runs ceil(samples / batch_size) optimizer steps per epoch because the final partial
    # batch counts too. Rounding down ended the decay early and produced decay_steps == 0
    # whenever there were fewer samples than batch_size.
    steps_per_epoch = math.ceil(input_train[0].shape[0] / batch_size)
    decay_steps = steps_per_epoch * epochs

    lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=initial_lr,
        end_learning_rate=end_lr,
        decay_steps=decay_steps,
    )

    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(),
        optimizer=tf.optimizers.Adam(learning_rate=lr_schedule),
        metrics=["accuracy"],
    )

    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                         patience=10,
                                         mode='min'),
        tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
    ]

    # The sample weights were previously accepted but silently ignored; pass them on so the
    # weighting computed for the training data actually takes effect.
    model.fit(x=input_train,
              y=labels_train,
              sample_weight=sample_weights,
              shuffle=True,
              batch_size=batch_size,
              validation_data=(input_test, labels_test),
              epochs=epochs,
              callbacks=callbacks)

    predictions_test = model.predict(input_test)
    eval_tools.calculate_metrics(predictions_test, labels_test)


In [None]:
class Base_Dense_Classifier(Model):
    """
    A single-timestep dense classifier.

    Each of the five inputs (break-ins, date, weather, events, target cell) passes through its
    own Dense layer without activation. The results are concatenated and fed through three
    ReLU hidden layers, with dropout after the first two, into a 2-unit softmax output.
    """

    def __init__(self):
        super().__init__()
        # One input projection per input array.
        self.input_breakins = Dense(units=25)
        self.input_date = Dense(units=2)
        self.input_weather = Dense(units=11)
        self.input_events = Dense(units=9)
        self.input_target_cell = Dense(units=25)
        self.concat = Concatenate()
        # "relu" is the activation identifier used by every other model in this notebook;
        # the layer class name "ReLU" is not a portable activation string.
        self.hidden_layer_1 = Dense(units=47, activation="relu")
        self.dropout_layer_1 = Dropout(rate=0.8)
        self.hidden_layer_2 = Dense(units=25, activation="relu")
        self.dropout_layer_2 = Dropout(rate=0.5)
        self.hidden_layer_3 = Dense(units=10, activation="relu")
        self.output_layer = Dense(units=2, activation="softmax")

    def call(self, inputs):
        """
        Runs input data through the neural network.

        `inputs` is indexed positionally: 0 = break-ins, 1 = date, 2 = weather, 3 = events,
        4 = target cell. Returns the softmax output of the final layer.
        """
        input_breakins = self.input_breakins(inputs[0])
        input_date = self.input_date(inputs[1])
        input_weather = self.input_weather(inputs[2])
        input_events = self.input_events(inputs[3])
        input_targets = self.input_target_cell(inputs[4])
        data = self.concat([
            input_breakins, input_date, input_weather, input_events,
            input_targets
        ])
        data = self.hidden_layer_1(data)
        data = self.dropout_layer_1(data)
        data = self.hidden_layer_2(data)
        data = self.dropout_layer_2(data)
        data = self.hidden_layer_3(data)
        return self.output_layer(data)

In [None]:
def transform_to_single_step(inputs: list) -> list:
    """
    Turns a windowed dataset (multiple input timesteps) into single-timestep inputs.

    Args:
        inputs: List of ndarrays. Arrays with a time-window axis are expected to have the shape
            `(samples, timesteps, features)`.

    Returns:
        A new list in the same order. 2-D arrays are passed through as the same object; every
        other array is reduced to its last timestep, giving the shape `(samples, features)`.
    """
    return [
        # 2-D arrays (e.g. the target cell input) have no time-window axis and stay as they are.
        input_array if len(input_array.shape) == 2
        # Only keep the last timestep of each window, dropping the time axis.
        else input_array[:, -1, :].reshape(
            [input_array.shape[0], input_array.shape[-1]])
        for input_array in inputs
    ]

In [None]:
# Train and evaluate the single-timestep dense baseline. Its inputs are reduced to single
# timesteps by transform_to_single_step.
run_through_training_pipeline(
    log_base_directory="basic_dense",
    model=Base_Dense_Classifier(),
    run_id=run_id,
    input_train=transform_to_single_step(input_train),
    labels_train=labels_train,
    input_test=transform_to_single_step(input_test),
    labels_test=labels_test,
    sample_weights=sample_weights,
    batch_size=batch_size,
    epochs=epochs,
    initial_lr=initial_lr,
    end_lr=end_lr)


In [None]:
from tensorflow import expand_dims


class Multi_Input_Dense_Classifier(Model):
    """
    Dense classifier that sees every day of the break-in window at once.

    Each of the 7 days in the break-in window gets its own Dense(25) projection. These are
    merged with the projected target cell, date, weather and event inputs and passed through
    three ReLU hidden layers (dropout after the first two) into a 2-unit softmax that predicts
    whether there will be a break-in on the next day.
    """

    def __init__(self):
        super().__init__()
        # One projection per day of the break-in window.
        self.input_breakins_day_0 = Dense(units=25)
        self.input_breakins_day_1 = Dense(units=25)
        self.input_breakins_day_2 = Dense(units=25)
        self.input_breakins_day_3 = Dense(units=25)
        self.input_breakins_day_4 = Dense(units=25)
        self.input_breakins_day_5 = Dense(units=25)
        self.input_breakins_day_6 = Dense(units=25)

        # Projections for the remaining inputs.
        self.input_date = Dense(units=2)
        self.input_weather = Dense(units=11)
        self.input_events = Dense(units=9)
        self.input_target_cell = Dense(units=25)
        self.concatenate = Concatenate()

        # Classification head.
        self.hidden_layer_1 = Dense(units=221, activation="relu")
        self.dropout_layer_1 = Dropout(rate=0.8)
        self.hidden_layer_2 = Dense(units=50, activation="relu")
        self.dropout_layer_2 = Dropout(rate=0.5)
        self.hidden_layer_3 = Dense(units=27, activation="relu")
        self.output_layer = Dense(units=2, activation="softmax")

    def call(self, inputs):
        """
        Runs input data through the network and returns the softmax output.

        `inputs` is indexed positionally: 0 = break-in windows (day index on axis 1), 1 = date,
        2 = weather, 3 = events, 4 = target cell.
        """
        day_layers = (
            self.input_breakins_day_0,
            self.input_breakins_day_1,
            self.input_breakins_day_2,
            self.input_breakins_day_3,
            self.input_breakins_day_4,
            self.input_breakins_day_5,
            self.input_breakins_day_6,
        )
        # Slice day `day` out of the window and send it through its own layer.
        per_day = [
            layer(inputs[0][:, day]) for day, layer in enumerate(day_layers)
        ]

        date = self.input_date(inputs[1])
        weather = self.input_weather(inputs[2])
        events = self.input_events(inputs[3])
        target_cell = self.input_target_cell(inputs[4])

        # Merge the break-in projections with the target cell first, then add the rest.
        breakins = self.concatenate([*per_day, target_cell])
        merged = self.concatenate([breakins, date, weather, events])

        hidden = self.dropout_layer_1(self.hidden_layer_1(merged))
        hidden = self.dropout_layer_2(self.hidden_layer_2(hidden))
        hidden = self.hidden_layer_3(hidden)
        return self.output_layer(hidden)

In [None]:
# Train and evaluate the multi-timestep dense model on the full input windows.
run_through_training_pipeline(log_base_directory="multi_input_dense",
                              run_id=run_id,
                              model=Multi_Input_Dense_Classifier(),
                              input_train=input_train,
                              labels_train=labels_train,
                              input_test=input_test,
                              labels_test=labels_test,
                              sample_weights=sample_weights,
                              batch_size=batch_size,
                              epochs=epochs,
                              initial_lr=initial_lr,
                              end_lr=end_lr)

In [None]:
# Scratch check: stacking two (1, 25) tensors along axis 1 yields a tensor of shape (1, 2, 25).
test1 = tf.random.normal([1, 25])
test2 = tf.random.normal([1, 25])
test3 = tf.stack([test1, test2], axis=1)
test3.shape

In [None]:
class Conv1D_Classifier(Model):
    """
    Classifier that reads the break-in window through a 1D convolution.

    The flattened convolution output is merged with the projected date, weather, event and
    target cell inputs. Three ReLU hidden layers (dropout after the first two) and a 2-unit
    softmax then predict whether there will be a break-in on the next day.
    """

    def __init__(self):
        super().__init__()
        # NOTE(review): with the 7-step windows built in get_windowified_dataset, a kernel of
        # size 7 presumably fits exactly once, so the stride of 25 would have no effect —
        # confirm that this is intended.
        self.input_crimes = Conv1D(filters=50,
                                   kernel_size=7,
                                   strides=25,
                                   activation="relu")
        self.crime_flatten = Flatten()

        # Projections for the non-convolutional inputs.
        self.input_date = Dense(units=2)
        self.input_weather = Dense(units=11)
        self.input_events = Dense(units=9)
        self.input_target_cell = Dense(units=25)
        self.concatenate = Concatenate()

        # Classification head.
        self.hidden_layer_1 = Dense(units=100, activation="relu")
        self.dropout_layer_1 = Dropout(rate=0.8)
        self.hidden_layer_2 = Dense(units=50, activation="relu")
        self.dropout_layer_2 = Dropout(rate=0.5)
        self.hidden_layer_3 = Dense(units=25, activation="relu")
        self.output_layer = Dense(units=2, activation="softmax")

    def call(self, inputs):
        """
        Runs input data through the network and returns the softmax output.

        `inputs` is indexed positionally: 0 = break-in windows, 1 = date, 2 = weather,
        3 = events, 4 = target cell.
        """
        conv_features = self.crime_flatten(self.input_crimes(inputs[0]))
        date_features = self.input_date(inputs[1])
        weather_features = self.input_weather(inputs[2])
        event_features = self.input_events(inputs[3])
        target_features = self.input_target_cell(inputs[4])

        merged = self.concatenate([
            conv_features,
            date_features,
            weather_features,
            event_features,
            target_features,
        ])
        hidden = self.dropout_layer_1(self.hidden_layer_1(merged))
        hidden = self.dropout_layer_2(self.hidden_layer_2(hidden))
        hidden = self.hidden_layer_3(hidden)
        return self.output_layer(hidden)

In [None]:
# Train and evaluate the Conv1D model on the full input windows.
run_through_training_pipeline(log_base_directory="conv1d",
                              run_id=run_id,
                              model=Conv1D_Classifier(),
                              input_train=input_train,
                              labels_train=labels_train,
                              input_test=input_test,
                              labels_test=labels_test,
                              sample_weights=sample_weights,
                              batch_size=batch_size,
                              epochs=epochs,
                              initial_lr=initial_lr,
                              end_lr=end_lr)

In [None]:
class Conv2D_Classifier(Model):
    """
    Classifier that reads the break-in window through a 2D convolution.

    The convolution runs in "channels_first" layout, so `inputs[0]` must carry its channel axis
    directly after the batch axis. The flattened convolution output is merged with the
    projected date, weather, event and target cell inputs. Three ReLU hidden layers (dropout
    after the first two) and a 2-unit softmax then predict whether there will be a break-in on
    the next day.
    """

    def __init__(self):
        super().__init__()
        # No `input_shape` here: the former value (7, 25) lacked the channel axis and had no
        # effect in a subclassed model, where the layer is built from the first tensor it sees.
        self.input_crimes = Conv2D(data_format="channels_first",
                                   strides=1,
                                   filters=32,
                                   activation="relu",
                                   kernel_size=(3, 3),
                                   name="input_crimes")
        self.crime_flatten = Flatten(name="flatten")

        # Projections for the non-convolutional inputs.
        self.input_date = Dense(units=2)
        self.input_weather = Dense(units=11)
        self.input_events = Dense(units=9)
        self.input_target_cell = Dense(units=25)
        self.concatenate = Concatenate()

        # Classification head.
        self.hidden_layer_1 = Dense(units=100, activation="relu", name="hl1")
        self.dropout_layer_1 = Dropout(rate=0.8, name="dropout")
        self.hidden_layer_2 = Dense(units=50, activation="relu", name="hl2")
        self.dropout_layer_2 = Dropout(rate=0.5, name="dropout2")
        self.hidden_layer_3 = Dense(units=25, activation="relu")
        self.output_layer = Dense(units=2, activation="softmax", name="output")

    def call(self, inputs):
        """
        Runs input data through the network and returns the softmax output.

        `inputs` is indexed positionally: 0 = break-in windows with a channel axis at
        position 1, 1 = date, 2 = weather, 3 = events, 4 = target cell.
        """
        breakins = self.input_crimes(inputs[0])
        breakins = self.crime_flatten(breakins)

        date = self.input_date(inputs[1])
        weather = self.input_weather(inputs[2])
        events = self.input_events(inputs[3])
        target_cell = self.input_target_cell(inputs[4])

        x = self.concatenate([breakins, date, weather, events, target_cell])
        x = self.hidden_layer_1(x)
        x = self.dropout_layer_1(x)
        x = self.hidden_layer_2(x)
        x = self.dropout_layer_2(x)
        x = self.hidden_layer_3(x)
        return self.output_layer(x)

In [None]:
# Conv2D_Classifier convolves with data_format="channels_first" (not the Keras default), so a
# single-channel axis is inserted right after the batch axis of the break-in windows:
# (samples, d1, d2) -> (samples, 1, d1, d2). All other inputs are passed through unchanged.
train_shape = input_train[0].shape
test_shape = input_test[0].shape
conv_input_train = (
    input_train[0].reshape(
        [train_shape[0], 1, train_shape[1], train_shape[2]]),
    input_train[1],
    input_train[2],
    input_train[3],
    input_train[4],
)
conv_input_test = (
    input_test[0].reshape(
        [test_shape[0], 1, test_shape[1], test_shape[2]]),
    input_test[1],
    input_test[2],
    input_test[3],
    input_test[4],
)

# Train and evaluate the Conv2D model on the reshaped inputs.
run_through_training_pipeline(log_base_directory="conv2d",
                              run_id=run_id,
                              model=Conv2D_Classifier(),
                              input_train=conv_input_train,
                              labels_train=labels_train,
                              input_test=conv_input_test,
                              labels_test=labels_test,
                              sample_weights=sample_weights,
                              batch_size=batch_size,
                              epochs=epochs,
                              initial_lr=initial_lr,
                              end_lr=end_lr)