In [1]:
# Imports
import pandas as pd
import numpy as np

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

In [2]:
# Load the collision data set straight from GitHub; the first CSV column
# is used as the row index.
df = pd.read_csv(
    'https://raw.githubusercontent.com/Finn-Anderson/BscHons-Data-Analytics/main/coldeepneuralnetworkdata.csv',
    index_col=0,
)

In [3]:
# Preview the loaded frame (shown via the cell's last-expression display).
df

Unnamed: 0,Apr,Aug,Dec,Feb,Jan,Jul,Jun,Mar,May,Nov,...,Sat,Sun,Thu,Tue,Wed,year,temp,wdsp,fog,NUM_COLLISIONS
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,2020,40.3,11.4,0,0.340174
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,2020,39.6,13.0,0,0.948172
3,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,2020,45.8,10.3,0,1.338380
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,2020,45.4,5.7,1,0.403696
5,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,2020,40.1,17.0,1,-0.158930
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1457,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,2023,45.9,4.0,1,-0.955761
1458,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,2023,52.1,10.9,1,0.264004
1459,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,2023,45.5,10.5,0,-0.048756
1460,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,2023,45.2,9.4,1,-1.143417


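Compute the training and test set sizes, shuffle the data set, and define a helper that selects the rows later used as inputs for the model's predictions.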

In [16]:
# 90 % of the rows (rounded down) count as training rows ...
trainsize = int(0.9 * df["NUM_COLLISIONS"].size)
# ... and whatever is left over is the test size.
testsize = df["NUM_COLLISIONS"].size - trainsize

In [17]:
# Shuffle every row once and renumber the index from 0.
# The fixed seed makes the shuffled order reproducible after a kernel restart;
# it is the same seed value the train/test split in getMeanAbsoluteError uses.
shuffledCollisions = df.sample(frac = 1, random_state = 0).reset_index(drop = True)

In [18]:
def shuffle(value):
    """Return column ``value`` for the first ``testsize`` rows of the sorted frame.

    The rows are taken from the module-level ``shuffledCollisions`` frame
    after sorting it in ascending order of ``NUM_COLLISIONS``.

    NOTE(review): because the frame is sorted by the label before ``head`` is
    applied, the rows returned are the ones with the smallest
    ``NUM_COLLISIONS`` values, not a random subset of the shuffled frame.
    Confirm that this is the intended selection.

    Parameters
    ----------
    value : str
        Name of the column to extract.

    Returns
    -------
    pandas.Series
        The requested column, limited to ``testsize`` rows.
    """
    ordered_rows = shuffledCollisions.sort_values(by=["NUM_COLLISIONS"])
    selected_column = ordered_rows[value]
    return selected_column.head(testsize)

The function below trains a deep neural network on the supplied data and returns the trained model together with its mean absolute error on a held-out test set.

In [10]:
# Factor the NUM_COLLISIONS labels are divided by before training and that
# predictions are multiplied by afterwards; 1.0 leaves the values unchanged.
SCALE_NUM_COLLISIONS = 1.0

In [12]:
def getMeanAbsoluteError(df_input, epochs = 100, train_fraction = 0.8,
                         validation_split = 0.2, random_state = 0):
    """Train a small dense network on ``df_input`` and score it on held-out rows.

    The frame is split into a training part and a test part, the
    ``NUM_COLLISIONS`` column is separated out as the label (divided by
    ``SCALE_NUM_COLLISIONS``), and the remaining columns are used as features.
    A Keras ``Normalization`` layer is adapted to the training features and
    followed by two 48-unit ReLU layers and a single-unit output layer. The
    model is compiled with a mean-absolute-error loss and the Adam optimiser.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Feature columns plus a ``NUM_COLLISIONS`` label column.
    epochs : int, default 100
        Number of training epochs passed to ``fit``.
    train_fraction : float, default 0.8
        Fraction of rows sampled into the training set; the rest form the
        test set.
    validation_split : float, default 0.2
        Fraction of the training rows Keras holds back for validation.
    random_state : int, default 0
        Seed for the train/test sampling. It only fixes the split; the
        network's weight initialisation is not seeded here.

    Returns
    -------
    dict
        ``"model"``: the fitted Keras model.
        ``"error"``: the value returned by ``evaluate`` on the test rows
        (the mean-absolute-error loss).
        ``"history"``: the object returned by ``fit``, so training and
        validation curves can be inspected.

    Raises
    ------
    KeyError
        If ``df_input`` has no ``NUM_COLLISIONS`` column.
    """
    # Fail early with a clear message instead of a bare KeyError from pop().
    if "NUM_COLLISIONS" not in df_input.columns:
        raise KeyError('df_input must contain a "NUM_COLLISIONS" label column')

    # Seeded sampling so the same rows land in the training set on every run.
    training_set = df_input.sample(frac = train_fraction, random_state = random_state)
    test_set = df_input.drop(training_set.index)

    # Work on copies so popping the label does not touch the caller's frame.
    training_features = training_set.copy()
    test_features = test_set.copy()

    training_labels = training_features.pop("NUM_COLLISIONS") / SCALE_NUM_COLLISIONS
    test_labels = test_features.pop("NUM_COLLISIONS") / SCALE_NUM_COLLISIONS

    # More than one feature column: normalise each column separately.
    # A single feature column: normalise it as one scalar input.
    if training_features.columns.size > 1:
        normaliser = tf.keras.layers.Normalization(axis = -1)
    else:
        normaliser = tf.keras.layers.Normalization(input_shape = [1,], axis = None)

    normaliser.adapt(np.array(training_features))

    dnn_model = keras.Sequential([
        normaliser,
        layers.Dense(48, activation='relu'),
        layers.Dense(48, activation='relu'),
        layers.Dense(1)
    ])

    dnn_model.compile(loss='mean_absolute_error',
                      optimizer=tf.keras.optimizers.Adam(0.001))

    history = dnn_model.fit(
        training_features,
        training_labels,
        epochs = epochs,
        verbose = 0,
        validation_split = validation_split)

    mean_absolute_error = dnn_model.evaluate(
        test_features,
        test_labels, verbose = 0)

    return {"model": dnn_model, "error": mean_absolute_error, "history": history}

# All Data

In [13]:
# Columns used for the "all data" model: every feature followed by the
# NUM_COLLISIONS label, in the order the resulting frame will have.
headers = [
    "year", "temp", "wdsp",
    "Sat", "Sun", "Mon", "Tue", "Wed", "Thu", "Fri",
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    "NUM_COLLISIONS",
]
# One Series per header, pulled from df in that same order.
dnn_input_data = [df[column_name] for column_name in headers]
df_dnn_input = pd.concat(dnn_input_data, axis = 1, keys = headers)

df_dnn_input.head()

Unnamed: 0,year,temp,wdsp,Sat,Sun,Mon,Tue,Wed,Thu,Fri,...,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,NUM_COLLISIONS
1,2020,40.3,11.4,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0.340174
2,2020,39.6,13.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0.948172
3,2020,45.8,10.3,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1.33838
4,2020,45.4,5.7,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0.403696
5,2020,40.1,17.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.15893


In [14]:
# Train and evaluate the network on the full feature set; the returned dict
# holds the fitted model and its test error.
results_dnn = getMeanAbsoluteError(df_dnn_input)

In [15]:
# Loss reported by evaluate() on the held-out test rows (the model is
# compiled with a mean-absolute-error loss).
print(results_dnn["error"])

0.6763582825660706


In [20]:
# Build the prediction inputs column by column via shuffle(); the names are
# listed in the same order as the feature columns of df_dnn_input.
input_data = pd.DataFrame.from_dict(data = {
    column_name: shuffle(column_name)
    for column_name in (
        "year", "temp", "wdsp",
        "Sat", "Sun", "Mon", "Tue", "Wed", "Thu", "Fri",
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    )
})

In [23]:
# Predict with the trained network and multiply by SCALE_NUM_COLLISIONS,
# the same factor the labels were divided by before training.
dnn_predictions = results_dnn["model"].predict(input_data) * SCALE_NUM_COLLISIONS
dnn_predictions



array([[-4.02199358e-01],
       [-1.40969491e+00],
       [-1.88335788e+00],
       [-9.70102727e-01],
       [-1.95739508e+00],
       [-1.34806943e+00],
       [-6.95010275e-03],
       [-7.28137374e-01],
       [-1.72501206e+00],
       [-9.06922758e-01],
       [-1.53608143e+00],
       [-1.37757087e+00],
       [ 5.68162575e-02],
       [-1.09419882e+00],
       [-1.00736630e+00],
       [-1.84698737e+00],
       [-2.29148245e+00],
       [-5.56222558e-01],
       [-1.91297448e+00],
       [-2.12280536e+00],
       [-2.31393671e+00],
       [-2.03780818e+00],
       [ 2.96458274e-01],
       [ 9.21587765e-01],
       [ 1.74014598e-01],
       [-4.03897583e-01],
       [-1.41765773e+00],
       [-1.32314444e+00],
       [-2.02404857e+00],
       [ 1.46066666e-01],
       [-1.54757452e+00],
       [-2.17994976e+00],
       [-1.64153457e-01],
       [-2.03958964e+00],
       [-1.41944206e+00],
       [-2.04667568e+00],
       [-1.12324513e-01],
       [-2.00364566e+00],
       [-1.8