In [219]:
# Imports
import pandas as pd
import numpy as np

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

In [2]:
# Load the regression dataset from GitHub; the first CSV column is used as the row index.
df = pd.read_csv(
    'https://raw.githubusercontent.com/Finn-Anderson/BscHons-Data-Analytics/main/collinearregressiondata.csv',
    index_col = 0,
)

In [3]:
# Show the loaded frame as plain text (pandas abbreviates the middle rows of a long frame).
print(df)

      day  temp  wdsp  fog  NUM_COLLISIONS
3092    3  40.3  11.4    0        0.340174
2810    4  39.6  13.0    0        0.948172
1776    5  45.8  10.3    0        1.338380
341     6  45.4   5.7    1        0.403696
4034    7  40.1  17.0    1       -0.158930
...   ...   ...   ...  ...             ...
20      3  45.9   4.0    1       -0.955761
3210    4  52.1  10.9    1        0.264004
3573    5  45.5  10.5    0       -0.048756
2694    6  45.2   9.4    1       -1.143417
1456    7  38.8   8.9    0       -2.457010

[1461 rows x 5 columns]


Compute the training and test set sizes, and shuffle the rows that are later used as input for model predictions.

In [153]:
# 90% of the rows (rounded down) count as training rows; the remainder is the test size.
trainsize = int(len(df) * 0.9)
testsize = len(df) - trainsize

In [200]:
# Shuffle every row and renumber the index from 0.
# A fixed random_state makes the permutation repeatable across kernel restarts
# (the same seed value is used for the train/test split in getMeanAbsoluteError).
shuffledCollisions = df.sample(frac = 1, random_state = 0).reset_index(drop = True)

In [209]:
def shuffle(value):
    """Return the first ``testsize`` entries of column ``value``.

    The rows come from ``shuffledCollisions`` after it has been sorted in
    ascending order of ``NUM_COLLISIONS``.

    NOTE(review): sorting by the target undoes the earlier random shuffle, so
    the rows returned are the ones with the smallest NUM_COLLISIONS values
    rather than a random sample - confirm that this is intended.
    """
    ordered_rows = shuffledCollisions.sort_values(by=["NUM_COLLISIONS"])
    selected_column = ordered_rows[value]
    return selected_column.head(testsize)

The function below trains a linear model on the supplied columns and returns the fitted model together with its mean absolute error on a held-out test set.

In [4]:
# Divisor applied to the NUM_COLLISIONS labels before training; model predictions are
# multiplied by the same value afterwards. A value of 1.0 leaves the labels unchanged.
SCALE_NUM_COLLISIONS = 1.0

In [81]:
def getMeanAbsoluteError(df_input, label_column = "NUM_COLLISIONS", epochs = 100, learning_rate = 0.1):
    """Fit a one-unit linear model and measure its mean absolute error.

    The rows of ``df_input`` are split 80/20 into a training and a test set
    (``random_state = 0`` keeps the split repeatable). The features are passed
    through a Keras ``Normalization`` layer adapted on the training rows and
    then through a single ``Dense`` unit, trained with Adam against the mean
    absolute error loss.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Feature columns plus the label column.
    label_column : str, optional
        Name of the label column. Defaults to ``"NUM_COLLISIONS"``.
    epochs : int, optional
        Number of training epochs. Defaults to 100.
    learning_rate : float, optional
        Learning rate for the Adam optimiser. Defaults to 0.1.

    Returns
    -------
    dict
        ``{"model": <fitted keras model>, "error": <loss on the test rows>}``.
        The error is expressed in scaled label units, i.e. after the labels
        have been divided by ``SCALE_NUM_COLLISIONS``.

    Raises
    ------
    KeyError
        If ``label_column`` is not a column of ``df_input``.

    Notes
    -----
    No TensorFlow seed is set here, so weight initialisation (and therefore
    the reported error) can presumably vary slightly between runs.
    """
    # Repeatable 80/20 split: the test set is every row not sampled for training.
    training_set = df_input.sample(frac = 0.8, random_state = 0)
    test_set = df_input.drop(training_set.index)

    # Work on copies so that popping the label does not alter the split frames.
    training_features = training_set.copy()
    test_features = test_set.copy()

    training_labels = training_features.pop(label_column) / SCALE_NUM_COLLISIONS
    test_labels = test_features.pop(label_column) / SCALE_NUM_COLLISIONS

    # Decide on the number of *feature* columns (the label has been removed):
    # several features are normalised along the last axis, a single feature
    # is normalised with scalar statistics.
    if training_features.shape[1] > 1:
        normaliser = tf.keras.layers.Normalization(axis = -1)
    else:
        normaliser = tf.keras.layers.Normalization(input_shape = [1,], axis = None)

    # Normalisation statistics come from the training rows only.
    normaliser.adapt(np.array(training_features))

    model = tf.keras.Sequential([
        normaliser,
        layers.Dense(units = 1)
    ])

    model.compile(
        optimizer = tf.optimizers.Adam(learning_rate = learning_rate),
        loss = 'mean_absolute_error')

    # 20% of the training rows are held back by Keras for validation; the
    # training history is not needed here, so the return value is discarded.
    model.fit(
        training_features,
        training_labels,
        epochs = epochs,
        verbose = 0,
        validation_split = 0.2)

    # With a single loss and no extra metrics, evaluate() returns the loss value.
    mean_absolute_error = model.evaluate(
        test_features,
        test_labels, verbose = 0)

    return {"model": model, "error": mean_absolute_error}

# Day and NUM_COLLISIONS

In [72]:
# Model input: the day feature together with the NUM_COLLISIONS target.
df_input_headers_day = ["day", "NUM_COLLISIONS"]

df_input_data_day = [df[header] for header in df_input_headers_day]

df_input_day = pd.concat(df_input_data_day, axis = 1, keys = df_input_headers_day)

In [82]:
# Train the day-only model; the result holds the fitted model and its test-set error.
results_day = getMeanAbsoluteError(df_input_day)

In [87]:
# Mean absolute error of the day-only model on its held-out test rows.
print(results_day["error"])

0.7597689032554626


In [234]:
# Prediction input for the day-only model: one column taken via shuffle().
input_day = pd.DataFrame.from_dict(
    data = {column: shuffle(column) for column in ("day",)})

In [235]:
# Predict with the day-only model and convert back to unscaled collision values.
# Stored under a day-specific name: the previous name, linear_day_predictions_all,
# is also assigned by the all-features cell, so the two results overwrote each other.
linear_day_predictions_day = results_day["model"].predict(input_day) * SCALE_NUM_COLLISIONS
print(linear_day_predictions_day)

[[-0.08983642]
 [ 0.114518  ]
 [ 0.114518  ]
 [-0.08983642]
 [ 0.01234079]
 [ 0.114518  ]
 [-0.08983642]
 [ 0.08045893]
 [ 0.114518  ]
 [ 0.114518  ]
 [ 0.114518  ]
 [-0.08983642]
 [ 0.114518  ]
 [-0.05577735]
 [ 0.08045893]
 [ 0.114518  ]
 [-0.08983642]
 [ 0.01234079]
 [-0.02171828]
 [-0.08983642]
 [ 0.114518  ]
 [ 0.114518  ]
 [-0.08983642]
 [ 0.04639985]
 [ 0.01234079]
 [-0.05577735]
 [-0.08983642]
 [ 0.114518  ]
 [-0.08983642]
 [-0.08983642]
 [ 0.114518  ]
 [ 0.08045893]
 [ 0.01234079]
 [ 0.114518  ]
 [ 0.114518  ]
 [ 0.114518  ]
 [-0.05577735]
 [ 0.114518  ]
 [ 0.01234079]
 [ 0.114518  ]
 [-0.02171828]
 [ 0.114518  ]
 [ 0.114518  ]
 [-0.08983642]
 [ 0.01234079]
 [ 0.01234079]
 [ 0.01234079]
 [ 0.114518  ]
 [ 0.114518  ]
 [-0.02171828]
 [ 0.08045893]
 [ 0.04639985]
 [-0.05577735]
 [ 0.114518  ]
 [ 0.114518  ]
 [-0.05577735]
 [ 0.114518  ]
 [ 0.04639985]
 [ 0.08045893]
 [-0.05577735]
 [ 0.01234079]
 [ 0.08045893]
 [ 0.08045893]
 [-0.02171828]
 [-0.08983642]
 [ 0.08045893]
 [-0.02171

# Temp, Wind Speed and NUM_COLLISIONS

In [242]:
# Model input: temperature and wind speed together with the NUM_COLLISIONS target.
df_input_headers_tw = ["temp", "wdsp", "NUM_COLLISIONS"]

df_input_data_tw = [df[header] for header in df_input_headers_tw]

df_input_tw = pd.concat(df_input_data_tw, axis = 1, keys = df_input_headers_tw)

In [243]:
# Train the temp + wind speed model; the result holds the fitted model and its test-set error.
results_tw = getMeanAbsoluteError(df_input_tw)

In [246]:
# Mean absolute error of the temp + wind speed model on its held-out test rows.
print(results_tw["error"])

0.7493680119514465


In [247]:
# Prediction input for the temp + wind speed model, one column per feature via shuffle().
input_tw = pd.DataFrame.from_dict(
    data = {column: shuffle(column) for column in ("temp", "wdsp")})

In [248]:
# Predict with the temp + wind speed model and convert back to unscaled collision values.
linear_day_predictions_tw = SCALE_NUM_COLLISIONS * results_tw["model"].predict(input_tw)
print(linear_day_predictions_tw)

[[-0.19407144]
 [-0.142127  ]
 [ 0.05806036]
 [-0.23859891]
 [-0.2273843 ]
 [-0.04369778]
 [ 0.10598078]
 [-0.2296282 ]
 [-0.20810688]
 [ 0.03365911]
 [-0.12001393]
 [ 0.05228083]
 [-0.20136046]
 [-0.25452274]
 [-0.2561946 ]
 [-0.18326649]
 [ 0.42237142]
 [-0.14943072]
 [-0.33361056]
 [ 0.08566261]
 [ 0.4981401 ]
 [ 0.27940482]
 [ 0.19684044]
 [-0.03237984]
 [ 0.03702249]
 [-0.14823255]
 [-0.07196559]
 [-0.10748304]
 [-0.22168845]
 [-0.04216371]
 [ 0.26024556]
 [-0.3713522 ]
 [ 0.00355213]
 [-0.28217918]
 [-0.2804827 ]
 [ 0.07051251]
 [ 0.07208107]
 [ 0.09806065]
 [-0.04713991]
 [-0.03587611]
 [-0.29401994]
 [-0.38661546]
 [-0.11722257]
 [-0.17372373]
 [-0.03529914]
 [ 0.07321038]
 [-0.08664197]
 [-0.04765785]
 [-0.10405072]
 [ 0.11505974]
 [ 0.15284565]
 [ 0.28112096]
 [ 0.08730004]
 [-0.32564127]
 [-0.27932876]
 [-0.20817089]
 [-0.21324056]
 [-0.05509941]
 [ 0.12348792]
 [-0.04600563]
 [-0.2426672 ]
 [-0.02394674]
 [-0.15895382]
 [-0.32567078]
 [-0.03358298]
 [ 0.07661318]
 [-0.21327

# Temp, Fog and NUM_COLLISIONS

In [249]:
# Model input: temperature and fog together with the NUM_COLLISIONS target.
df_input_headers_tf = ["temp", "fog", "NUM_COLLISIONS"]

df_input_data_tf = [df[header] for header in df_input_headers_tf]

df_input_tf = pd.concat(df_input_data_tf, axis = 1, keys = df_input_headers_tf)

In [250]:
# Train the temp + fog model; the result holds the fitted model and its test-set error.
results_tf = getMeanAbsoluteError(df_input_tf)

In [251]:
# Mean absolute error of the temp + fog model on its held-out test rows.
print(results_tf["error"])

0.7204829454421997


In [252]:
# Prediction input for the temp + fog model, one column per feature via shuffle().
input_tf = pd.DataFrame.from_dict(
    data = {column: shuffle(column) for column in ("temp", "fog")})

In [253]:
# Predict with the temp + fog model and convert back to unscaled collision values.
linear_day_predictions_tf = SCALE_NUM_COLLISIONS * results_tf["model"].predict(input_tf)
print(linear_day_predictions_tf)

[[-0.20722173]
 [ 0.0929547 ]
 [-0.58929557]
 [-0.5189135 ]
 [-0.2735819 ]
 [-0.43244419]
 [-0.28564736]
 [-0.31983292]
 [-0.4646188 ]
 [-0.09406033]
 [-0.3218439 ]
 [-0.0256892 ]
 [-0.36809492]
 [-0.02971106]
 [-0.2896692 ]
 [-0.40630224]
 [-0.22934176]
 [-0.21928717]
 [-0.26553822]
 [-0.15639867]
 [-0.39021492]
 [ 0.01251811]
 [ 0.02403343]
 [-0.48271704]
 [-0.21124357]
 [-0.41434592]
 [-0.3841822 ]
 [-0.6094047 ]
 [-0.40429133]
 [ 0.02805528]
 [-0.32787663]
 [-0.59331733]
 [ 0.17284107]
 [-0.30776745]
 [-0.3117893 ]
 [-0.65163386]
 [-0.09607121]
 [-0.2574946 ]
 [-0.09003849]
 [ 0.01452908]
 [-0.37814948]
 [-0.5711973 ]
 [-0.22531992]
 [-0.22733086]
 [-0.21325445]
 [-0.2876583 ]
 [-0.42440048]
 [-0.41836774]
 [-0.73005956]
 [-0.39624766]
 [-0.00155822]
 [-0.2414073 ]
 [-0.18510163]
 [-0.43445507]
 [-0.38016036]
 [-0.2816256 ]
 [-0.32586566]
 [ 0.05072552]
 [-0.26151645]
 [-0.29570192]
 [-0.19515619]
 [-0.36206216]
 [-0.4545642 ]
 [-0.34999666]
 [-0.2735819 ]
 [-0.32385477]
 [-0.21325

# Wind Speed, Fog and NUM_COLLISIONS

In [254]:
# Model input: wind speed and fog together with the NUM_COLLISIONS target.
df_input_headers_wf = ["wdsp", "fog", "NUM_COLLISIONS"]

df_input_data_wf = [df[header] for header in df_input_headers_wf]

df_input_wf = pd.concat(df_input_data_wf, axis = 1, keys = df_input_headers_wf)

In [255]:
# Train the wind speed + fog model; the result holds the fitted model and its test-set error.
results_wf = getMeanAbsoluteError(df_input_wf)

In [256]:
# Mean absolute error of the wind speed + fog model on its held-out test rows.
print(results_wf["error"])

0.7846194505691528


In [257]:
# Prediction input for the wind speed + fog model, one column per feature via shuffle().
input_wf = pd.DataFrame.from_dict(
    data = {column: shuffle(column) for column in ("wdsp", "fog")})

In [258]:
# Predict with the wind speed + fog model and convert back to unscaled collision values.
linear_day_predictions_wf = SCALE_NUM_COLLISIONS * results_wf["model"].predict(input_wf)
print(linear_day_predictions_wf)

[[ 0.29342547]
 [ 0.0315648 ]
 [-0.4590628 ]
 [ 0.08298384]
 [ 0.28704846]
 [-0.15296586]
 [-0.26137516]
 [ 0.24878637]
 [ 0.08298384]
 [-0.4212036 ]
 [ 0.07022981]
 [-0.38931847]
 [ 0.15950812]
 [ 0.10171199]
 [ 0.31893355]
 [ 0.09573793]
 [-0.7205206 ]
 [ 0.21052425]
 [ 0.46560502]
 [-0.561498  ]
 [-0.9883553 ]
 [-0.7209235 ]
 [-0.12745775]
 [-0.21673606]
 [-0.08281863]
 [ 0.03196774]
 [-0.06368755]
 [-0.21035899]
 [ 0.15950812]
 [ 0.26154038]
 [-0.54834104]
 [ 0.22965528]
 [ 0.31893355]
 [ 0.34444165]
 [ 0.33806464]
 [-0.53558695]
 [-0.4849738 ]
 [-0.22311305]
 [-0.28728622]
 [-0.21076192]
 [ 0.2998025 ]
 [ 0.27429444]
 [ 0.15313108]
 [ 0.24240936]
 [ 0.03196774]
 [-0.21035899]
 [-0.07644165]
 [-0.1338348 ]
 [-0.3251454 ]
 [-0.3761615 ]
 [-0.5296129 ]
 [-0.50370187]
 [-0.14021184]
 [ 0.2998025 ]
 [ 0.27429444]
 [ 0.24878637]
 [ 0.21690129]
 [-0.14699173]
 [-0.26775223]
 [-0.02542548]
 [ 0.38270375]
 [-0.12108077]
 [ 0.01283666]
 [ 0.37632674]
 [-0.02542548]
 [-0.24862112]
 [ 0.31893

# Day, Temp, Wind Speed, Fog and NUM_COLLISIONS

In [74]:
# Model input: all four features together with the NUM_COLLISIONS target.
df_input_headers = ["day", "temp", "wdsp", "fog", "NUM_COLLISIONS"]

df_input_data = [df[header] for header in df_input_headers]

df_input_all = pd.concat(df_input_data, axis = 1, keys = df_input_headers)

In [173]:
# Train the model on all four features; the result holds the fitted model and its test-set error.
results_all = getMeanAbsoluteError(df_input_all)

In [89]:
# Mean absolute error of the all-features model on its held-out test rows.
print(results_all["error"])

0.7472981810569763


In [221]:
# Prediction input for the all-features model, one column per feature via shuffle().
input_all = pd.DataFrame.from_dict(
    data = {column: shuffle(column) for column in ("day", "temp", "wdsp", "fog")})

In [233]:
# Predict with the all-features model and convert back to unscaled collision values.
linear_day_predictions_all = SCALE_NUM_COLLISIONS * results_all["model"].predict(input_all)
print(linear_day_predictions_all)

[[-0.14283702]
 [-0.22476715]
 [-0.61715895]
 [-0.3953166 ]
 [-0.30834636]
 [-0.5106648 ]
 [-0.15482432]
 [-0.41696137]
 [-0.56319004]
 [-0.34272313]
 [-0.43632847]
 [-0.07025591]
 [-0.48616898]
 [-0.16015756]
 [-0.3976404 ]
 [-0.5132181 ]
 [-0.05815699]
 [-0.25274232]
 [-0.28367424]
 [-0.16751492]
 [-0.38773572]
 [-0.21822041]
 [ 0.10374591]
 [-0.47645116]
 [-0.21554287]
 [-0.3340603 ]
 [-0.26177514]
 [-0.660384  ]
 [-0.30238056]
 [ 0.0673327 ]
 [-0.3781066 ]
 [-0.6554901 ]
 [ 0.08094925]
 [-0.45211154]
 [-0.45499325]
 [-0.66411823]
 [-0.15826029]
 [-0.34961563]
 [-0.24513052]
 [-0.26884532]
 [-0.36567318]
 [-0.67655927]
 [-0.35996228]
 [-0.15528104]
 [-0.22909968]
 [-0.26964122]
 [-0.40364122]
 [-0.5002512 ]
 [-0.75469553]
 [-0.31216705]
 [-0.21430978]
 [-0.23477983]
 [-0.1147874 ]
 [-0.5589322 ]
 [-0.50856745]
 [-0.2396174 ]
 [-0.45492816]
 [-0.17169169]
 [-0.3126315 ]
 [-0.22383428]
 [-0.24920502]
 [-0.41611123]
 [-0.5112079 ]
 [-0.34877545]
 [-0.16844623]
 [-0.36941457]
 [-0.22263