# Join the two datasets

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import os

#### Load datasets:

In [2]:
# Load the two pre-processed inputs. Both frames expose a 'date' column,
# which is the join key used further down.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load pickle files that were produced by this project.
df_hourly_weather = pd.read_pickle('datasets/hourly_weather.pkl')
df_accidents = pd.read_pickle("datasets/accidents.pkl")

In [3]:
# List the weather columns available for the join.
df_hourly_weather.columns

Index(['date', 'air_temperature', 'water_temperature', 'wind_gust_max_10min',
       'wind_speed_avg_10min', 'wind_force_avg_10min', 'wind_direction',
       'windchill', 'barometric_pressure_qfe', 'precipitation', 'dew_point',
       'global_radiation', 'humidity', 'water_level'],
      dtype='object')

In [4]:
# List the accident columns available for the join.
df_accidents.columns

Index(['date', 'AccidentType', 'AccidentSeverityCategory',
       'AccidentInvolvingPedestrian', 'AccidentInvolvingBicycle',
       'AccidentInvolvingMotorcycle', 'RoadType', 'AccidentLocation_CHLV95_E',
       'AccidentLocation_CHLV95_N', 'Month', 'WeekDay', 'Hour'],
      dtype='object')

**Notes**
<br>
***a) Neural Network***:
<br> 
Categorical (object-typed) inputs need to be one-hot encoded (*normalization will be done when splitting the data into train and test sets*).
<br>
***b) Random Forest***:
<br>
Only one-hot encoding is needed; normalization is not necessary.
<br><br>
**A: Prediction for accident yes/no**:
<br>
Join accident data on weather data.
<br>
Use only weather data to predict whether an accident happens or not.
<br><br>
**B: Prediction for multi-level**:
<br>
Join weather data on accident data.
<br> 
Could predict the probability of a given type of accident (AccidentSeverity and AccidentType) based on weather data (maybe add location)


### A: Merge and fill nans of date columns:

In [5]:
# Left join on the timestamp: every hourly weather row is kept, and hours
# without a matching accident get NaN in the accident columns.
df_join = df_hourly_weather.merge(df_accidents, how='left', on='date')
df_join.shape

(92081, 25)

In [6]:
# Check the column dtypes after the merge.
df_join.dtypes

date                           datetime64[ns]
air_temperature                       float64
water_temperature                     float64
wind_gust_max_10min                   float64
wind_speed_avg_10min                  float64
wind_force_avg_10min                  float64
wind_direction                         object
windchill                             float64
barometric_pressure_qfe               float64
precipitation                         float64
dew_point                             float64
global_radiation                      float64
humidity                              float64
water_level                           float64
AccidentType                           object
AccidentSeverityCategory               object
AccidentInvolvingPedestrian           float64
AccidentInvolvingBicycle              float64
AccidentInvolvingMotorcycle           float64
RoadType                               object
AccidentLocation_CHLV95_E             float64
AccidentLocation_CHLV95_N         

Fill Month, WeekDay and Hour for weather data:

In [7]:
# Rows that come only from the weather table have NaN in the calendar columns
# supplied by the accident table, so rebuild them from the timestamp.
# The filled Series is assigned back on purpose: calling
# `df_join[col].fillna(..., inplace=True)` operates on a temporary Series,
# which triggers a FutureWarning in recent pandas and leaves df_join unchanged
# once Copy-on-Write is enabled.
df_join['Month'] = df_join['Month'].fillna(df_join['date'].dt.month)
df_join['WeekDay'] = df_join['WeekDay'].fillna(df_join['date'].dt.day_name())
df_join['Hour'] = df_join['Hour'].fillna(df_join['date'].dt.hour)
df_join.tail()

Unnamed: 0,date,air_temperature,water_temperature,wind_gust_max_10min,wind_speed_avg_10min,wind_force_avg_10min,wind_direction,windchill,barometric_pressure_qfe,precipitation,...,AccidentSeverityCategory,AccidentInvolvingPedestrian,AccidentInvolvingBicycle,AccidentInvolvingMotorcycle,RoadType,AccidentLocation_CHLV95_E,AccidentLocation_CHLV95_N,Month,WeekDay,Hour
92076,2019-12-31 18:30:00,1.56,6.8,2.3,1.24,1.0,734,1.54,984.54,0.0,...,as4,0.0,0.0,0.0,rt432,2682276.0,1247052.0,12.0,Tuesday,18.0
92077,2019-12-31 19:30:00,1.34,6.8,2.6,1.44,1.2,963,1.18,984.62,0.0,...,as4,0.0,0.0,0.0,rt433,2683004.0,1247184.0,12.0,Tuesday,19.0
92078,2019-12-31 20:30:00,1.04,6.8,2.8,1.66,1.8,699,0.78,984.76,0.0,...,,,,,,,,12.0,Tuesday,20.0
92079,2019-12-31 21:30:00,1.06,6.76,2.5,1.36,1.2,729,1.0,984.84,0.0,...,,,,,,,,12.0,Tuesday,21.0
92080,2019-12-31 22:30:00,0.48,6.7,4.6,3.12,2.2,293,-3.6,985.02,0.0,...,,,,,,,,12.0,Tuesday,22.0


Convert wind direction from type *object* to type *int*:

In [8]:
# Vectorised cast of the object-typed column to integers. This replaces a
# Python-level loop that looked each value up by label (`series[i]`), which is
# slow on ~92k rows and only correct while the index is the default 0..n-1 range.
df_join['wind_direction'] = df_join['wind_direction'].astype('int64')

Add dummy for whether an accident happened:

In [9]:
# Flag rows that carry accident information (1) versus weather-only rows (0).
# Only the accident-specific columns are inspected: testing *all* columns for
# NaN would also label an accident hour as 0 whenever one of its weather
# readings happens to be missing.
accident_cols = [
    col for col in df_join.columns
    if (col.startswith('Accident') and col != 'Accident') or col.startswith('RoadType')
]
df_join['Accident'] = np.where(df_join[accident_cols].notna().any(axis=1), 1, 0)

One Hot Encoding:

In [10]:
# Every object-typed column is categorical; Month and Hour are stored as
# numbers but have to be one-hot encoded as categories as well.
object_cols = df_join.select_dtypes(include=['object']).columns.tolist()
categorical_cols = object_cols + ['Month', 'Hour']
df_join = pd.get_dummies(
    df_join,
    columns=categorical_cols,
    drop_first=True,
    dtype=bool,
)
df_join.columns

Index(['date', 'air_temperature', 'water_temperature', 'wind_gust_max_10min',
       'wind_speed_avg_10min', 'wind_force_avg_10min', 'wind_direction',
       'windchill', 'barometric_pressure_qfe', 'precipitation', 'dew_point',
       'global_radiation', 'humidity', 'water_level',
       'AccidentInvolvingPedestrian', 'AccidentInvolvingBicycle',
       'AccidentInvolvingMotorcycle', 'AccidentLocation_CHLV95_E',
       'AccidentLocation_CHLV95_N', 'Accident', 'AccidentType_at00',
       'AccidentType_at1', 'AccidentType_at2', 'AccidentType_at3',
       'AccidentType_at4', 'AccidentType_at5', 'AccidentType_at6',
       'AccidentType_at7', 'AccidentType_at8', 'AccidentType_at9',
       'AccidentSeverityCategory_as2', 'AccidentSeverityCategory_as3',
       'AccidentSeverityCategory_as4', 'RoadType_rt431', 'RoadType_rt432',
       'RoadType_rt433', 'RoadType_rt434', 'RoadType_rt439', 'WeekDay_Monday',
       'WeekDay_Saturday', 'WeekDay_Sunday', 'WeekDay_Thursday',
       'WeekDay_Tuesday',

In [14]:
# Preview the first rows of the encoded frame.
df_join.head()

Unnamed: 0,date,air_temperature,water_temperature,wind_gust_max_10min,wind_speed_avg_10min,wind_force_avg_10min,wind_direction,windchill,barometric_pressure_qfe,precipitation,...,Hour_14.0,Hour_15.0,Hour_16.0,Hour_17.0,Hour_18.0,Hour_19.0,Hour_20.0,Hour_21.0,Hour_22.0,Hour_23.0
0,2011-01-01 00:30:00,2.233333,5.2,2.4,1.216667,1.216667,1785,2.2,974.55,0.0,...,False,False,False,False,False,False,False,False,False,False
1,2011-01-01 01:30:00,2.38,5.2,2.8,0.86,0.86,1076,2.16,973.98,0.0,...,False,False,False,False,False,False,False,False,False,False
2,2011-01-01 02:30:00,2.58,5.14,1.2,0.34,0.34,1159,2.58,973.64,0.0,...,False,False,False,False,False,False,False,False,False,False
3,2011-01-01 02:30:00,2.58,5.14,1.2,0.34,0.34,1159,2.58,973.64,0.0,...,False,False,False,False,False,False,False,False,False,False
4,2011-01-01 03:30:00,2.5,5.16,1.9,0.52,0.52,1122,2.54,973.42,0.0,...,False,False,False,False,False,False,False,False,False,False


In [15]:
# Check the dtypes after one-hot encoding.
df_join.dtypes

date                    datetime64[ns]
air_temperature                float64
water_temperature              float64
wind_gust_max_10min            float64
wind_speed_avg_10min           float64
                             ...      
Hour_19.0                         bool
Hour_20.0                         bool
Hour_21.0                         bool
Hour_22.0                         bool
Hour_23.0                         bool
Length: 78, dtype: object

Convert boolean types from True/False to 1/0:

In [17]:
# Turn the True/False dummy columns into 1/0 integers.
bool_cols = df_join.select_dtypes(include=['bool']).columns.tolist()
df_join[bool_cols] = df_join[bool_cols].astype(int)

Remove accident type specific columns:

In [18]:
#remove accident type specific columns:
def remove_cols(df):
    """Return ``df`` without the accident-specific feature columns.

    A column is dropped when its name starts with ``'Accident'`` (the
    ``'Accident'`` target column itself is kept) or with ``'RoadType'``.
    Non-string column labels are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame containing the remaining columns in their original order.
    """
    acc_spec_cols = [
        column for column in df.columns
        if isinstance(column, str)
        and (
            (column.startswith('Accident') and column != 'Accident')
            or column.startswith('RoadType')
        )
    ]
    return df.drop(columns=acc_spec_cols)

In [19]:
# Drop the accident-specific columns via remove_cols; the result replaces df_join.
df_join = remove_cols(df_join)

In [20]:
# Check the frame's dimensions after dropping the accident-specific columns.
df_join.shape

(92081, 55)

In [21]:
# List the remaining columns.
df_join.columns

Index(['date', 'air_temperature', 'water_temperature', 'wind_gust_max_10min',
       'wind_speed_avg_10min', 'wind_force_avg_10min', 'wind_direction',
       'windchill', 'barometric_pressure_qfe', 'precipitation', 'dew_point',
       'global_radiation', 'humidity', 'water_level', 'Accident',
       'WeekDay_Monday', 'WeekDay_Saturday', 'WeekDay_Sunday',
       'WeekDay_Thursday', 'WeekDay_Tuesday', 'WeekDay_Wednesday', 'Month_2.0',
       'Month_3.0', 'Month_4.0', 'Month_5.0', 'Month_6.0', 'Month_7.0',
       'Month_8.0', 'Month_9.0', 'Month_10.0', 'Month_11.0', 'Month_12.0',
       'Hour_1.0', 'Hour_2.0', 'Hour_3.0', 'Hour_4.0', 'Hour_5.0', 'Hour_6.0',
       'Hour_7.0', 'Hour_8.0', 'Hour_9.0', 'Hour_10.0', 'Hour_11.0',
       'Hour_12.0', 'Hour_13.0', 'Hour_14.0', 'Hour_15.0', 'Hour_16.0',
       'Hour_17.0', 'Hour_18.0', 'Hour_19.0', 'Hour_20.0', 'Hour_21.0',
       'Hour_22.0', 'Hour_23.0'],
      dtype='object')

In [None]:
# Build the output path with os.path.join so it resolves inside the
# 'datasets' directory on every OS; a hard-coded backslash only acts as a
# separator on Windows and becomes part of the file name elsewhere.
df_join.to_csv(os.path.join('datasets', 'dataset_acc_pred.csv'))

#### Trying out if neural network works

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
import keras
import tensorflow
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD

In [None]:
# Target vector and feature matrix. 'date' is a datetime column, not a
# numeric feature, so it is dropped together with the target.
Y = np.array(df_join['Accident'])
# `columns=` instead of a positional axis: DataFrame.drop(labels, 1) raises a
# TypeError since pandas 2.0.
X = np.array(df_join.drop(columns=['Accident', 'date']))
X_train_80, X_test_20, y_train_80, y_test_20 = train_test_split(X, Y, test_size=0.2, random_state=2)

# Fit the scaler on the training split only, then apply the same scaling to
# the held-out split.
scaler = MinMaxScaler()
scaler.fit(X_train_80)
X_train_80 = scaler.transform(X_train_80)
X_test_20 = scaler.transform(X_test_20)

#training and evaluating neural network
NeuralNet = Sequential([
        Dense(16, input_dim= X_train_80.shape[1], activation='sigmoid'),
        Dense(4, activation = 'sigmoid'),
        Dense(1, activation='sigmoid'),
])
# NOTE(review): MSE is kept from the original experiment; 'binary_crossentropy'
# is the usual loss for a single sigmoid output. Accuracy should also be read
# against the Accident class balance, which is not checked here.
NeuralNet.compile(loss ='MSE', optimizer ='adam', metrics = ['accuracy'])
# validation_split holds out part of the training data; the 20% test split is
# only used by evaluate() below.
historyNet = NeuralNet.fit(X_train_80, y_train_80, validation_split = 0.2, epochs=100, batch_size=64, 
                              shuffle = False, verbose = 1)
scores = NeuralNet.evaluate(X_test_20, y_test_20, verbose=1)
print("%s: %.2f%%" % (NeuralNet.metrics_names[1], scores[1]*100))

# Learning curves: training loss versus validation loss per epoch.
plt.plot(historyNet.history['loss'])
plt.plot(historyNet.history['val_loss'])
plt.title('Model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
# The second curve is the validation loss, not the test loss.
plt.legend(['train', 'validation'], loc='upper right')
plt.show()