# Join the two datasets

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import os

#### Load datasets:

In [2]:
# Load the two input tables from their pickle files (paths are relative to the working directory).
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files from a trusted source.
df_hourly_weather = pd.read_pickle('datasets/hourly_weather.pkl')
df_accidents = pd.read_pickle("datasets/accidents.pkl")

**Check NaN again:**

In [3]:
# Locate every missing cell in the accident table: idx holds the row positions,
# idy the column positions (both come back empty when nothing is missing).
idx, idy = np.nonzero(df_accidents.isna().to_numpy())
idx, df_accidents.columns[idy]

(array([], dtype=int64), Index([], dtype='object'))

**Compare dims of the datasets:**

In [4]:
# (rows, columns) of the hourly weather table.
df_hourly_weather.shape

(78707, 14)

In [5]:
# (rows, columns) of the accident table.
df_accidents.shape

(42726, 13)

**Check which columns are in the dfs:**

In [6]:
# Show the column names of both tables, with a spacer line in between.
print('Weather dataset:', df_hourly_weather.columns.tolist())
print(' ')
print('Accidents dataset:', df_accidents.columns.tolist())

Weather dataset: ['date', 'air_temperature', 'water_temperature', 'wind_gust_max_10min', 'wind_speed_avg_10min', 'wind_force_avg_10min', 'wind_direction', 'windchill', 'barometric_pressure_qfe', 'precipitation', 'dew_point', 'global_radiation', 'humidity', 'water_level']
 
Accidents dataset: ['date', 'AccidentType', 'AccidentSeverityCategory', 'AccidentInvolvingPedestrian', 'AccidentInvolvingBicycle', 'AccidentInvolvingMotorcycle', 'RoadType', 'AccidentLocation_CHLV95_E', 'AccidentLocation_CHLV95_N', 'Year', 'Month', 'WeekDay', 'Hour']


**Notes**
<br>
***Neural Network***:
<br> 
Need to one hot encode object inputs (*normalization will be done when splitting data into train and test*).
<br>
***Random Forest***:
<br>
Only one hot encoding is necessary.
<br><br>
**A: Prediction for accident yes/no**:
<br>
Join accident data on weather data.
<br>
Use only weather data to predict whether an accident happens or not.
<br><br>
**B: Prediction for multi-level**:
<br>
Join accident data on weather data.
<br> 
Could predict the probability of a given type of accident (AccidentSeverity and AccidentType) based on weather data (maybe add location)


### A: Merge and fill nans of date columns:

In [7]:
# Left join on the timestamp: every weather row is kept, a weather row is repeated once per
# matching accident, and the accident columns are NaN where no accident matched.
df_join = df_hourly_weather.merge(df_accidents, how='left', on='date')
df_join.shape

(92081, 26)

In [8]:
# Per column: does it contain at least one NaN after the left join?
df_join.isnull().any()

date                           False
air_temperature                False
water_temperature              False
wind_gust_max_10min            False
wind_speed_avg_10min           False
wind_force_avg_10min           False
wind_direction                 False
windchill                      False
barometric_pressure_qfe        False
precipitation                  False
dew_point                      False
global_radiation               False
humidity                       False
water_level                    False
AccidentType                    True
AccidentSeverityCategory        True
AccidentInvolvingPedestrian     True
AccidentInvolvingBicycle        True
AccidentInvolvingMotorcycle     True
RoadType                        True
AccidentLocation_CHLV95_E       True
AccidentLocation_CHLV95_N       True
Year                            True
Month                           True
WeekDay                         True
Hour                            True
dtype: bool

In [9]:
# Inspect the column dtypes of the merged frame.
df_join.dtypes

date                           datetime64[ns]
air_temperature                       float64
water_temperature                     float64
wind_gust_max_10min                   float64
wind_speed_avg_10min                  float64
wind_force_avg_10min                  float64
wind_direction                         object
windchill                             float64
barometric_pressure_qfe               float64
precipitation                         float64
dew_point                             float64
global_radiation                      float64
humidity                              float64
water_level                           float64
AccidentType                           object
AccidentSeverityCategory               object
AccidentInvolvingPedestrian           float64
AccidentInvolvingBicycle              float64
AccidentInvolvingMotorcycle           float64
RoadType                               object
AccidentLocation_CHLV95_E             float64
AccidentLocation_CHLV95_N         

- Fill Month, WeekDay and Hour for the weather rows that have no matching accident:

In [10]:
# Rows without a matching accident have NaN calendar fields; derive them from the timestamp.
# The result is assigned back to the column instead of calling fillna(inplace=True) on
# df_join[...]: that chained in-place form acts on a temporary object, emits a FutureWarning
# in pandas 2.x and no longer updates df_join once Copy-on-Write is enabled.
# Year is not filled here and stays NaN for those rows.
df_join['Month'] = df_join['Month'].fillna(df_join['date'].dt.month)
df_join['WeekDay'] = df_join['WeekDay'].fillna(df_join['date'].dt.day_name())
df_join['Hour'] = df_join['Hour'].fillna(df_join['date'].dt.hour)
df_join.tail()

Unnamed: 0,date,air_temperature,water_temperature,wind_gust_max_10min,wind_speed_avg_10min,wind_force_avg_10min,wind_direction,windchill,barometric_pressure_qfe,precipitation,...,AccidentInvolvingPedestrian,AccidentInvolvingBicycle,AccidentInvolvingMotorcycle,RoadType,AccidentLocation_CHLV95_E,AccidentLocation_CHLV95_N,Year,Month,WeekDay,Hour
92076,2019-12-31 18:30:00,1.56,6.8,2.3,1.24,1.0,734,1.54,984.54,0.0,...,0.0,0.0,0.0,rt432,2682276.0,1247052.0,2019.0,12.0,Tuesday,18.0
92077,2019-12-31 19:30:00,1.34,6.8,2.6,1.44,1.2,963,1.18,984.62,0.0,...,0.0,0.0,0.0,rt433,2683004.0,1247184.0,2019.0,12.0,Tuesday,19.0
92078,2019-12-31 20:30:00,1.04,6.8,2.8,1.66,1.8,699,0.78,984.76,0.0,...,,,,,,,,12.0,Tuesday,20.0
92079,2019-12-31 21:30:00,1.06,6.76,2.5,1.36,1.2,729,1.0,984.84,0.0,...,,,,,,,,12.0,Tuesday,21.0
92080,2019-12-31 22:30:00,0.48,6.7,4.6,3.12,2.2,293,-3.6,985.02,0.0,...,,,,,,,,12.0,Tuesday,22.0


- Convert wind direction from type *object* to type *int*:

In [11]:
# Cast the object-typed wind direction values to 64-bit integers in one vectorised step.
df_join['wind_direction'] = df_join['wind_direction'].astype('int64')

- Add dummy for whether an accident happened:

In [12]:
# Re-check the dtypes after the wind_direction conversion above.
df_join.dtypes

date                           datetime64[ns]
air_temperature                       float64
water_temperature                     float64
wind_gust_max_10min                   float64
wind_speed_avg_10min                  float64
wind_force_avg_10min                  float64
wind_direction                          int64
windchill                             float64
barometric_pressure_qfe               float64
precipitation                         float64
dew_point                             float64
global_radiation                      float64
humidity                              float64
water_level                           float64
AccidentType                           object
AccidentSeverityCategory               object
AccidentInvolvingPedestrian           float64
AccidentInvolvingBicycle              float64
AccidentInvolvingMotorcycle           float64
RoadType                               object
AccidentLocation_CHLV95_E             float64
AccidentLocation_CHLV95_N         

In [13]:
# Accident = 1 when the weather row was matched to an accident record, else 0.
# AccidentType comes from the accident table only, so it is NaN exactly for the unmatched rows
# of the left join (the accident table itself was checked to be NaN-free earlier).
# Keying on this one column avoids depending on "any NaN anywhere in the row", which would
# change meaning if a weather value were missing or if every accident column had been filled.
df_join['Accident'] = np.where(df_join['AccidentType'].notna(), 1, 0)

- One Hot Encoding:

In [14]:
# One-hot encode every object-typed column plus Month and Hour, which are numeric
# but are treated as categories here.
# drop_first=True omits the first level of each encoded column. Rows whose category is NaN
# get False in all dummy columns of that feature.
# NOTE(review): for the accident columns this presumably makes the dropped first level
# look the same as "no accident" in the dummies alone — confirm this is intended.
categorical_cols = df_join.select_dtypes(include=['object']).columns.tolist() + ['Month', 'Hour']
df_join = pd.get_dummies(df_join, columns=categorical_cols, dtype=bool, drop_first=True)
df_join.columns

Index(['date', 'air_temperature', 'water_temperature', 'wind_gust_max_10min',
       'wind_speed_avg_10min', 'wind_force_avg_10min', 'wind_direction',
       'windchill', 'barometric_pressure_qfe', 'precipitation', 'dew_point',
       'global_radiation', 'humidity', 'water_level',
       'AccidentInvolvingPedestrian', 'AccidentInvolvingBicycle',
       'AccidentInvolvingMotorcycle', 'AccidentLocation_CHLV95_E',
       'AccidentLocation_CHLV95_N', 'Year', 'Accident', 'AccidentType_at00',
       'AccidentType_at1', 'AccidentType_at2', 'AccidentType_at3',
       'AccidentType_at4', 'AccidentType_at5', 'AccidentType_at6',
       'AccidentType_at7', 'AccidentType_at8', 'AccidentType_at9',
       'AccidentSeverityCategory_as2', 'AccidentSeverityCategory_as3',
       'AccidentSeverityCategory_as4', 'RoadType_rt431', 'RoadType_rt432',
       'RoadType_rt433', 'RoadType_rt434', 'RoadType_rt439', 'WeekDay_Monday',
       'WeekDay_Saturday', 'WeekDay_Sunday', 'WeekDay_Thursday',
       'WeekDay_T

In [15]:
# Spot-check one dummy column on the last rows of the frame.
df_join['AccidentType_at00'].tail()

92076    False
92077    False
92078    False
92079    False
92080    False
Name: AccidentType_at00, dtype: bool

In [16]:
# Dtypes after one-hot encoding.
df_join.dtypes

date                    datetime64[ns]
air_temperature                float64
water_temperature              float64
wind_gust_max_10min            float64
wind_speed_avg_10min           float64
                             ...      
Hour_19.0                         bool
Hour_20.0                         bool
Hour_21.0                         bool
Hour_22.0                         bool
Hour_23.0                         bool
Length: 79, dtype: object

- Convert boolean types from True/False to 1/0:

In [17]:
# Turn every boolean dummy column into 0/1 integers.
bool_cols = df_join.select_dtypes(include=['bool']).columns.tolist()
df_join[bool_cols] = df_join[bool_cols].astype(int)

- Remove the accident-specific location and road-type columns:

In [18]:
# Drop the columns whose names start with 'AccidentLocation' or 'RoadType'
# (accident coordinates and the road-type dummies).
acc_spec_cols = [column for column in df_join.columns if column.startswith(('AccidentLocation', 'RoadType'))]
df_join = df_join.drop(columns=acc_spec_cols)

In [19]:
# (rows, columns) after dropping the location and road-type columns.
df_join.shape

(92081, 72)

In [20]:
# Remaining column names.
df_join.columns

Index(['date', 'air_temperature', 'water_temperature', 'wind_gust_max_10min',
       'wind_speed_avg_10min', 'wind_force_avg_10min', 'wind_direction',
       'windchill', 'barometric_pressure_qfe', 'precipitation', 'dew_point',
       'global_radiation', 'humidity', 'water_level',
       'AccidentInvolvingPedestrian', 'AccidentInvolvingBicycle',
       'AccidentInvolvingMotorcycle', 'Year', 'Accident', 'AccidentType_at00',
       'AccidentType_at1', 'AccidentType_at2', 'AccidentType_at3',
       'AccidentType_at4', 'AccidentType_at5', 'AccidentType_at6',
       'AccidentType_at7', 'AccidentType_at8', 'AccidentType_at9',
       'AccidentSeverityCategory_as2', 'AccidentSeverityCategory_as3',
       'AccidentSeverityCategory_as4', 'WeekDay_Monday', 'WeekDay_Saturday',
       'WeekDay_Sunday', 'WeekDay_Thursday', 'WeekDay_Tuesday',
       'WeekDay_Wednesday', 'Month_2.0', 'Month_3.0', 'Month_4.0', 'Month_5.0',
       'Month_6.0', 'Month_7.0', 'Month_8.0', 'Month_9.0', 'Month_10.0',
       

In [21]:
# Confirm that this dummy column contains no NaN.
df_join['AccidentType_at00'].isnull().any()

False

In [22]:
# Write the joined, encoded frame into the same relative 'datasets' directory the inputs are
# read from, rather than a user-specific absolute path, so the notebook runs on any machine.
# The DataFrame index is still written as the first, unnamed CSV column.
# NOTE(review): anything that reads the CSV from the old absolute location must be pointed here.
df_join.to_csv('datasets/dataset.csv')

# Preprocessing in OpenRefine

The following changes were made in OpenRefine as part of the preprocessing and are documented here in the notebook:
- Removed unnecessary columns (3 languages, days, months, hours, column number, etc.)
- Transformed date column ToDate
- Removed AccidentUID - Ran Duplicates facet on it --> no duplicates (false 48017)
- Removed CantonCode because everything from canton ZH
- Removed MunicipalityCode - Always 261
- Removed rows after 2019 because the weather data has no later entries
- air_temp to numeric value --> numeric facet --> removed all blank rows
- weather values to numeric values