In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Collisions from 2000 to 2019 in the INSIGHT neighborhood
# ('Church-Yonge Corridor (75)'). The CSV is the output of the
# 'TorontoCollisions_EDA' notebook.
collisions_csv = "../input/geo_dfE_I.csv"
geo_dfE_I = pd.read_csv(collisions_csv)
geo_dfE_I.head()

In [None]:
# Summary statistics for the per-collision count columns
count_cols = ['# involved', '# injured', '# fatalities', '# KSI']
geo_dfE_I[count_cols].describe()

In [None]:
# Share of collisions (in percent) for each distinct value of '# involved'
inv_per = (
    geo_dfE_I['# involved']
    .value_counts(normalize=True)
    .mul(100)
    .to_frame(name='percentage')
)
inv_per

In [None]:
import plotly.graph_objects as go

# One bar per '# involved' value; the percentage is printed above each bar
# with two decimals and a trailing percent sign.
involved_bar = go.Bar(
    x=inv_per.index,
    y=inv_per['percentage'],
    text=inv_per['percentage'],
    textposition='outside',
    texttemplate='%{text:.2f}'+'%',
)

# X axis: one tick per integer count, starting at 1.
x_axis = dict(
    title='Number of people involved in each collision',
    tickmode='linear',
    tick0=1,
    dtick=1,
)

# Y axis: only the 0 tick is drawn; the values are read from the bar labels.
y_axis = dict(
    title='Percentage',
    tickmode='array',
    tickvals=[0],
)

fig = go.Figure(data=[involved_bar])
fig.update_layout(title='', xaxis=x_axis, yaxis=y_axis)
fig.show()

In [None]:
# Create a new dataframe for doing regression prediction.
# Take an explicit copy so the column assignments below modify an independent
# frame instead of a slice of geo_dfE_I (avoids SettingWithCopyWarning).
df_reg = geo_dfE_I[['acc_date', 'road_surface_cond', 'visibility', 'light', '# involved']].copy()

# Parse the collision date once; errors='raise' stops on any unparseable value.
acc_date = pd.to_datetime(df_reg['acc_date'], errors='raise')

# Calendar features derived from the collision date.
df_reg['year'] = acc_date.dt.year
df_reg['month'] = acc_date.dt.month
df_reg['month_name'] = acc_date.dt.month_name()
df_reg['day_of_week'] = acc_date.dt.day_name()

# The raw date is no longer needed once the calendar features exist.
df_reg = df_reg.drop(columns='acc_date')
df_reg

# one-hot encoding
**We need to one-hot encode our categorical features: 'road_surface_cond', 'visibility', 'light', 'month', and 'day_of_week'.**

In [None]:
# Distinct road surface conditions present in the data
df_reg['road_surface_cond'].unique().tolist()

In [None]:
# Distinct visibility conditions present in the data
df_reg['visibility'].unique().tolist()

In [None]:
# Distinct light conditions present in the data
df_reg['light'].unique().tolist()

In [None]:
# Columns to one-hot encode ("ohe" = one-hot encoding)
ohe_fields = [
    'road_surface_cond',
    'visibility',
    'light',
    'month',
    'day_of_week',
]

# Replace each listed column with indicator columns named '<column>_<value>'
df_reg = pd.get_dummies(data=df_reg, prefix=ohe_fields, columns=ohe_fields)
df_reg

In [None]:
# Set the train and test sets: 2018 is held out for testing, and the model is
# trained on every year other than 2018 and 2019.
# Note: `year != (2018 & 2019)` would NOT express this. `&` is a bitwise AND,
# and 2018 & 2019 == 2018, so only 2018 would be excluded and the 2019 rows
# would end up in the training set. `isin` excludes both years explicitly.
df_reg_2018 = df_reg[df_reg['year'] == 2018]  # test
df_reg_other = df_reg[~df_reg['year'].isin([2018, 2019])]  # train

In [None]:
# Inspect the column names of df_reg (used to pick the model features)
df_reg.columns

In [None]:
# Indicator columns used as model features. The list is defined once so the
# train and test matrices are guaranteed to share the same columns in the same
# order.
feature_cols = [
    'road_surface_cond_DRY',
    'road_surface_cond_ICE', 'road_surface_cond_LOOSE SAND OR GRAVEL',
    'road_surface_cond_LOOSE SNOW', 'road_surface_cond_OTHER',
    'road_surface_cond_PACKED SNOW', 'road_surface_cond_SLUSH',
    'road_surface_cond_SPILLED LIQUID', 'road_surface_cond_WET',
    'visibility_CLEAR', 'visibility_DRIFTING SNOW',
    'visibility_FOG, MIST, SMOKE, DUST', 'visibility_FREEZING RAIN',
    'visibility_OTHER', 'visibility_RAIN', 'visibility_SNOW',
    'visibility_STRONG WIND', 'light_DARK', 'light_DARK, ARTIFICIAL',
    'light_DAWN', 'light_DAWN, ARTIFICIAL', 'light_DAYLIGHT',
    'light_DAYLIGHT, ARTIFICIAL', 'light_DUSK', 'light_DUSK, ARTIFICIAL',
    'light_OTHER', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5',
    'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11',
    'month_12', 'day_of_week_Friday', 'day_of_week_Monday',
    'day_of_week_Saturday', 'day_of_week_Sunday', 'day_of_week_Thursday',
    'day_of_week_Tuesday', 'day_of_week_Wednesday',
]

# Regression target: number of people involved in each collision.
target_col = '# involved'

# Training arrays
X_train = df_reg_other[feature_cols].values
Y_train = df_reg_other[target_col].values

# Test arrays (2018)
X_test = df_reg_2018[feature_cols].values
Y_test = df_reg_2018[target_col].values

In [None]:
# Normalize Data
# Standardization gives each feature zero mean and unit variance; this is good
# practice, especially for distance-based algorithms such as KNN.
from sklearn import preprocessing

X_train = X_train.astype(float)
X_test = X_test.astype(float)

# Fit the scaler on the training features only and apply that same
# transformation to the test features. Fitting a second scaler on the test set
# would put the two sets on different scales and use test-set statistics.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Report the feature-matrix and target-vector shapes of each split
for label, features, target in (
    ('Train set:', X_train, Y_train),
    ('Test set:', X_test, Y_test),
):
    print(label, features.shape, target.shape)

In [None]:
# (Disabled) Baseline: multiple linear regression fitted on the training set.
# # Multiple Linear Regression
# from sklearn import linear_model
# regr = linear_model.LinearRegression()
# regr.fit(X_train, Y_train)
# # The coefficients
# print ('Coefficients: ', regr.coef_)

In [None]:
# (Disabled) Evaluation of the linear regression baseline on the test set.
# Requires `regr` from the disabled linear regression cell.
# Y_hat= regr.predict(X_test)
# print("Mean squared error (MSE): %.2f"
#       % np.mean((Y_hat - Y_test) ** 2))

# # LinearRegression.score returns R^2 (coefficient of determination),
# # not the explained variance: 1 is perfect prediction
# print('Variance score: %.2f' % regr.score(X_test, Y_test))

In [None]:
# Polynomial regression (disabled)
# NOTE(review): points to address before re-enabling this cell:
#   * `linear_model` is only imported in the (also disabled) linear regression
#     cell, so it must be imported here as well;
#   * `clf.fit(...)` returns the fitted estimator, not predictions, so the name
#     `Y_hat_train` is misleading;
#   * the test set should go through `poly.transform(X_test)`, since `poly` was
#     already fitted on the training data;
#   * `r2_score` takes (y_true, y_pred), i.e. `r2_score(Y_test, Y_hat_test)`;
#   * degree=5 over all the one-hot feature columns expands to a very large
#     number of polynomial terms.

# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.metrics import r2_score

# poly = PolynomialFeatures(degree=5)
# X_train_poly = poly.fit_transform(X_train)

# clf = linear_model.LinearRegression()
# Y_hat_train = clf.fit(X_train_poly, Y_train)

# The coefficients
# print ('Coefficients: ', clf.coef_)
# print ('Intercept: ', clf.intercept_)

# X_test_poly = poly.fit_transform(X_test)
# Y_hat_test = clf.predict(X_test_poly)
# print("Mean absolute error: %.2f" % np.mean(np.absolute(Y_hat_test - Y_test)))
# print("Mean squared error (MSE): %.2f" % np.mean((Y_hat_test - Y_test) ** 2))
# print("R2-score: %.2f" % r2_score(Y_hat_test, Y_test))

**Evaluation**

We compare the actual values with the predicted values to measure how well a regression model performs. Evaluation metrics play a key role in the development of a model, as they provide insight into areas that require improvement.
There are several model evaluation metrics; the most common ones for regression are listed below. The random forest model in this notebook is evaluated on the test set with the mean absolute error and an accuracy derived from the mean absolute percentage error (MAPE):
- **Mean absolute error**: It is the mean of the absolute value of the errors. This is the easiest of the metrics to understand since it’s just average error.
- **Mean Squared Error (MSE)**: It is the mean of the squared error. It’s more popular than Mean absolute error because the focus is geared more towards large errors. This is because squaring the errors penalizes larger errors disproportionately (quadratically) in comparison to smaller ones.
- **Root Mean Squared Error (RMSE)**: This is the square root of the Mean Square Error.
- **R-squared** is not an error measure, but it is a popular metric for the goodness of fit of your model. It represents how close the data are to the fitted regression line. The higher the R-squared, the better the model fits your data. The best possible score is 1.0, and it can be negative (because the model can be arbitrarily worse than always predicting the mean).

In [None]:
# Random Forest Regression
from sklearn.ensemble import RandomForestRegressor

# Forest of 1000 decision trees with a fixed random_state
forest_params = {'n_estimators': 1000, 'random_state': 42}
rfr = RandomForestRegressor(**forest_params)

# Fit the forest on the training split
rfr.fit(X_train, Y_train)

In [None]:
# Predict '# involved' for the test rows with the fitted random forest
Y_hat_test = rfr.predict(X_test)

# Absolute error of each individual prediction
errors = np.abs(Y_hat_test - Y_test)

# Report the mean absolute error (MAE), rounded to two decimals
mae = errors.mean()
print('Mean Absolute Error:', round(mae, 2))

In [None]:
# Absolute percentage error of each test prediction; its mean is the MAPE
mape = (errors / Y_test) * 100

# Accuracy is reported as 100 minus the MAPE
mean_ape = mape.mean()
accuracy = 100 - mean_ape
print('Accuracy:', round(accuracy, 2), '%')

In [None]:
# Average predicted '# involved' over the test set
np.mean(Y_hat_test)