In [4]:

# for data manipulation
import pandas as pd
import numpy as np

# for plotting
import matplotlib.pyplot as plt
import seaborn as sns
# make a nice white grid when plotting
sns.set_style("whitegrid")


In [39]:
df_train = pd.read_csv('/Users/brandonowens/Downloads/cbg_no2020_gt3crashes_feature_select_and_transform_train.csv')
df_train.head()

Unnamed: 0,census_block_group,CountHU,Pct_AO0,Pct_AO1,Pct_AO2p,D3A,D3AAO,D3AMM,D3BAO,D3BMM3,D3BMM4,D3BPO3,D3BPO4,D4B025,D4B050,D5AR,D5CRI,NatWalkInd,log_crash_per_density
0,11656,2.713491,-0.941763,0.289673,0.596977,1.325384,-3.0,0.739317,-2.0,0.9871,1.287906,63.032625,14.54599,0.0,0.0,283581.0,0.518476,10.333333,0.877227
1,42938,3.091315,-3.0,0.310526,0.689474,0.46036,-0.771369,-0.019542,-0.34806,0.037986,-1.694607,2.621897,0.295856,0.0,0.0,382.0,0.0,5.5,3.442839
2,169884,2.790988,-0.916025,0.334025,0.545643,0.363566,-0.252747,-1.533631,0.119856,-2.0,-2.0,1.74376,0.186831,0.0,0.0,9935.0,0.557457,2.5,2.077353
3,198338,2.859138,-1.000181,0.194444,0.706597,0.344122,-2.813679,-0.354128,-1.186518,0.087026,-0.692936,1.900439,0.165256,0.0,0.0,2232.0,0.04297,3.0,2.908564
4,6402,2.815578,-0.699033,0.512864,0.288165,1.486949,0.308425,0.307001,1.195819,1.49671,-2.0,250.993067,0.0,0.0,0.0,437784.0,0.800408,15.0,-0.242799


## Splitting training set into train and validation sets.

In [40]:
from sklearn.model_selection import train_test_split

df_tt, df_val = train_test_split(df_train, test_size=0.2, shuffle = True, random_state=831)

In [51]:
# Get rid of census block column and normalized crashes column.
features = df_tt.columns[1:-1]

# single out the target variable
target = df_tt.columns[-1]

## Fitting some models

In [104]:
# import our performance metrics for the models
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_absolute_error
# create an empty dictionary to store rmses and maes for the different models
rmses_tt = {}
maes_tt = {} # using Mean Absolute Error since it is less sensitive to outliers

rmses_val = {}
maes_val = {} 

In [105]:
# import our first model, then fit and predict on validation set
from sklearn.linear_model import LinearRegression
mlr = LinearRegression()

mlr.fit(df_tt[features], df_tt[target])

mlr_tt_pred = mlr.predict(df_tt[features])
mlr_pred = mlr.predict(df_val[features])

# storing the tt rmse
rmses_tt['mlr'] = round(root_mean_squared_error(df_tt[target], mlr_tt_pred), 2)
#storing the tt mae
maes_tt['mlr'] = round(mean_absolute_error(df_tt[target], mlr_tt_pred), 2)
print('Root mean squared errors on training set are', rmses_tt)
print('Mean aboslute errors on training set are', maes_tt)

# storing the val rmse
rmses_val['mlr'] = round(root_mean_squared_error(df_val[target], mlr_pred), 2)
#storing the val mae
maes_val['mlr'] = round(mean_absolute_error(df_val[target], mlr_pred), 2)
print('Root mean squared errors on validation set are', rmses_val)
print('Mean aboslute errors on validation set are', maes_val)

Root mean squared errors on training set are {'mlr': 0.63}
Mean aboslute errors on training set are {'mlr': 0.51}
Root mean squared errors on validation set are {'mlr': 0.63}
Mean aboslute errors on validation set are {'mlr': 0.51}


## Let's also look at Ridge and Lasso Regression

In [88]:
# We'll do 5-fold cv to get a good idea of the performance 
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5, shuffle=True, random_state=831)



In [101]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

alphas = [0.00001,0.0001,0.001,0.01,0.1,1,10,100,1000]

# arrays to store rmses for each of the folds at each value of alpha
ridge_rmses = np.zeros((len(alphas), 5))
lasso_rmses = np.zeros((len(alphas), 5))

for i, alph in enumerate(alphas):
    for j, (train_index, test_index) in enumerate(kfold.split(df_tt)):

        df_ttt = df_tt.iloc[train_index]
        df_ho = df_tt.iloc[test_index]

        # set up the pipelines
        ridge_pipe = Pipeline([('scale', StandardScaler()),
                       ('ridge', Ridge(alpha=alph))])

        lasso_pipe = Pipeline([('scale', StandardScaler()),
                       ('lasso', Lasso(alpha=alph))])
        
        # fit the models
        ridge_pipe.fit(df_ttt[features], df_ttt[target])
        lasso_pipe.fit(df_ttt[features], df_ttt[target])

        # make predictions
        ridge_preds = ridge_pipe.predict(df_ho[features])
        lasso_preds = lasso_pipe.predict(df_ho[features])

        # record the rmses
        ridge_rmses[i,j] = root_mean_squared_error(df_ho[target], ridge_preds)
        lasso_rmses[i,j] = root_mean_squared_error(df_ho[target], lasso_preds)
    


        
print('Average RMSEs for Ridge regression:', np.mean(ridge_rmses, axis=1))

print('Average RMSEs for Lasso regression:', np.mean(lasso_rmses, axis=1))



Average RMSEs for Ridge regression: [0.63202597 0.63202597 0.63202597 0.63202597 0.63202597 0.63202595
 0.63202595 0.63204026 0.63331127]
Average RMSEs for Lasso regression: [0.63202597 0.63202651 0.63207368 0.63491714 0.66028066 1.06739374
 1.06739374 1.06739374 1.06739374]


RMSEs for both Ridge and Lasso for various values of alpha dont seem to be much better than the base multiple linear regression.

## Let's look a some more complex models. We'll start with Random Forest Regressor.

In [102]:
# Let's fit an out-of-the-box random forest model to our data and see how well it does.
from sklearn.ensemble import RandomForestRegressor

oob_rfr = RandomForestRegressor(random_state=831)

oob_rfr.fit(df_tt[features], df_tt[target])


In [106]:
rmses_tt['oob_rfr'] = round(root_mean_squared_error(df_tt[target], oob_rfr.predict(df_tt[features])),2)
maes_tt['oob_rfr'] = round(root_mean_squared_error(df_tt[target], oob_rfr.predict(df_tt[features])),2)

print('Root mean squared errors on training set are', rmses_tt)
print('Mean aboslute errors on training set are', maes_tt)

rmses_val['oob_rfr'] = round(root_mean_squared_error(df_val[target], oob_rfr.predict(df_val[features])),2)
maes_val['oob_rfr'] = round(mean_absolute_error(df_val[target], oob_rfr.predict(df_val[features])), 2)

print('Root mean squared errors on validation set are', rmses_val)
print('Mean aboslute errors on validation set are', maes_val)

Root mean squared errors on training set are {'mlr': 0.63, 'oob_rfr': 0.21}
Mean aboslute errors on training set are {'mlr': 0.51, 'oob_rfr': 0.21}
Root mean squared errors on validation set are {'mlr': 0.63, 'oob_rfr': 0.56}
Mean aboslute errors on validation set are {'mlr': 0.51, 'oob_rfr': 0.45}


In [54]:
feature_importances = pd.DataFrame({'feature importance score':oob_rfr.feature_importances_}, index = features).sort_values(by='feature importance score', ascending = False)
print(feature_importances)


            feature importance score
D3A                         0.537017
D3BAO                       0.115174
D3BPO3                      0.043939
D5AR                        0.039837
D3AAO                       0.036029
D3BMM3                      0.029308
CountHU                     0.025809
D3BMM4                      0.023527
D3BPO4                      0.023518
D5CRI                       0.023250
D3AMM                       0.022396
NatWalkInd                  0.020428
Pct_AO0                     0.017385
Pct_AO2p                    0.017029
Pct_AO1                     0.016627
D4B050                      0.006404
D4B025                      0.002321


Let's recall: `D3A = Total road network density` and `D3BA0 = Intersection density in terms of auto-oriented intersections
per square mile`. The feature importance scores associated with our out-of-the-box random forest regressor indicate that these are the features in our dataframe with the most impact on the number of crashes (weighted by severity). Is it surprising that D3a is five times as impactful as D3bao? 

Let's do a grid search to figure out what the optimal max depth and n_estimators could be.

In [62]:

from sklearn.model_selection import GridSearchCV




# The parameter ranges can be changed once we have a better idea of a good range
grid_cv = GridSearchCV(RandomForestRegressor(),
                       param_grid= {'max_depth': range(1,11),
                       'n_estimators': [50,100,150,200]},
                                    scoring = 'neg_mean_squared_error',
                                    cv = 5)

Running the following cell took my computer over 4 hours...

In [63]:
grid_cv.fit(df_tt[features], df_tt[target])

In [64]:
grid_cv.best_params_



{'max_depth': 10, 'n_estimators': 200}

Best hyperparameters for our random forest model are max_depth=10 and n_estimators=200.

In [65]:
pd.DataFrame({'feature importance score':grid_cv.best_estimator_.feature_importances_}, index=features).sort_values(by= 'feature importance score', ascending=False)


Unnamed: 0,feature importance score
D3A,0.693816
D3BAO,0.135919
D3BPO3,0.035568
D3AAO,0.027882
D5AR,0.024065
D3BMM3,0.017719
D3BMM4,0.011254
D3BPO4,0.009366
CountHU,0.007787
D3AMM,0.007631


Again, it seems like `D3A` is doing most of the heavy lifting. I wonder how correlated that variable is with our target. Let's check.

In [73]:
correlation_matrix = np.corrcoef(df_tt['D3A'], df_tt['log_crash_per_density'])
print(correlation_matrix)

[[ 1.         -0.73528722]
 [-0.73528722  1.        ]]


hmmm. interesting.

In [70]:
# Let's look at the root mean squared error on the for the grid cv rfr on the training set.
np.sqrt(-1*grid_cv.best_score_)

np.float64(0.5759076053115321)

In [71]:
# and then look at the rmse on the validation set
print(root_mean_squared_error(df_val[target], grid_cv.best_estimator_.predict(df_val[features])))

0.567578012729294


The root mean squared error is basically the same on our training and validation sets. It seems like the model is generalizing to unseen data pretty well!

In [87]:
10**0.567

3.6897759857015027

If we transform the log-10 normalized target variable back into severity weighted crashes (apply 10^x), then our RMSE comes out to about 3.69 severity weighted crashes. Not too bad.

## Let's try XGBoost now.

In [76]:
from xgboost import XGBRegressor

# instantiate the models
oob_xgb_reg = XGBRegressor()

xgb_reg1 = XGBRegressor(learning_rate=.1,
                                   max_depth=1,
                                   n_estimators=10)

xgb_reg2 = XGBRegressor(learning_rate=1,
                                   max_depth=1,
                                   n_estimators=10)

# fit the models
oob_xgb_reg.fit(df_tt[features], df_tt[target])
xgb_reg1.fit(df_tt[features], df_tt[target])
xgb_reg2.fit(df_tt[features], df_tt[target])

#get the training performance metrics
rmses_tt['oob_xgbr'] = round(root_mean_squared_error(df_tt[target], oob_xgb_reg.predict(df_tt[features])),2)
maes_tt['oob_xgbr'] = round(mean_absolute_error(df_tt[target], oob_xgb_reg.predict(df_tt[features])), 2)

rmses_tt['xgbr1'] = round(root_mean_squared_error(df_tt[target], xgb_reg1.predict(df_tt[features])),2)
maes_tt['xgbr1'] = round(mean_absolute_error(df_tt[target], xgb_reg1.predict(df_tt[features])), 2)

rmses_tt['xgbr2'] = round(root_mean_squared_error(df_tt[target], xgb_reg2.predict(df_tt[features])),2)
maes_tt['xgbr2'] = round(mean_absolute_error(df_tt[target], xgb_reg2.predict(df_tt[features])), 2)


print('Root mean squared errors on the training set are', rmses_tt)
print('Mean aboslute errors on the training set are', maes_tt)

# get the validation performance metrics
rmses_val['oob_xgbr'] = round(root_mean_squared_error(df_val[target], oob_xgb_reg.predict(df_val[features])),2)
maes_val['oob_xgbr'] = round(mean_absolute_error(df_val[target], oob_xgb_reg.predict(df_val[features])), 2)

rmses_val['xgbr1'] = round(root_mean_squared_error(df_val[target], xgb_reg1.predict(df_val[features])),2)
maes_val['xgbr1'] = round(mean_absolute_error(df_val[target], xgb_reg1.predict(df_val[features])), 2)

rmses_val['xgbr2'] = round(root_mean_squared_error(df_val[target], xgb_reg2.predict(df_val[features])),2)
maes_val['xgbr2'] = round(mean_absolute_error(df_val[target], xgb_reg2.predict(df_val[features])), 2)


print('Root mean squared errors on the validation set are', rmses_val)
print('Mean aboslute errors on the validation are', maes_val)

Root mean squared errors on the training set are {'mlr': 0.63, 'oob_rfr': 0.21, 'xgbr1': 0.81, 'xgbr2': 0.66, 'oob_xgbr': 0.47}
Mean aboslute errors on the training set are {'mlr': 0.51, 'oob_rfr': 0.21, 'xgbr1': 0.64, 'xgbr2': 0.53, 'oob_xgbr': 0.38}
Root mean squared errors on the validation set are {'mlr': 0.63, 'oob_rfr': 0.56, 'xgbr1': 0.8, 'xgbr2': 0.66, 'oob_xgbr': 0.55}
Mean aboslute errors on the validation are {'mlr': 0.51, 'oob_rfr': 0.45, 'xgbr1': 0.64, 'xgbr2': 0.53, 'oob_xgbr': 0.44}


Looks like the out-of-box xgboost performed (0.55) a tiny bit better on the validation set than our best random forest model (0.568). There is a bit more of a difference between the test and validation set performances though when using oob_xgbr in this instance. Let's try to see what things look like after 5-fold cross validation. (I would prefer to use GridSearchCV but it doesn't want to play nicely with XGBoost for some reason.)


In [77]:
xgb_rmses = []

for i, (train_index, test_index) in enumerate(kfold.split(df_tt)):
    df_ttt = df_tt.iloc[train_index]
    df_ho = df_tt.iloc[test_index]

    # train base model
    oob_xgb_reg.fit(df_ttt[features], df_ttt[target])

    # predict on holdout set
    xgb_pred = oob_xgb_reg.predict(df_ho[features])

    # get the rmses
    xgb_rmses.append(root_mean_squared_error(df_ho[target], xgb_pred))

print(f"Base XGBoost Cross-validation RMSE: {np.mean(xgb_rmses)}")


Base XGBoost Cross-validation RMSE: 0.5613341584275047


Now let's look at the performance on the validation set.

In [84]:
xgb_rmses_val = []

for i, (train_index, test_index) in enumerate(kfold.split(df_val)):
    df_ttt = df_val.iloc[train_index]
    df_ho = df_val.iloc[test_index]

    # train base model
    oob_xgb_reg.fit(df_ttt[features], df_ttt[target])

    # predict on holdout set
    xgb_pred = oob_xgb_reg.predict(df_ho[features])

    # get the rmses
    xgb_rmses_val.append(root_mean_squared_error(df_ho[target], xgb_pred))

print(f"Base XGBoost CV RMSE on validation set: {np.mean(xgb_rmses_val)}")

Base XGBoost CV RMSE on validation set: 0.5851378978183485


So without messing around with the hyperparameters, the out-of-box XGBoost model is performing about as well on average as our best Random Forest model on the training set, and a tiny bit worse on the validation set. 

In [83]:
print(pd.DataFrame({'OOB XGB feature importance scores': oob_xgb_reg.feature_importances_}, index = features).sort_values(by='OOB XGB feature importance scores', ascending = False ))

            OOB XGB feature importance scores
D3A                                  0.549773
D3BAO                                0.154799
D3BPO3                               0.056795
D3AAO                                0.043251
D3BMM3                               0.031156
D4B050                               0.028547
D5AR                                 0.022383
D3BMM4                               0.021548
D3BPO4                               0.015679
D5CRI                                0.011901
D3AMM                                0.011346
NatWalkInd                           0.010686
Pct_AO2p                             0.010057
CountHU                              0.009017
D4B025                               0.008248
Pct_AO0                              0.007785
Pct_AO1                              0.007028


In [None]:
xgbr = XGBRegressor()

## Now let's try support vector regression

In [79]:
# First let's try support vector regression out of the box
from sklearn.svm import SVR

# Let's also try bagging it for good measure
from sklearn.ensemble import BaggingRegressor

svr1 = SVR() 
bag1 = BaggingRegressor(estimator=svr1,
                       n_estimators=100,
                       bootstrap=True,
                       random_state=831,
                       n_jobs=1)


rmses_base_svr1 = []
rmses_bagged_svr1 = []
maes_base_svr1 = []
maes_bagged_svr1 = []


for i, (train_index, test_index) in enumerate(kfold.split(df_tt)):
    df_ttt = df_tt.iloc[train_index]
    df_ho = df_tt.iloc[test_index]

    #train base model
    svr1.fit(df_ttt[features], df_ttt[target])
    

    #train bagged model
    bag1.fit(df_ttt[features], df_ttt[target])

    # predict base model on holdout set 
    svr1_pred = svr1.predict(df_ho[features])

    # predict bagged model on holdout set
    bag1_pred = bag1.predict(df_ho[features])

    #compute rmses
    rmses_base_svr1.append(root_mean_squared_error(df_ho[target], svr1_pred))
    rmses_bagged_svr1.append(root_mean_squared_error(df_ho[target], bag1_pred))

    #compute maes
    maes_base_svr1.append(mean_absolute_error(df_ho[target], svr1_pred))
    maes_bagged_svr1.append(mean_absolute_error(df_ho[target], bag1_pred))


print(f"Base SVR Cross-validation RMSE: {np.mean(rmses_base_svr1)}")
print(f"Bagged Cross-Validation RMSES: {np.mean(rmses_bagged_svr1)}")
print(f"Base SVR Cross-validation MAES: {np.mean(maes_base_svr1)}")
print(f"Bagged Cross-Validation MAES: {np.mean(maes_bagged_svr1)}")





Base SVR Cross-validation RMSE: 0.8726367220016333
Bagged Cross-Validation RMSES: 0.8726105659153512
Base SVR Cross-validation MAES: 0.7004752410618638
Bagged Cross-Validation MAES: 0.7004718785293711


That took more than 10 hours... Didn't perform too well either...