# **First Approach**: Hyperparameter Tuning for Decision Tree Regressor
 
Since Decision Tree Regressor is the best performer compared to the other models regarding the cross-validation method, we choose Decision Tree Regressor in this case for hyperparameter tuning. We use the Grid Search method to search for the best hyperparameter and then use the hyperparameters we got from the search and apply them to the model. After, we will compare the result of the before and after-tuned hyperparameter models. From our development, we see that the model after the hyperparameter is tuned gives better results regarding the RMSE score from both hold-out and cross-validation evaluation.  

### Train-Test Split (80-20) -- Regularization

In [98]:
%store -r X_train
%store -r X_test
%store -r y_train
%store -r y_test
%store -r ind_vars
%store -r dep_var
%store -r X
%store -r y
%store -r df

def rmse(a,b):
    return np.sqrt(np.mean((a-b)**2))

### Linear Regression

In [99]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Linear Regression model RMSE, MSE, R2 results
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(X_train, y_train)
# results based on test data
y_pred = reg.predict(X_test)
print("RMSE:" + str(rmse(y_test, y_pred)))
print("Mean Squared Error:" + str(mean_squared_error(y_test, y_pred)))
print("R2:" + str(r2_score(y_test, y_pred)))

RMSE:7064.543640307604
Mean Squared Error:49907776.84581061
R2:0.9589910139299496


In [100]:
# Linear regression results based on train data
y_train_pred = reg.predict(X_train)
print("RMSE:" + str(rmse(y_train, y_train_pred)))
print("Mean Squared Error:" + str(mean_squared_error(y_train, y_train_pred)))
print("R2:" + str(r2_score(y_train, y_train_pred)))

RMSE:7138.763721747759
Mean Squared Error:50961947.47494191
R2:0.9589129226026707


### Elastic Net

In [101]:
# Elastic Net model RMSE, MSE, R2 results
from sklearn.linear_model import ElasticNet

enr = ElasticNet()
enr.fit(X_train, y_train)
y_pred = enr.predict(X_test)
print("RMSE:" + str(rmse(y_test, y_pred)))
print("Mean Squared Error:" + str(mean_squared_error(y_test, y_pred)))
print("R2:" + str(r2_score(y_test, y_pred)))

RMSE:36280.82875065643
Mean Squared Error:683482291.6436001
R2:0.4383858078124082


In [102]:
# Elastic Net results based on train data
y_train_pred = enr.predict(X_train)
print("RMSE:" + str(rmse(y_train, y_train_pred)))
print("Mean Squared Error:" + str(mean_squared_error(y_train, y_train_pred)))
print("R2:" + str(r2_score(y_train, y_train_pred)))

RMSE:36601.404051881764
Mean Squared Error:698959805.1265999
R2:0.4364772730677209


### Lasso Regression

In [103]:
# Lasso Regression model and RMSE, MSE results
from sklearn.linear_model import Lasso

lasso_reg = Lasso(alpha=10)
lasso_reg.fit(X_train, y_train)
y_pred = lasso_reg.predict(X_test)
print("RMSE:" + str(rmse(y_test, y_pred)))
print("Mean Squared Error:" + str(mean_squared_error(y_test, y_pred)))
print("R2:" + str(r2_score(y_test, y_pred)))

RMSE:48942.192804157065
Mean Squared Error:51343015.62755987
R2:0.9578116849570364


In [104]:
# Lasso regression results based on train data
y_train_pred = lasso_reg.predict(X_train)
print("RMSE:" + str(rmse(y_train, y_train_pred)))
print("Mean Squared Error:" + str(mean_squared_error(y_train, y_train_pred)))
print("R2:" + str(r2_score(y_train, y_train_pred)))

RMSE:49177.35139422539
Mean Squared Error:52157080.74892002
R2:0.957949369682088


### SGD Regressor

In [105]:
# SGD Regressor odel and RMSE, MSE, R2 results
from sklearn.linear_model import SGDRegressor

clf = SGDRegressor(tol=1e-3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("RMSE:" + str(rmse(y_test, y_pred)))
print("Mean Squared Error:" + str(mean_squared_error(y_test, y_pred)))
print("R2:" + str(r2_score(y_test, y_pred)))

  y = column_or_1d(y, warn=True)


RMSE:48877.96264522262
Mean Squared Error:53425358.15038619
R2:0.9561006338762502


In [106]:
# SGD regression results based on train data
y_train_pred = clf.predict(X_train)
print("RMSE:" + str(rmse(y_train, y_train_pred)))
print("Mean Squared Error:" + str(mean_squared_error(y_train, y_train_pred)))
print("R2:" + str(r2_score(y_train, y_train_pred)))

RMSE:49132.43274222368
Mean Squared Error:53641245.3561951
R2:0.9567527908794535


### Decision Tree Regressor (MAIN)

In [107]:
# Decision Tree Regressor and RMSE, MSE results -- best performing model for approach 1
from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)
y_pred = lasso_reg.predict(X_test)
print("RMSE:" + str(rmse(y_test, y_pred)))
print("Mean Squared Error:" + str(mean_squared_error(y_test, y_pred)))
print("R2:" + str(r2_score(y_test, y_pred)))

RMSE:48942.192804157065
Mean Squared Error:51343015.62755987
R2:0.9578116849570364


In [108]:
# get importance
importance = dtr.feature_importances_
# summarize feature importance
for a,b in zip(ind_vars, importance):
	print(a, b)

total_cases 0.8852308890127735
new_cases 1.615812674482884e-05
new_deaths 5.278637024552044e-05
reproduction_rate 0.00013344676438886764
icu_patients 0.0005119393975910734
hosp_patients 9.051440625790422e-05
total_tests 0.06347292660572788
new_tests 2.4709047397695195e-06
positive_rate 0.000566975741229598
tests_per_case 3.97810346005335e-05
total_vaccinations 0.0009000164699290095
people_vaccinated 0.0005731635142816444
people_fully_vaccinated 0.002658120166017643
new_vaccinations 1.962398320831818e-05
new_people_vaccinated_smoothed 7.221796350239494e-05
stringency_index 0.00026135882532468327
population 0.026850387713382773
population_density 0.0007004146251540388
median_age 0.002294680295881222
aged_65_older 0.0031888693036596777
aged_70_older 1.0664248362189202e-09
gdp_per_capita 9.459792648870627e-05
extreme_poverty 0.00026535344547931513
cardiovasc_death_rate 5.895212799198614e-05
diabetes_prevalence 1.1479663292782217e-05
female_smokers 0.0016591751617159901
male_smokers 0.00525

In [109]:
# Decision Tree regression results based on train data
y_train_pred = dtr.predict(X_train)
print("RMSE:" + str(rmse(y_train, y_train_pred)))
print("Mean Squared Error:" + str(mean_squared_error(y_train, y_train_pred)))
print("R2:" + str(r2_score(y_train, y_train_pred)))

RMSE:49806.42621478479
Mean Squared Error:0.0
R2:1.0


### Random Forest Regressor

In [110]:
# Random Forest and RMSE, MSE, R2 results
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor(n_estimators=50, max_leaf_nodes=16, n_jobs=-1)
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)
print("RMSE:" + str(rmse(y_test, y_pred)))
print("Mean Squared Error:" + str(mean_squared_error(y_test, y_pred)))
print("R2:" + str(r2_score(y_test, y_pred)))

  rfr.fit(X_train, y_train)


RMSE:49044.786532821665
Mean Squared Error:17321686.87345887
R2:0.985766851168345


In [111]:
# Random Forest regression results based on train data
y_train_pred = rfr.predict(X_train)
print("RMSE:" + str(rmse(y_train, y_train_pred)))
print("Mean Squared Error:" + str(mean_squared_error(y_train, y_train_pred)))
print("R2:" + str(r2_score(y_train, y_train_pred)))

RMSE:49526.52632128954
Mean Squared Error:15133220.23018828
R2:0.9877991359891749


### Cross validation
We apply cross-validation with k=5 folds to estimate the performance of the machine learning model on unseen data (non-training data). The Decision Tree Regressor is the best performer in terms of MSE while linear regression is the worst performer in terms of the RMSE score.

In [112]:
from sklearn.model_selection import cross_val_score

# result for Linear Regression with cross valisation and RMSE scores, mean, standard deviation
def display_scores(scores):
    print("Scores:", np.sqrt(-scores))
    print("Mean:", np.sqrt(-scores).mean())
    print("Standard deviation:", np.sqrt(-scores).std())

### Linear Regression

In [113]:
reg = LinearRegression()
#scores = cross_val_score(reg, X, y, np.ravel(y), cv=5,scoring='neg_mean_squared_error')
scores = cross_val_score(reg, X, y, cv=5,scoring='neg_mean_squared_error')
display_scores(scores)

Scores: [13752.52130363 11688.38733639 12599.55886119 30235.38296593
 13180.05915096]
Mean: 16291.181923619486
Standard deviation: 7005.403974159406


### Elastic Net

In [114]:
# result for ElasticNet with cross validation and RMSE scores, mean, standard deviation
enr = ElasticNet()
scores = cross_val_score(enr, X, y, cv=5,scoring='neg_mean_squared_error')
display_scores(scores)

Scores: [13964.0081663  30336.13582157 31123.46956271 19256.77729951
 39416.09996319]
Mean: 26819.29816265587
Standard deviation: 9077.70518201681


### Lasso Regression

In [115]:
# result for Lasso Regression with cross validation and RMSE scores, mean, standard deviation
lasso_reg = Lasso(alpha=10)
scores = cross_val_score(lasso_reg, X, y, cv=5,scoring='neg_mean_squared_error')
display_scores(scores)

Scores: [12936.12603374 10214.45587422 12511.49416854 27743.25905593
 11698.83801519]
Mean: 15020.834629521833
Standard deviation: 6428.682929017829


### SGD Regressor

In [116]:
# result for SGD Regressor with cross validation and RMSE score, mean, standard deviation
clf = SGDRegressor(max_iter=1000, tol=1e-3)
scores = cross_val_score(clf, X, y, cv=5,scoring='neg_mean_squared_error')
display_scores(scores)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Scores: [14398.26253677  9822.31224414 12325.323009   28199.47252297
 11847.04529093]
Mean: 15318.483120761744
Standard deviation: 6602.794503375161


### Decision Tree Regressor

In [117]:
# result for Decision Tree Regressor with cross validation and RMSE score, mean, standard deviation
dtr = DecisionTreeRegressor()
scores = cross_val_score(dtr, X, y, cv=5,scoring='neg_mean_squared_error')
display_scores(scores)

Scores: [ 7884.98154711 13555.58359817 12192.43199146  9836.18309743
 17439.53314784]
Mean: 12181.742676400083
Standard deviation: 3270.9545922277207


### Random Forest Regressor

In [118]:
# result for Random Forest Regressor with cross validation and RMSE score, mean, standard deviation
rfr = RandomForestRegressor(n_estimators=50, max_leaf_nodes=16, n_jobs=-1)
scores = cross_val_score(rfr, X, y, cv=5,scoring='neg_mean_squared_error')
display_scores(scores)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Scores: [ 7138.83474172  9097.52204026 10285.65790418 10416.03989493
 12028.27810299]
Mean: 9793.266536816933
Standard deviation: 1622.0752864437839


We do the hold-out method by splitting the data into train-test sets of 80-20 for all the chosen models above. Then we compare the RMSE, MSE, and R2 of each model. We noted that Linear Regression is the best performer in this case.
 
However, we apply cross-validation with k=5 folds. The Decision Tree Regressor is the best performer with an average slightest error, while Linear regression worst performer, in relation to the RMSE score.
 
Additionally, the Linear Regression model seems to be overfitting when we apply the hold-out method of train-test of 80-20. The training performance gives better results than the testing performance.


### Hyperparameter Tuning and Validation

Since Decision Tree Regressor is the best performer compared to the other models regarding the cross-validation method, we choose Decision Tree Regressor in this case for hyperparameter tuning. We use the Grid Search method to search for the best hyperparameter and then use the hyperparameters we got from the search and apply them to the model. After, we will compare the result of the before and after-tuned hyperparameter models. From our development, we see that the model after the hyperparameter is tuned gives better results regarding the RMSE score from both hold-out and cross-validation evaluation.  

In [119]:
from sklearn.model_selection import GridSearchCV
from datetime import datetime

# Hyper parameters range intialization for tuning 
parameters={"splitter":["best","random"],
            "max_depth" : [1,3,5,7,9,11,12],
           "min_samples_leaf":[1,2,3,4,5,6,7,8,9,10],
           "min_weight_fraction_leaf":[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],
           "max_features":["auto","log2","sqrt",None],
           "max_leaf_nodes":[None,10,20,30,40,50,60,70,80,90] }

# calculating different regression metrics
tuning_model=GridSearchCV(dtr,param_grid=parameters,scoring='neg_mean_squared_error',cv=3,verbose=3)

# function for calculating how much time take for hyperparameter tuning
def timer(start_time=None):
    if not start_time:
        start_time=datetime.now()
        return start_time
    elif start_time:
        thour,temp_sec=divmod((datetime.now()-start_time).total_seconds(),3600)
        tmin,tsec=divmod(temp_sec,60)
        #print(thour,":",tmin,':',round(tsec,2))

In [120]:
X=df.iloc[:,:-1]
y=df.iloc[:,-1]

In [121]:
%%capture

start_time=timer(None)

tuning_model.fit(X,y)

timer(start_time)

In [122]:
# best hyperparameters 
tuning_model.best_params_

{'max_depth': 7,
 'max_features': 'auto',
 'max_leaf_nodes': 10,
 'min_samples_leaf': 2,
 'min_weight_fraction_leaf': 0.1,
 'splitter': 'best'}

In [123]:
# best model score
tuning_model.best_score_

-0.0003876787787329035

In [124]:
tuned_hyper_model= DecisionTreeRegressor(max_depth=5,max_features='auto',max_leaf_nodes=50,min_samples_leaf=2,min_weight_fraction_leaf=0.1,splitter='random')

In [125]:
# fitting model
tuned_hyper_model.fit(X_train,y_train)

DecisionTreeRegressor(max_depth=5, max_features='auto', max_leaf_nodes=50,
                      min_samples_leaf=2, min_weight_fraction_leaf=0.1,
                      splitter='random')

In [126]:
# prediction 
tuned_pred=tuned_hyper_model.predict(X_test)

### Evaluating the Performance of Our Model

The Linear regression model seem to be overfitting when the testing data create more error than training data.


In [127]:
from sklearn import metrics

# Decision tree regressor with MAE, MSE, RMSE results, with hyperparameter tuned 
print('MAE:', metrics.mean_absolute_error(y_test,tuned_pred))
print('MSE:', metrics.mean_squared_error(y_test, tuned_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, tuned_pred)))

# Decsion Tree Regressor with cross validation RMSE score, standard deviaton, with hyperparameter tuned
scores = cross_val_score(tuned_hyper_model, X, y, cv=3, scoring='neg_mean_squared_error')
display_scores(scores)

MAE: 7195.044235459026
MSE: 147069244.8879156
RMSE: 12127.21092782325
Scores: [0.02004596 0.04438509 0.02365256]
Mean: 0.0293612043809284
Standard deviation: 0.010725039561241267
