In [3]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load the cleaned bike-rental table. The last column is used as the target;
# every column before it is used as a feature.
df_bikes = pd.read_csv('../data/bike_rentals_cleaned.csv')

y_bikes = df_bikes.iloc[:, -1]
X_bikes = df_bikes.iloc[:, :-1]

In [5]:
# Baseline: 5-fold cross-validation of a decision tree with default
# hyperparameters (seeded so the result is repeatable).
reg = DecisionTreeRegressor(random_state=2)
scores = cross_val_score(
    reg,
    X_bikes,
    y_bikes,
    scoring='neg_mean_squared_error',
    cv=5,
)

# The scorer returns negated MSE per fold; flip the sign before taking the root.
rmse = np.sqrt(-scores)

print('RMSE mean: %0.2f' % (rmse.mean()))

RMSE mean: 1233.36


In [8]:
# Check how well the decision tree predicts the data it was trained on.
# A tree with no depth or leaf-size limit can fit the training rows exactly,
# so a training RMSE near zero here points to overfitting, not to a good model.

X_train, X_test, y_train, y_test = train_test_split(X_bikes, y_bikes, random_state=2)

# Seed the tree the same way as the other estimators in this notebook so the
# cell gives the same result on every run.
reg = DecisionTreeRegressor(random_state=2)
reg.fit(X_train, y_train)

# Predict on the training rows themselves (not X_test) on purpose.
y_pred = reg.predict(X_train)

reg_mse = mean_squared_error(y_train, y_pred)
reg_rmse = np.sqrt(reg_mse)

print(reg_rmse)



0.0


In [10]:
# max_depth hyperparameter: limits how deep the tree may grow, i.e. how many
# successive splits can be made along any path. Limiting it reduces variance.
# None means no limit.
params = {'max_depth':[None,2,3,4,6,8,10,20]}
reg = DecisionTreeRegressor(random_state=2)

# 5-fold search scored on negated MSE, using all available cores.
grid_reg = GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
grid_reg.fit(X_train, y_train)

In [14]:
# The grid search has been fit; pull out the winning hyperparameters and score.
best_params = grid_reg.best_params_
best_score = np.sqrt(-grid_reg.best_score_)

# In the recorded run a max depth of 6 gave the best cross-validation score.
print("Best params:", best_params)

# Note: best_score_ is the mean cross-validated (negated) MSE of the best
# candidate, so the value printed under "Training score" is a CV-based RMSE,
# not an error measured by re-predicting the training rows.
print("Training score: {:.3f}".format(best_score))

Best params: {'max_depth': 6}
Training score: 951.398


In [15]:
# Score the best estimator found by the grid search on the test split,
# which was not used when fitting the search.
best_model = grid_reg.best_estimator_

y_pred = best_model.predict(X_test)

# RMSE is the square root of the mean squared error.
rmse_test = mean_squared_error(y_test, y_pred) ** 0.5

print('Test score: {:.3f}'.format(rmse_test))

Test score: 864.670


In [None]:
# min_samples_leaf provides a restriction on number of samples that a leaf may have. Designed to reduce overfitting
# Default = 1
def grid_search(params, reg=DecisionTreeRegressor(random_state=2)):
    """Run a 5-fold grid search over ``params`` and print a short report.

    The search is fit on the notebook-level ``X_train``/``y_train`` and scored
    on negated mean squared error using all available cores. Three lines are
    printed: the best parameter combination, the square root of the negated
    best cross-validation score (labelled "Training score"), and the RMSE of
    the fitted search's predictions on ``X_test``/``y_test``.

    Args:
        params: Parameter grid handed straight to ``GridSearchCV``.
        reg: Estimator to tune. The default ``DecisionTreeRegressor`` is built
            once, when this function is defined, and shared by every call
            that omits ``reg``.

    Returns:
        None. The results are only printed.
    """
    search = GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
    search.fit(X_train, y_train)

    print("Best params:", search.best_params_)

    # best_score_ is a negated MSE; flip the sign before taking the root.
    cv_rmse = np.sqrt(-search.best_score_)
    print("Training score: {:.3f}".format(cv_rmse))

    # Predicting through the search object uses its refit best estimator.
    test_mse = mean_squared_error(y_test, search.predict(X_test))
    print('Test score: {:.3f}'.format(test_mse**0.5))