In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from category_encoders import OrdinalEncoder
from sklearn.model_selection import train_test_split, cross_val_score

In [4]:
df = pd.read_csv("/Users/imac/DAT07-28-AG/ClassMaterial/Unit3/data/bikeshare.csv", parse_dates=["datetime"])

In [5]:
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,2011-01-01 00:00:00,Spring,0,0,Clear Skies,9.84,14.395,81,0.0,16
1,2011-01-01 01:00:00,Spring,0,0,Clear Skies,9.02,13.635,80,0.0,40
2,2011-01-01 02:00:00,Spring,0,0,Clear Skies,9.02,13.635,80,0.0,32
3,2011-01-01 03:00:00,Spring,0,0,Clear Skies,9.84,14.395,75,0.0,13
4,2011-01-01 04:00:00,Spring,0,0,Clear Skies,9.84,14.395,75,0.0,1


In [26]:
df["hour"] = df["datetime"].dt.hour

In [27]:
# create pipeline first 
pipe = make_pipeline(OrdinalEncoder(), GradientBoostingRegressor())

In [28]:
# declare X and y
X = df.drop(["count", "datetime"], axis=1)
y = df["count"]

In [29]:
# create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

In [30]:
# check the test section is the most recent data / the stuff at the end 
X_test.tail()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,hour
10881,Winter,0,1,Clear Skies,15.58,19.695,50,26.0027,19
10882,Winter,0,1,Clear Skies,14.76,17.425,57,15.0013,20
10883,Winter,0,1,Clear Skies,13.94,15.91,61,15.0013,21
10884,Winter,0,1,Clear Skies,13.94,17.425,61,6.0032,22
10885,Winter,0,1,Clear Skies,13.12,16.665,66,8.9981,23


In [31]:
X_train.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,hour
0,Spring,0,0,Clear Skies,9.84,14.395,81,0.0,0
1,Spring,0,0,Clear Skies,9.02,13.635,80,0.0,1
2,Spring,0,0,Clear Skies,9.02,13.635,80,0.0,2
3,Spring,0,0,Clear Skies,9.84,14.395,75,0.0,3
4,Spring,0,0,Clear Skies,9.84,14.395,75,0.0,4


In [48]:
# create validation set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, shuffle=False, test_size=0.2)

In [33]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['season', 'weather'],
                                mapping=[{'col': 'season',
                                          'data_type': dtype('O'),
                                          'mapping': Spring    1
Summer    2
Fall      3
Winter    4
NaN      -2
dtype: int64},
                                         {'col': 'weather',
                                          'data_type': dtype('O'),
                                          'mapping': Clear Skies          1
Partly Cloudy        2
Light Storms/Rain    3
Heavy Storms/Rain    4
NaN                 -2
dtype: int64}])),
                ('gradientboostingregressor', GradientBoostingRegressor())])

In [40]:
pipe.steps[1][1].get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'presort': 'deprecated',
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [34]:
# fit on training set
OrdinalEncoder().fit_transform(X_train)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,hour
0,1,0,0,1,9.84,14.395,81,0.0000,0
1,1,0,0,1,9.02,13.635,80,0.0000,1
2,1,0,0,1,9.02,13.635,80,0.0000,2
3,1,0,0,1,9.84,14.395,75,0.0000,3
4,1,0,0,1,9.84,14.395,75,0.0000,4
...,...,...,...,...,...,...,...,...,...
6961,2,0,0,1,18.04,21.970,26,15.0013,9
6962,2,0,0,1,20.50,24.240,25,15.0013,10
6963,2,0,0,1,20.50,24.240,25,19.0012,11
6964,2,0,0,1,22.14,25.760,24,16.9979,12


In [35]:
# score on val set
pipe.score(X_val, y_val)

0.6269555891884147

In [36]:
# use pipeline for cross val scores
cross_val_score(estimator=pipe, X=X_train, y=y_train, cv=10)

array([-1.10459633, -0.63388175,  0.74789026,  0.87714167,  0.86734897,
        0.84897331,  0.84504906,  0.81513018,  0.65276031,  0.5316119 ])

In [43]:
# use different parameters to check what best version of the model is 
max_depth = [3, 4, 5]
num_trees = [100, 250, 500]
cv_scores = []

for depth in max_depth:
    for tree in num_trees:
        pipe.steps[1][1].set_params(n_estimators=tree, max_depth=depth)
        pipe.fit(X_train, y_train)
        val_score = pipe.score(X_val, y_val)
        cv_dict = {"score": val_score, "max_depth": depth, "n_estimators": tree}
        cv_scores.append(cv_dict)
        print(val_score, depth, tree)

0.6269555891884147 3 100
0.6772545432384518 3 250
0.685612995597679 3 500
0.6723273810986762 4 100
0.6929452923840798 4 250
0.6960658268921616 4 500
0.6957352070995816 5 100
0.6995223652927864 5 250
0.6988551559022924 5 500


In [None]:
# manually set to whatever gets the best reponse 

In [45]:
max(cv_scores, key=lambda x: x["score"])

{'score': 0.6995223652927864, 'max_depth': 5, 'n_estimators': 250}

In [49]:
# use different parameters to check what best version of the model is 
max_depth = [3, 4, 5]
num_trees = [100, 250, 500]
cv_scores = []

for depth in max_depth:
    for tree in num_trees:
        pipe.steps[1][1].set_params(n_estimators=tree, max_depth=depth)
        pipe.fit(X_train, y_train)
        val_score = pipe.score(X_val, y_val)
        cv_dict = {"score": val_score, "max_depth": depth, "n_estimators": tree}
        cv_scores.append(cv_dict)
        
max_params = max(cv_scores, key=lambda x: x["score"])
pipe.steps[1][1].set_params(max_depth=max_params["max_depth"], n_estimators=max_params["n_estimators"])
pipe.fit(X_train, y_train)

Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['season', 'weather'],
                                mapping=[{'col': 'season',
                                          'data_type': dtype('O'),
                                          'mapping': Spring    1
Summer    2
Fall      3
Winter    4
NaN      -2
dtype: int64},
                                         {'col': 'weather',
                                          'data_type': dtype('O'),
                                          'mapping': Clear Skies          1
Partly Cloudy        2
Light Storms/Rain    3
Heavy Storms/Rain    4
NaN                 -2
dtype: int64}])),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(max_depth=5, n_estimators=250))])

In [50]:
# combine the training and validation sets after best version of model found
# so that using all the data available on best version of model
X_train = pd.concat([X_train, X_val])

In [51]:
y_train = pd.concat([y_train, y_val])

In [52]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['season', 'weather'],
                                mapping=[{'col': 'season',
                                          'data_type': dtype('O'),
                                          'mapping': Spring    1
Summer    2
Fall      3
Winter    4
NaN      -2
dtype: int64},
                                         {'col': 'weather',
                                          'data_type': dtype('O'),
                                          'mapping': Clear Skies          1
Partly Cloudy        2
Light Storms/Rain    3
Heavy Storms/Rain    4
NaN                 -2
dtype: int64}])),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(max_depth=5, n_estimators=250))])

In [53]:
# score at the end on the test set
pipe.score(X_test, y_test)

0.6824405912186066