# Gradient boosting machine

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RandomizedSearchCV
import pickle
import os
import scipy.stats as st

In [2]:
# Short identifier for this model; used to name the pickled model file.
model_name = "gbm"
# Location of the AML share directory, taken from the environment rather than hardcoded.
aml_dir = os.environ['AZUREML_NATIVE_SHARE_DIRECTORY']
# os.path.join adds a separator only when aml_dir does not already end with one,
# so the path is valid whether or not the variable has a trailing slash.
train = pd.read_csv(os.path.join(aml_dir, 'nyc_demand_train.csv'), parse_dates=['timeStamp'])

Create a model pipeline that:
- uses randomised search with time series cross validation to find the best hyperparameters
- trains the best model on all training data

In [3]:
# Feature matrix: every column except the target and the timestamp.
non_feature_cols = ['demand', 'timeStamp']
X = train.drop(non_feature_cols, axis=1)

In [4]:
# Seed the booster: it draws random row/feature subsets whenever subsample < 1
# or max_features < n_features, so without a seed repeated fits differ.
regr = GradientBoostingRegressor(random_state=42)
# Time-ordered folds: each validation fold comes after the data it was trained on.
tscv = TimeSeriesSplit(n_splits=3)

In [5]:
# Search space for the randomised hyperparameter search.
# Note scipy's conventions: uniform(loc, scale) samples from [loc, loc + scale]
# and randint(low, high) samples integers from [low, high).
param_dist = {'n_estimators': st.randint(3, 100),
                'learning_rate': st.uniform(0.01, 0.1),    # i.e. [0.01, 0.11]
                'max_depth': range(2,31),
                'min_samples_leaf': st.randint(1, 100),
                'min_samples_split': st.randint(2, 50),
                'max_features': range(3,X.shape[1]+1),     # 3 up to all features
                'subsample': st.uniform(0.1, 0.9)          # i.e. [0.1, 1.0]
             }
# return_train_score is set explicitly because newer scikit-learn versions
# default it to False, which would drop the *_train_score columns from cv_results_.
# random_state makes the sampled candidates reproducible between runs.
regr_cv = RandomizedSearchCV(estimator=regr,
            param_distributions=param_dist,
            n_iter=50,
            cv=tscv,
            scoring='neg_mean_squared_error',
            return_train_score=True,
            random_state=42,
            verbose=2,
            n_jobs=-1)
# Single-step pipeline; the search object is reachable as named_steps['regr_cv'].
regr_pipe = Pipeline([('regr_cv', regr_cv)])
regr_pipe.fit(X, y=train['demand'])

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  8.7min finished


Pipeline(memory=None,
     steps=[('regr_cv', RandomizedSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=3),
          error_score='raise',
          estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
         ...it=True,
          return_train_score=True, scoring='neg_mean_squared_error',
          verbose=2))])

Save the fitted pipeline to the AML shared directory

In [6]:
# Build the output path with os.path.join so it is valid whether or not
# aml_dir ends with a path separator.
model_path = os.path.join(aml_dir, model_name + '.pkl')
# Pickle the whole fitted pipeline (including the search object) in binary mode.
with open(model_path, 'wb') as f:
    pickle.dump(regr_pipe, f)

### Cross validation results

In [7]:
# Tabulate one row per sampled candidate, ordered by test-score rank (rank 1 first).
search = regr_pipe.named_steps['regr_cv']
cv_results = pd.DataFrame(search.cv_results_).sort_values(by='rank_test_score')
cv_results.head()

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_learning_rate,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,...,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
3,52.919514,0.121167,-3092.713388,-537.026169,0.100833,29,16,15,3,54,...,-4493.635259,-690.137933,-2365.938963,-503.145237,-2418.565942,-417.795337,40.145416,0.022169,990.834317,113.73525
9,33.897009,0.134835,-3149.966303,-668.323934,0.107246,13,12,20,28,88,...,-4527.739103,-790.583881,-2499.033277,-649.614421,-2423.12653,-564.773501,18.379932,0.022407,974.725217,93.131149
11,110.824195,0.1865,-3479.907006,-279.774457,0.0665966,16,14,5,15,66,...,-4884.156237,-307.045679,-2748.994289,-276.042264,-2806.570494,-256.235429,78.53024,0.015346,993.232326,20.910401
35,10.83367,0.075667,-4350.17484,-1792.096751,0.0927124,7,7,9,6,98,...,-5798.827948,-1520.454164,-3750.812713,-1903.083424,-3500.883859,-1952.752667,5.006992,0.015547,1029.421513,193.147664
40,21.595172,0.085834,-4520.704301,-3195.010351,0.0907159,11,16,79,33,83,...,-7575.178363,-4336.882875,-3149.479692,-2955.237899,-2837.45485,-2292.910277,12.142449,0.022396,2163.592494,851.498333
