# Gradient boosting machine

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RandomizedSearchCV
import pickle
import os
import scipy.stats as st

In [2]:
# Base name of the pickle file the fitted pipeline is saved under.
model_name = "gbm"
# Shared directory holding the training data; raises KeyError if the
# environment variable is not set.
aml_dir = os.environ['AZUREML_NATIVE_SHARE_DIRECTORY']
# os.path.join supplies the path separator when aml_dir lacks a trailing one,
# whereas plain string concatenation would silently build a wrong file name.
train = pd.read_csv(os.path.join(aml_dir, 'nyc_demand_train.csv'), parse_dates=['timeStamp'])

Create a model pipeline that:
- uses randomised search to find the optimal hyperparameters
- trains the best model on all of the training data

Note: this model will take about 5 minutes to train.

In [3]:
# Feature matrix: every column except the target ('demand') and the timestamp.
non_feature_columns = ['demand', 'timeStamp']
X = train.drop(labels=non_feature_columns, axis='columns')

In [4]:
# Base estimator whose hyperparameters are tuned by the randomised search.
# A fixed seed makes the stochastic parts of boosting (row and feature
# subsampling) repeatable across kernel restarts.
regr = GradientBoostingRegressor(random_state=42)
# Time-ordered cross-validation: each split validates on observations that come
# after its training window, so no fold is scored on data preceding its training set.
tscv = TimeSeriesSplit(n_splits=3)

In [5]:
# Search space for the randomised search. scipy distributions are sampled via
# their rvs() method; plain ranges are sampled uniformly from their elements.
# scipy conventions worth remembering:
#   - st.randint(low, high) excludes `high`
#   - st.uniform(loc, scale) covers [loc, loc + scale]
param_dist = {'n_estimators': st.randint(3, 100),          # integers 3..99
                'learning_rate': st.uniform(0.01, 0.1),    # floats in [0.01, 0.11]
                'max_depth': range(2,31),                  # integers 2..30
                'min_samples_leaf': st.randint(1, 100),    # integers 1..99
                'min_samples_split': st.randint(2, 50),    # integers 2..49
                # 3..number of feature columns; the range is empty if X has fewer than 3 columns
                'max_features': range(3,X.shape[1]+1),
                'subsample': st.uniform(0.1, 0.9)          # floats in [0.1, 1.0]
             }
# Evaluate 50 sampled candidates with the time-series splitter, scoring by
# negated MSE (closer to zero is better). random_state fixes which candidates
# are drawn so that a Restart & Run All reproduces the same search.
regr_cv = RandomizedSearchCV(estimator=regr,
            param_distributions=param_dist,
            n_iter=50,
            cv=tscv,
            scoring='neg_mean_squared_error',
            verbose=2,
            n_jobs=-1,
            random_state=42)
# Single-step pipeline wrapping the search; the step name 'regr_cv' is how the
# fitted search object is retrieved from the pipeline afterwards.
regr_pipe = Pipeline([('regr_cv', regr_cv)])
# Runs the full search; with scikit-learn's default refit behaviour the best
# candidate is then retrained on all of X.
regr_pipe.fit(X, y=train['demand'])

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   48.5s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  3.4min finished


Pipeline(memory=None,
     steps=[('regr_cv', RandomizedSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=3),
          error_score='raise',
          estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
         ...it=True,
          return_train_score=True, scoring='neg_mean_squared_error',
          verbose=2))])

Save model to AML shared directory

In [6]:
# Persist the fitted pipeline (including the search object it wraps) to the
# shared directory. os.path.join keeps the path valid whether or not aml_dir
# ends with a path separator.
model_path = os.path.join(aml_dir, model_name + '.pkl')
with open(model_path, 'wb') as f:
    pickle.dump(regr_pipe, f)

### Cross validation results

In [7]:
# Tabulate every candidate evaluated by the search, best-ranked first,
# and display the top five rows.
fitted_search = regr_pipe.named_steps['regr_cv']
cv_results = pd.DataFrame(fitted_search.cv_results_).sort_values(by='rank_test_score')
cv_results.head()

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_learning_rate,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,...,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
8,17.551571,0.080168,-3321.621385,-982.386683,0.0817779,17,9,11,27,53,...,-4822.691439,-1177.051035,-2506.902267,-957.498225,-2635.270449,-812.610789,11.593208,0.015107,1062.709769,149.819335
21,12.326001,0.094168,-4024.647433,-2426.180801,0.0752828,20,9,51,6,87,...,-6689.73485,-3339.789908,-2857.963883,-2210.081344,-2526.243565,-1728.671152,7.471568,0.021067,1889.361056,675.25311
30,7.285266,0.090835,-4182.679505,-2347.43478,0.0850228,25,7,15,40,83,...,-6428.663049,-3076.716039,-3066.970635,-2193.719562,-3052.404831,-1771.86874,4.619184,0.005391,1588.161327,543.677536
33,12.52967,0.054338,-5177.180225,-3842.471151,0.0774501,16,14,58,2,47,...,-8014.074809,-4941.198926,-3704.82111,-3594.551712,-3812.644756,-2991.662815,7.686697,0.010523,2006.470308,814.972719
15,12.243997,0.066834,-5303.148307,-4755.885933,0.0526293,15,14,88,32,88,...,-9392.287437,-6793.790364,-3392.367549,-4224.884666,-3124.789936,-3248.982768,7.443633,0.011849,2893.520758,1495.077921
