In [21]:
import pandas as pd
import numpy as np
import xgboost as xgb
import catboost as cb
import lightgbm as lgbm
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import math
import scipy.stats as st

In [2]:
train_rossman = pd.read_csv('/Users/cmcnamara/Downloads/Kaggle Rossman/train.csv')
store_rossman = pd.read_csv('/Users/cmcnamara/Downloads/Kaggle Rossman/store.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
train_rossman = pd.merge(train_rossman, store_rossman, on='Store', how='left').fillna(0)

In [4]:
train_rossman['StateHoliday'] = train_rossman['StateHoliday'].astype('category').cat.codes
train_rossman['StoreType'] = train_rossman['StoreType'].astype('category').cat.codes
train_rossman['Assortment'] = train_rossman['Assortment'].astype('category').cat.codes
train_rossman['Promo2SinceWeek'] = train_rossman['Promo2SinceWeek'].astype(int)
train_rossman['Promo2SinceYear'] = train_rossman['Promo2SinceYear'].astype(int)
train_rossman['CompeititonDistance'] = train_rossman['CompetitionDistance'].astype(int)
train_rossman['CompetitionOpenSinceMonth'] = train_rossman['CompetitionOpenSinceMonth'].astype(int)
train_rossman['CompetitionOpenSinceYear'] = train_rossman["CompetitionOpenSinceYear"].astype(int)
train_rossman['PromoInterval'] = train_rossman['PromoInterval'].astype('category').cat.codes
train_rossman['Date'] = pd.to_datetime(train_rossman['Date'])

In [5]:
test_rossman = train_rossman.iloc[np.where(train_rossman.Date.dt.year == 2015)[0], :]
train_rossman = train_rossman.iloc[np.where(train_rossman.Date.dt.year == 2014)[0], :]

In [6]:
train_rossman = train_rossman.iloc[::-1]
test_rossman = test_rossman.iloc[::-1]

Note that there are a few problems when comparing the different algorithms (xgboost, lightgbm, catboost).

* The first is that they take the input data in differently. While  xgboost and lightgbm have wrappers to handle pandas dataframes directly, catboost does not. Therefore, I will use their native data wrapper for each algorithm.
* The second is that they do not grow the trees the same.

Nonetheless, I will try to ensure that each algorithm is as close to one another as possible for maximum accuracy."

In [7]:
train_y = train_rossman['Sales']
train_x = train_rossman.drop(['Sales', 'Date'], axis=1)
params = {"max_depth": [5,10, 15],
         "learning_rate": [0.01, 0.05, 0.1],
         "n_estimators": [100, 250]}
params_cb = {'depth': [5, 10, 15],
            'learning_rate': [0.01, 0.05, 0.1],
            'iterations': [100, 250]}
test_y = test_rossman['Sales']
test_x = test_rossman.drop(['Sales', 'Date'], axis=1)
n_splits = 3
max_train_size = len(train_x) // (n_splits+1)

In [8]:
train_x.shape

(373855, 17)

In [9]:
test_x.shape

(236380, 17)

# Grid Search

### xgboost

In [11]:
%%time
model_xg = xgb.XGBRegressor()
grid_search_xg = GridSearchCV(model_xg, param_grid=params, cv=TimeSeriesSplit(n_splits=n_splits, max_train_size=max_train_size), verbose=1, n_jobs=1)
grid_search_xg.fit(train_x, train_y)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed: 34.4min finished


CPU times: user 35min 56s, sys: 15.7 s, total: 36min 12s
Wall time: 38min 1s


### lightgbm

In [12]:
%%time
model_lg = lgbm.LGBMRegressor()
grid_search_lg = GridSearchCV(model_lg, param_grid=params, cv=TimeSeriesSplit(n_splits=n_splits, max_train_size=max_train_size), verbose=1, n_jobs=1)
grid_search_lg.fit(train_x, train_y)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:  4.0min finished


CPU times: user 3min 55s, sys: 4.46 s, total: 4min
Wall time: 4min 6s


### catboost

In [13]:
%%time
model_cb = cb.CatBoostRegressor(verbose=False)
grid_search_cb = GridSearchCV(model_cb, param_grid=params_cb, cv=TimeSeriesSplit(n_splits=n_splits, max_train_size=max_train_size), verbose=1, n_jobs=1)
grid_search_cb.fit(train_x, train_y)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed: 31.8min finished


CPU times: user 50min 41s, sys: 24min 51s, total: 1h 15min 32s
Wall time: 32min


## Predictions

In [23]:
predict_y_xg = grid_search_xg.predict(test_x)
predict_y_lg = grid_search_lg.predict(test_x)
predict_y_cb = grid_search_cb.predict(test_x)

print('XGBoost R2 Score: {}'.format(r2_score(test_y, predict_y_xg)))
print('LightGBM R2 Score: {}'.format(r2_score(test_y, predict_y_lg)))
print('CatBoost R2 Score: {}\n'.format(r2_score(test_y, predict_y_cb)))

print('XGBoost RMSE Score: {}'.format(math.sqrt(mean_squared_error(test_y, predict_y_xg))))
print('LightGBM RMSE Score: {}'.format(math.sqrt(mean_squared_error(test_y, predict_y_lg))))
print('CatBoost RMSE Score: {}\n'.format(math.sqrt(mean_squared_error(test_y, predict_y_cb))))

print('XGBoost MAE Score: {}'.format(mean_absolute_error(test_y, predict_y_xg)))
print('LightGBM MAE Score: {}'.format(mean_absolute_error(test_y, predict_y_lg)))
print('CatBoost MAE Score: {}\n'.format(mean_absolute_error(test_y, predict_y_cb)))

XGBoost R2 Score: 0.9845042161602857
LightGBM R2 Score: 0.9593680458739346
CatBoost R2 Score: 0.3924172059353608

XGBoost RMSE Score: 478.0346380049253
LightGBM RMSE Score: 774.0810688834093
CatBoost RMSE Score: 2993.335008461103

XGBoost MAE Score: 304.35978959594553
LightGBM MAE Score: 524.8717335935854
CatBoost MAE Score: 2274.958012699857



# Randomized Search

In [15]:
params = {'max_depth': st.randint(3, 16),
          'learning_rate': st.uniform(0.01, 0.4),
          'n_estimators': st.randint(100, 500)
         }

params_cb = {'depth': st.randint(3,16),
            'learning_rate': st.uniform(0.01, 0.4),
            'iterations': st.randint(100, 500)
            }

### xgboost

In [16]:
%%time
model_xg = xgb.XGBRegressor()
rand_search_xg = RandomizedSearchCV(model_xg, params, n_jobs=1, cv=TimeSeriesSplit(n_splits=n_splits, max_train_size=max_train_size), n_iter=5, verbose=1)
rand_search_xg.fit(train_x, train_y)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 24.8min finished


CPU times: user 27min 21s, sys: 11.2 s, total: 27min 32s
Wall time: 28min 42s


### lightgbm

In [17]:
%%time
model_lg = lgbm.LGBMRegressor()
rand_search_lg = RandomizedSearchCV(model_lg, params, n_jobs=1, cv=TimeSeriesSplit(n_splits=n_splits, max_train_size=max_train_size), n_iter=5, verbose=1)
rand_search_lg.fit(train_x, train_y)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  1.7min finished


CPU times: user 1min 46s, sys: 1.91 s, total: 1min 48s
Wall time: 1min 55s


### catboost

In [19]:
%%time
model_cb = cb.CatBoostRegressor(verbose=False)
rand_search_cb = RandomizedSearchCV(model_cb, params_cb, n_jobs=1, cv=TimeSeriesSplit(n_splits=n_splits, max_train_size=max_train_size), n_iter=5, verbose=1)
rand_search_cb.fit(train_x, train_y)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  9.0min finished


CPU times: user 20min 16s, sys: 5min 5s, total: 25min 22s
Wall time: 10min 42s


In [22]:
predict_y_xg = rand_search_xg.predict(test_x)
predict_y_lg = rand_search_lg.predict(test_x)
predict_y_cb = rand_search_cb.predict(test_x)

print('XGBoost R2 Score: {}'.format(r2_score(test_y, predict_y_xg)))
print('LightGBM R2 Score: {}'.format(r2_score(test_y, predict_y_lg)))
print('CatBoost R2 Score: {}\n'.format(r2_score(test_y, predict_y_cb)))

print('XGBoost RMSE Score: {}'.format(math.sqrt(mean_squared_error(test_y, predict_y_xg))))
print('LightGBM RMSE Score: {}'.format(math.sqrt(mean_squared_error(test_y, predict_y_lg))))
print('CatBoost RMSE Score: {}\n'.format(math.sqrt(mean_squared_error(test_y, predict_y_cb))))

print('XGBoost MAE Score: {}'.format(mean_absolute_error(test_y, predict_y_xg)))
print('LightGBM MAE Score: {}'.format(mean_absolute_error(test_y, predict_y_lg)))
print('CatBoost MAE Score: {}\n'.format(mean_absolute_error(test_y, predict_y_cb)))

XGBoost R2 Score: 0.9840236061052743
LightGBM R2 Score: 0.9719304142665414
CatBoost R2 Score: 0.963735923696809

XGBoost RMSE Score: 485.39128148570666
LightGBM RMSE Score: 643.3844988655625
CatBoost RMSE Score: 731.2921331651366

XGBoost MAE Score: 303.40760768053696
LightGBM MAE Score: 442.83291006743093
CatBoost MAE Score: 491.7908597475108



# Early Stopping

### xgboost

In [24]:
train_xg = xgb.DMatrix(train_x, train_y)
params_xg = {"objective": 'reg:linear',
             'eval_metric': 'rmse'
}
validation_xg = xgb.DMatrix(test_x, test_y)
watchlist  = [(train_xg,'train'),(validation_xg,'eval')]

In [27]:
%%time
model_xg = xgb.train(params_xg, train_xg, num_boost_round=9999, evals=watchlist, early_stopping_rounds=10, verbose_eval=10)

[0]	train-rmse:4987.38	eval-rmse:5045.6
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 10 rounds.
[10]	train-rmse:930.629	eval-rmse:989.363
[20]	train-rmse:803.924	eval-rmse:848.51
[30]	train-rmse:723.601	eval-rmse:769.458
[40]	train-rmse:662.337	eval-rmse:708.054
[50]	train-rmse:611.971	eval-rmse:661.209
[60]	train-rmse:576.525	eval-rmse:629.574
[70]	train-rmse:555.81	eval-rmse:610.509
[80]	train-rmse:535.494	eval-rmse:590.706
[90]	train-rmse:522.09	eval-rmse:579.964
[100]	train-rmse:506.527	eval-rmse:565.162
[110]	train-rmse:494.181	eval-rmse:555.372
[120]	train-rmse:482.999	eval-rmse:545.271
[130]	train-rmse:474.449	eval-rmse:538.903
[140]	train-rmse:467.323	eval-rmse:532.778
[150]	train-rmse:460.324	eval-rmse:528.181
[160]	train-rmse:454.31	eval-rmse:523.906
[170]	train-rmse:445.942	eval-rmse:517.821
[180]	train-rmse:440.297	eval-rmse:514.189
[190]	train-rmse:436.293	eval-rmse:512.824
[200]	train-r

### lightgbm

In [28]:
train_lg = lgbm.Dataset(train_x, train_y)
params_lg = {'task': 'train',
            'objective': 'regression',
            'metric': 'rmse'}
validation_lg = lgbm.Dataset(test_x, test_y, reference=train_lg)

In [29]:
%%time
model_lg = lgbm.train(params_lg, train_lg, valid_sets=validation_lg, num_boost_round=9999, early_stopping_rounds=10, verbose_eval=10)

Training until validation scores don't improve for 10 rounds.
[10]	valid_0's rmse: 3195.7
[20]	valid_0's rmse: 2650.39
[30]	valid_0's rmse: 2184.9
[40]	valid_0's rmse: 1805.55
[50]	valid_0's rmse: 1522.29
[60]	valid_0's rmse: 1366.67
[70]	valid_0's rmse: 1262.68
[80]	valid_0's rmse: 1192.87
[90]	valid_0's rmse: 1137.61
[100]	valid_0's rmse: 1090.43
[110]	valid_0's rmse: 1055.47
[120]	valid_0's rmse: 1025.11
[130]	valid_0's rmse: 994.572
[140]	valid_0's rmse: 970.561
[150]	valid_0's rmse: 950.721
[160]	valid_0's rmse: 929.36
[170]	valid_0's rmse: 907.945
[180]	valid_0's rmse: 891.272
[190]	valid_0's rmse: 872.588
[200]	valid_0's rmse: 853.214
[210]	valid_0's rmse: 838.696
[220]	valid_0's rmse: 825.127
[230]	valid_0's rmse: 810.659
[240]	valid_0's rmse: 798.012
[250]	valid_0's rmse: 781.202
[260]	valid_0's rmse: 765.121
[270]	valid_0's rmse: 751.779
[280]	valid_0's rmse: 736.216
[290]	valid_0's rmse: 725.488
[300]	valid_0's rmse: 711.786
[310]	valid_0's rmse: 699.94
[320]	valid_0's rmse:

### catboost

In [33]:
train_cb = cb.Pool(train_x, train_y)
params_cb = {'eval_metric': 'RMSE',
            'od_type': 'Iter',
             'iterations': 9999,
            'od_wait': 10,
            'use_best_model': True,
            'verbose': True,
            'metric_period': 10}
validation_cb = cb.Pool(test_x, test_y)

In [35]:
%%time
model_cb = cb.train(train_cb, params_cb, eval_set=validation_cb)

0:	learn: 6808.5689370	test: 6825.4965985	best: 6825.4965985 (0)	total: 171ms	remaining: 28m 30s
10:	learn: 5124.2334305	test: 5174.5227115	best: 5174.5227115 (10)	total: 1.54s	remaining: 23m 23s
20:	learn: 3900.9914280	test: 3973.3945981	best: 3973.3945981 (20)	total: 2.86s	remaining: 22m 37s
30:	learn: 3020.9756988	test: 3108.4337596	best: 3108.4337596 (30)	total: 4.22s	remaining: 22m 36s
40:	learn: 2396.1067380	test: 2493.1494135	best: 2493.1494135 (40)	total: 6.34s	remaining: 25m 39s
50:	learn: 1954.7600308	test: 2054.7789450	best: 2054.7789450 (50)	total: 8.34s	remaining: 27m 6s
60:	learn: 1653.1360472	test: 1749.7227026	best: 1749.7227026 (60)	total: 9.84s	remaining: 26m 43s
70:	learn: 1449.4264552	test: 1539.7236115	best: 1539.7236115 (70)	total: 11.3s	remaining: 26m 17s
80:	learn: 1314.5819511	test: 1395.6599879	best: 1395.6599879 (80)	total: 12.6s	remaining: 25m 45s
90:	learn: 1223.4455743	test: 1296.1249770	best: 1296.1249770 (90)	total: 14.2s	remaining: 25m 40s
100:	learn: 1

840:	learn: 747.0514349	test: 781.2266553	best: 781.2266553 (840)	total: 1m 57s	remaining: 21m 15s
850:	learn: 745.0495937	test: 779.1704437	best: 779.1704437 (850)	total: 1m 58s	remaining: 21m 13s
860:	learn: 742.7602236	test: 776.9232446	best: 776.9232446 (860)	total: 1m 59s	remaining: 21m 11s
870:	learn: 740.5004813	test: 774.7673216	best: 774.7673216 (870)	total: 2m 1s	remaining: 21m 9s
880:	learn: 738.1533387	test: 772.5196296	best: 772.5196296 (880)	total: 2m 2s	remaining: 21m 6s
890:	learn: 736.0227063	test: 770.3733704	best: 770.3733704 (890)	total: 2m 3s	remaining: 21m 4s
900:	learn: 733.8174695	test: 768.3759670	best: 768.3759670 (900)	total: 2m 5s	remaining: 21m 2s
910:	learn: 731.3765413	test: 766.0285132	best: 766.0285132 (910)	total: 2m 6s	remaining: 21m
920:	learn: 729.6430777	test: 764.3623789	best: 764.3623789 (920)	total: 2m 7s	remaining: 20m 58s
930:	learn: 727.7488258	test: 762.6307045	best: 762.6307045 (930)	total: 2m 8s	remaining: 20m 56s
940:	learn: 725.7466643	t

1660:	learn: 626.3721908	test: 663.4519778	best: 663.4519778 (1660)	total: 3m 47s	remaining: 19m 2s
1670:	learn: 625.3221724	test: 662.4582136	best: 662.4582136 (1670)	total: 3m 48s	remaining: 19m
1680:	learn: 624.3475536	test: 661.4701340	best: 661.4701340 (1680)	total: 3m 50s	remaining: 18m 59s
1690:	learn: 623.4834435	test: 660.6341630	best: 660.6341630 (1690)	total: 3m 51s	remaining: 18m 57s
1700:	learn: 622.3849876	test: 659.6037808	best: 659.6037808 (1700)	total: 3m 52s	remaining: 18m 55s
1710:	learn: 621.4149818	test: 658.5499723	best: 658.5499723 (1710)	total: 3m 54s	remaining: 18m 54s
1720:	learn: 620.7049178	test: 657.9651616	best: 657.9651616 (1720)	total: 3m 55s	remaining: 18m 52s
1730:	learn: 619.6190881	test: 656.8495542	best: 656.8495542 (1730)	total: 3m 56s	remaining: 18m 50s
1740:	learn: 618.7917295	test: 655.9342365	best: 655.9342365 (1740)	total: 3m 58s	remaining: 18m 49s
1750:	learn: 618.0497305	test: 655.2685423	best: 655.2685423 (1750)	total: 3m 59s	remaining: 18m

2480:	learn: 570.0863455	test: 610.7320685	best: 610.7320685 (2480)	total: 5m 35s	remaining: 16m 56s
2490:	learn: 569.6219031	test: 610.2968213	best: 610.2968213 (2490)	total: 5m 36s	remaining: 16m 54s
2500:	learn: 569.2565860	test: 609.9693176	best: 609.9693176 (2500)	total: 5m 38s	remaining: 16m 53s
2510:	learn: 568.8745443	test: 609.6826241	best: 609.6826241 (2510)	total: 5m 39s	remaining: 16m 51s
2520:	learn: 568.4778801	test: 609.4382844	best: 609.4382844 (2520)	total: 5m 40s	remaining: 16m 50s
2530:	learn: 568.0230529	test: 609.0676271	best: 609.0676271 (2530)	total: 5m 41s	remaining: 16m 49s
2540:	learn: 567.6371866	test: 608.6853816	best: 608.6853816 (2540)	total: 5m 43s	remaining: 16m 47s
2550:	learn: 567.1725244	test: 608.2679871	best: 608.2679871 (2550)	total: 5m 44s	remaining: 16m 46s
2560:	learn: 566.7631383	test: 607.8678522	best: 607.8678522 (2560)	total: 5m 45s	remaining: 16m 44s
2570:	learn: 566.0706135	test: 607.1799417	best: 607.1799417 (2570)	total: 5m 47s	remaining

3300:	learn: 538.4463637	test: 583.1018523	best: 583.1018523 (3300)	total: 7m 23s	remaining: 14m 59s
3310:	learn: 538.1911259	test: 582.9212773	best: 582.9212773 (3310)	total: 7m 24s	remaining: 14m 57s
3320:	learn: 537.9682579	test: 582.8819108	best: 582.8215136 (3312)	total: 7m 25s	remaining: 14m 56s
Stopped by overfitting detector  (10 iterations wait)

bestTest = 582.8215136
bestIteration = 3312

Shrink model to first 3313 iterations.
CPU times: user 18min 25s, sys: 1min 52s, total: 20min 18s
Wall time: 7min 33s
