In [1]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [2]:
# Read the train and test splits from their CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../Data/train.csv', '../Data/test.csv')
)

In [8]:
# Remove the stock column from both splits.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone is a no-op instead of raising KeyError.
train = train.drop(columns=['udsStock'], errors='ignore')
test = test.drop(columns=['udsStock'], errors='ignore')

In [9]:
# Hyper-parameter grid explored by GridSearchCV for every SKU
# (3 depths x 3 ensemble sizes x 2 column-subsampling ratios = 18 candidates).
param_grid = {
    'max_depth': [5, 10, 20],
    'n_estimators': [100, 300, 500],
    'colsample_bytree': [0.5, 1],
}

In [10]:
# Empty results table: one row per SKU will hold the winning
# hyper-parameters and the corresponding cross-validated RMSE.
best_results = pd.DataFrame(
    columns=[
        'idSku',
        'max_depth',
        'n_estimators',
        'colsample_bytree',
        'rmse',
    ],
)

In [None]:
# Tune, refit and persist one XGBoost regressor per SKU.
#
# For each distinct idSku: run a grid search over `param_grid` with
# time-series cross-validation, record the best hyper-parameters and their
# CV RMSE, and pickle the model refit on all of that SKU's training rows.

# joblib.dump does not create missing directories; make sure the target
# exists before spending time on the search.
os.makedirs('../XGBR_models', exist_ok=True)

# The splitter holds no per-dataset state, so one instance serves every SKU.
# NOTE(review): TimeSeriesSplit splits by row position, so this assumes the
# rows of each SKU are already in chronological order -- TODO confirm.
tscv = TimeSeriesSplit()

# Collect one record per SKU and build the DataFrame once after the loop.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# is quadratic and silently upcasts the integer columns to float.
records = []

skus = train['idSku'].unique()
for s in skus:
    train_s = train[train['idSku'] == s]
    y_train = train_s['udsVenta']
    # Target and identifier columns are not used as features.
    X_train = train_s.drop(['udsVenta', 'idSku', 'idSecuencia'], axis=1)

    xgbr = xgb.XGBRegressor(verbosity=0)
    grid_search = GridSearchCV(xgbr, param_grid, cv=tscv, scoring='neg_root_mean_squared_error', n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    records.append({
        'idSku': s,
        'max_depth': best_params['max_depth'],
        'n_estimators': best_params['n_estimators'],
        'colsample_bytree': best_params['colsample_bytree'],
        # The scorer returns the negated RMSE; flip the sign back.
        'rmse': -grid_search.best_score_,
    })

    # GridSearchCV defaults to refit=True, so best_estimator_ is already an
    # XGBRegressor with the best parameters fitted on all of X_train;
    # fitting a second model by hand would only repeat that work.
    best_xgbr = grid_search.best_estimator_

    # save model
    path = '../XGBR_models/xgbr_' + str(s) + '.pkl'
    joblib.dump(best_xgbr, path)

# Rebuild the table from scratch (reusing the column layout defined earlier)
# so that re-running this cell does not duplicate rows.
best_results = pd.DataFrame(records, columns=best_results.columns)

In [12]:
# Preview the per-SKU tuning results (at most the first 50 rows).
best_results.head(n=50)

Unnamed: 0,idSku,max_depth,n_estimators,colsample_bytree,rmse
0,1.0,5.0,100.0,0.5,9.473333
1,2.0,20.0,100.0,0.5,9.810401
2,3.0,10.0,300.0,1.0,7.89132
3,4.0,10.0,100.0,1.0,7.28884
4,5.0,20.0,300.0,1.0,9.085682
5,6.0,5.0,300.0,1.0,7.879325
6,7.0,20.0,300.0,1.0,6.222731
7,8.0,20.0,100.0,0.5,5.546147
8,9.0,10.0,100.0,0.5,6.240968
9,10.0,5.0,300.0,0.5,5.00344


In [None]:
# Persist the per-SKU best hyper-parameters and RMSE as CSV (row index omitted).
best_results.to_csv(
    path_or_buf='../Data/XGBR_best_models.csv',
    index=False,
)