In [None]:
import numpy as np
import pandas as pd 
import os

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from catboost import CatBoostRegressor
import lightgbm as lgb

In [None]:
# Load the prepared (already merged / feature-engineered) dataset.
# `__file__` does not exist inside a Jupyter kernel, so fall back to the current
# working directory there; when executed as a plain script the file's own
# directory is used, exactly as before.
_base_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in globals() else os.getcwd()
# NOTE: unpickling executes arbitrary code -- only load pickles from a trusted source.
train_merge = pd.read_pickle(os.path.join(_base_dir, '..', 'data', 'train_merge.pkl'))
train_merge.info()

In [None]:
# Time-based split on `date_block_num`:
#   blocks < 33  -> training set
#   block == 33  -> validation set
#   block == 34  -> test set (only the features are kept)
def _features_and_target(row_mask):
    """Return (features, target) for the rows of `train_merge` selected by `row_mask`."""
    rows = train_merge[row_mask]
    return rows.drop(columns='item_cnt_month'), rows['item_cnt_month']


_month = train_merge['date_block_num']
X_train, y_train = _features_and_target(_month < 33)
X_valid, y_valid = _features_and_target(_month == 33)
X_test = _features_and_target(_month == 34)[0]

# Report the shape of every split, one blank line between entries.
print(
    *(
        'Shape {}: {}'.format(label, part.shape)
        for label, part in (
            ('X_train', X_train),
            ('y_train', y_train),
            ('X_valid', X_valid),
            ('y_valid', y_valid),
            ('X_test', X_test),
        )
    ),
    sep='\n\n'
)

In [None]:
# CatBoost: base estimator and search space for the randomized hyper-parameter search.
# Columns handed to CatBoost as categorical features.
cat_features = ['country_part',
                'item_category_common',
                'item_category_id',
                'city_code']

# Fixed settings shared by every candidate; the tunable ones live in `parameters_cb`.
# NOTE(review): `early_stopping_rounds` presumably needs an eval_set at fit time to
# have any effect -- confirm against how the model is fitted.
catboost = CatBoostRegressor(random_state=1,
                             iterations=2000, verbose=200,
                             loss_function='RMSE', eval_metric='RMSE',
                             task_type='GPU', early_stopping_rounds=30,
                             grow_policy='Lossguide', bootstrap_type='Poisson',
                             cat_features=cat_features)

# Candidate values sampled by the search (the closing brace was missing in the original).
parameters_cb = {'learning_rate': [0.01, 0.05, 0.1],
                 'depth': [4, 8, 10],
                 'l2_leaf_reg': [3, 7, 9],
                 'subsample': [0.2, 0.7, 1],
                 'max_leaves': [31, 1023, 2047],
                 'min_data_in_leaf': [1, 5, 10, 15]}

In [None]:
# Randomized search over the CatBoost space: 3-fold CV, scored by negative MSE.
search_cb = RandomizedSearchCV(
    estimator=catboost,
    param_distributions=parameters_cb,
    scoring='neg_mean_squared_error',
    cv=3,
    random_state=1,
)
search_cb.fit(X_train, y_train)

# Keep the estimator built from the best-scoring candidate.
best_cb = search_cb.best_estimator_

In [None]:
# Display the parameters of the best CatBoost estimator found by the search.
best_cb.get_params()

Get parameters:
{'iterations': 2000, 
 'learning_rate': 0.01,
 'depth': 4,
 'l2_leaf_reg': 7,
 'loss_function': 'RMSE',
 'verbose': 200,
 'eval_metric': 'RMSE',
 'task_type': 'GPU',
 'bootstrap_type': 'Poisson',
 'subsample': 0.7,
 'random_state': 1,
 'early_stopping_rounds': 30,
 'cat_features': ['country_part',
  'item_category_common',
  'item_category_id',
  'city_code'],
 'grow_policy': 'Lossguide',
 'min_data_in_leaf': 1,
 'max_leaves': 2047}

In [None]:
# LightGBM: datasets, base estimator and search space for the randomized search.
# Native LightGBM datasets built from the train / validation splits.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_valid = lgb.Dataset(X_valid, y_valid)

# `subsample` (bagging_fraction) is only applied when bagging is enabled through a
# non-zero `subsample_freq`; without it the `subsample` values below are ignored.
lightgbm = lgb.LGBMRegressor(random_state=1, objective='rmse', metric='rmse',
                             subsample_freq=1)

# Candidate values sampled by the search.
# The tree-depth parameter is called `max_depth` in LightGBM; the original key
# `depth` is not a recognised parameter, so that dimension was never tuned.
parameters_lgb = {'learning_rate': [0.01, 0.05, 0.1],
                  'max_depth': [4, 8, 10],
                  'reg_lambda': [0, 0.5, 9],
                  'subsample': [0.2, 0.7, 1],
                  'num_leaves': [31, 511, 1023, 2047],
                  'min_child_samples': [10, 20, 30],
                  'n_estimators': range(50, 301, 50)
                  }

In [None]:
# Randomized search over the LightGBM space: 3-fold CV, scored by negative MSE.
search_lgb = RandomizedSearchCV(
    estimator=lightgbm,
    param_distributions=parameters_lgb,
    scoring='neg_mean_squared_error',
    cv=3,
    random_state=1,
)
search_lgb.fit(X_train, y_train)

# Keep the estimator built from the best-scoring candidate.
best_lgb = search_lgb.best_estimator_

# Parameter dictionary in LightGBM's native naming (RMSE objective and metric,
# fixed seed, early stopping after 30 rounds).
params = dict(
    objective='rmse',
    metric='rmse',
    num_leaves=1023,
    min_data_in_leaf=10,
    feature_fraction=0.7,
    learning_rate=0.01,
    num_rounds=2000,
    early_stopping_rounds=30,
    seed=1,
)

In [None]:
# Random forest: base estimator and search space for the randomized search.
rf_rs = RandomForestRegressor(random_state=1)

# Candidate values sampled by the search.
# - `min_samples_split` must be an integer >= 2; the original range started at 1,
#   which makes every candidate drawing it fail to fit. The lowest value is now 2.
# - `max_features='auto'` was removed from scikit-learn; for regressors it meant
#   "use all features", which is spelled 1.0.
parameters_rf = {'n_estimators': range(25, 126, 25),
                 'max_depth': range(4, 11),
                 'min_samples_split': [2, 3, 5, 7, 9, 11],
                 'min_samples_leaf': range(1, 12, 2),
                 'max_features': [1.0, 'log2', 'sqrt']}

In [None]:
# Randomized search over the random-forest space: 3-fold CV, scored by negative
# MSE, using all available cores.
search_rf = RandomizedSearchCV(
    estimator=rf_rs,
    param_distributions=parameters_rf,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    random_state=1,
)
search_rf.fit(X_train, y_train)

# Keep the estimator built from the best-scoring candidate.
best_rf = search_rf.best_estimator_

RandomForestRegressor(max_depth=10, max_features='sqrt', min_samples_leaf=7,
                      min_samples_split=11, n_estimators=75, random_state=1)