In [142]:
import pandas as pd
import numpy as np
import warnings
from path_utils import sales_fix_path, test_fix_path
import catboost as cb
from src.features.build_features import create_final_data
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, r2_score
import time
import math

# Suppress every warning category for the rest of the kernel session.
# NOTE(review): a blanket "ignore" also hides deprecation/dtype warnings that may matter — consider narrowing it.
warnings.filterwarnings("ignore")

In [143]:
# Load the two CSV files referenced by sales_fix_path and test_fix_path (in that order).
train, test = (pd.read_csv(csv_path) for csv_path in (sales_fix_path, test_fix_path))

In [144]:
# Preview the loaded frame; pandas truncates the rendering to the first and last rows.
train

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,revenue
0,02.01.2013,0,59,22154,999.00,1.0,999.00
1,03.01.2013,0,25,2552,899.00,1.0,899.00
2,06.01.2013,0,25,2554,1709.05,1.0,1709.05
3,15.01.2013,0,25,2555,1099.00,1.0,1099.00
4,10.01.2013,0,25,2564,349.00,1.0,349.00
...,...,...,...,...,...,...,...
2583016,10.10.2015,33,25,7409,299.00,1.0,299.00
2583017,09.10.2015,33,25,7460,299.00,1.0,299.00
2583018,14.10.2015,33,25,7459,349.00,1.0,349.00
2583019,22.10.2015,33,25,7440,299.00,1.0,299.00


In [145]:
# Build the modelling table from both loaded frames via the project helper.
# The result carries a date_block_num column, which the following cell uses to split train from test.
df = create_final_data(train, test)

In [146]:
# Rows whose date_block_num equals 34 form the test frame; every other row is training data.
# Note that this rebinds the names `train` and `test` used for the raw CSV frames above.
is_block_34 = df['date_block_num'] == 34
train = df[~is_block_34]
test = df[is_block_34]

In [147]:
# Inspect the training split (all rows of df except date_block_num 34).
train

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_revenue_month
0,0,2,27,1.0,2499.0
1,0,2,33,1.0,499.0
2,0,2,317,1.0,299.0
3,0,2,438,1.0,299.0
4,0,2,471,2.0,798.0
...,...,...,...,...,...
1536102,33,59,22087,1.0,119.0
1536103,33,59,22088,2.0,238.0
1536104,33,59,22091,1.0,179.0
1536105,33,59,22100,1.0,629.0


In [175]:
# Columns fed to the model.
# NOTE(review): item_revenue_month sits on the same row as the item_cnt_month target — confirm it
# does not leak the target and that it is actually available for the test block.
in_features = ['date_block_num', 'shop_id', 'item_id', 'item_revenue_month']
# Regression target; kept as a one-element list, so selecting it yields a single-column DataFrame.
target = ['item_cnt_month']
# Subset of in_features that is passed to CatBoost as categorical features.
cat_cols = ['date_block_num', 'shop_id', 'item_id']
# Number of TimeSeriesSplit folds; also printed as the fold total by train_model.
n_fold = 33

In [176]:
# Time-ordered splitter: every fold holds out exactly one sample (test_size=1) directly after
# the training part (gap=0). train_model applies it to the array of unique date_block_num values,
# so one "sample" here corresponds to one date_block_num value, not to one data row.
tscv = TimeSeriesSplit(n_splits=n_fold, test_size=1, gap=0)

In [178]:

def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``.

    The duration is rounded to whole seconds *before* it is split into
    minutes and seconds, so the seconds part always stays within ``0..59``.
    Rounding the remainder after the split could yield strings such as
    ``'0m 60s'`` for an input of 59.6 seconds.

    Args:
        s: Duration in seconds (int or float).

    Returns:
        str: The formatted duration, e.g. ``'1m 10s'``.
    """
    minutes, seconds = divmod(int(round(s)), 60)
    return '{}m {}s'.format(minutes, seconds)


def timeSince(since, percent):
    """Return the wall-clock time elapsed since ``since`` as a display string.

    Args:
        since: Start timestamp as returned by ``time.time()``.
        percent: Fraction of the work completed. Accepted for backward
            compatibility only; it does not influence the result.

    Returns:
        str: ``'Total time: [<m>m <s>s]'`` where the elapsed time is
        formatted by ``asMinutes``.
    """
    # Only the elapsed time is reported, so no remaining-time estimate is
    # computed and ``percent`` is never divided by (``percent == 0`` is safe).
    elapsed = time.time() - since
    return 'Total time: [{:5}]'.format(asMinutes(elapsed))

In [179]:
def train_model():
    """Cross-validate a CatBoost regressor over date blocks and print per-fold metrics.

    Relies on the notebook globals ``train``, ``tscv``, ``in_features``,
    ``target``, ``cat_cols`` and ``n_fold``. For every split produced by
    ``tscv`` a fresh model is fitted on the training blocks and scored on the
    held-out block; R2, RMSE, the fold duration and the total elapsed time are
    printed. Nothing is returned and the fitted models are discarded.
    """
    template = 'Fold: [{:2} out of {:2}]\tR2-score: [{:3.3f}]\tRMSE: [{:3.3f}]\tIter time: [{:5}]\t{}'

    # The splitter yields *positions* into the array it is given, not the
    # values themselves. Keep the sorted block ids so positions can be mapped
    # back to real date_block_num values even if they are not exactly 0..N-1.
    blocks = np.unique(train['date_block_num'])

    start_time = time.time()

    for idx, (train_pos, val_pos) in enumerate(tscv.split(blocks)):
        start_iter = time.time()
        train_df = train[train['date_block_num'].isin(blocks[train_pos])]
        val_df = train[train['date_block_num'].isin(blocks[val_pos])]

        train_data = cb.Pool(train_df[in_features], train_df[target], cat_features=cat_cols)
        val_data = cb.Pool(val_df[in_features], val_df[target], cat_features=cat_cols)

        model = cb.CatBoostRegressor(cat_features=cat_cols, task_type="GPU", random_seed=42)
        # NOTE(review): the validation fold also selects the best iteration
        # (use_best_model=True), so the scores printed below are optimistic.
        model.fit(train_data, eval_set=val_data, use_best_model=True, verbose=False)
        # Reuse the already-built validation Pool instead of converting the frame again.
        preds = model.predict(val_data)

        rmse = np.sqrt(mean_squared_error(val_df[target], preds))
        r2 = r2_score(val_df[target], preds)

        # Parenthesised so the completed fraction is (idx + 1) / n_fold,
        # not idx + (1 / n_fold).
        print(template.format(idx + 1, n_fold, r2, rmse, asMinutes(time.time() - start_iter),
                              timeSince(start_time, (idx + 1) / n_fold)))

In [180]:
# Run the cross-validation loop; one metrics line is printed as each fold finishes.
train_model()

Fold: [ 1 out of 33]	R2-score: [0.776]	RMSE: [0.782]	Iter time: [0m 10s]	Total time: [0m 10s]
Fold: [ 2 out of 33]	R2-score: [0.731]	RMSE: [0.951]	Iter time: [0m 15s]	Total time: [0m 25s]
Fold: [ 3 out of 33]	R2-score: [0.771]	RMSE: [0.768]	Iter time: [0m 20s]	Total time: [0m 44s]
Fold: [ 4 out of 33]	R2-score: [0.806]	RMSE: [0.703]	Iter time: [0m 26s]	Total time: [1m 10s]
Fold: [ 5 out of 33]	R2-score: [0.789]	RMSE: [0.750]	Iter time: [0m 31s]	Total time: [1m 42s]
Fold: [ 6 out of 33]	R2-score: [0.828]	RMSE: [0.653]	Iter time: [0m 39s]	Total time: [2m 21s]
Fold: [ 7 out of 33]	R2-score: [0.764]	RMSE: [0.830]	Iter time: [0m 46s]	Total time: [3m 7s]
Fold: [ 8 out of 33]	R2-score: [0.768]	RMSE: [0.811]	Iter time: [0m 50s]	Total time: [3m 57s]
Fold: [ 9 out of 33]	R2-score: [0.792]	RMSE: [0.777]	Iter time: [0m 57s]	Total time: [4m 53s]
Fold: [10 out of 33]	R2-score: [0.811]	RMSE: [0.722]	Iter time: [1m 3s]	Total time: [5m 57s]
Fold: [11 out of 33]	R2-score: [0.781]	RMSE: [0.950]	Iter time

KeyboardInterrupt: 