In [93]:
import pandas as pd
import numpy as np
import warnings
from path_utils import sales_fix_path, test_fix_path, final_data_path, save_final_to
import catboost as cb
from src.features.build_features import FeatureExtraction
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, r2_score
import time
from src.features.utils import asMinutes

# NOTE(review): this suppresses every warning category for the rest of the
# session, so deprecation and data-quality warnings raised by later cells
# will not be shown.
warnings.filterwarnings("ignore")

In [112]:
# Model configuration shared by the training cells below.
# Every input column is also declared categorical for CatBoost.
in_features = ['date_block_num', 'shop_id', 'item_id']
cat_cols = list(in_features)  # independent list with the same three columns
target = ['item_cnt_month']
n_fold = 33  # fold count referenced only by the commented-out CV cells

In [94]:
# Read the sales and test CSVs from the paths exported by `path_utils`.
train, test = (pd.read_csv(csv_path) for csv_path in (sales_fix_path, test_fix_path))

In [95]:
# Inspect the raw sales frame just loaded (one row per dated sale record,
# with price, daily count and revenue columns).
train

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,revenue
0,02.01.2013,0,59,22154,999.00,1.0,999.00
1,03.01.2013,0,25,2552,899.00,1.0,899.00
2,06.01.2013,0,25,2554,1709.05,1.0,1709.05
3,15.01.2013,0,25,2555,1099.00,1.0,1099.00
4,10.01.2013,0,25,2564,349.00,1.0,349.00
...,...,...,...,...,...,...,...
2564451,10.10.2015,33,25,7409,299.00,1.0,299.00
2564452,09.10.2015,33,25,7460,299.00,1.0,299.00
2564453,14.10.2015,33,25,7459,349.00,1.0,349.00
2564454,22.10.2015,33,25,7440,299.00,1.0,299.00


In [96]:
# Feature-extraction helper (src.features.build_features) for the baseline run.
fe_simple = FeatureExtraction()

In [97]:
# Baseline run: build the final data set without the `make_big` option.
fe_simple.create_final_data(train=train, test=test, make_big=False)

In [98]:
# Fetch the assembled frame; it contains the monthly history together with
# the `date_block_num == 34` rows that are split off as the test set next.
df = fe_simple.get_data()

In [99]:
# Block 34 is treated as the test set; every other block is training history.
is_test_block = df['date_block_num'].isin([34])
train = df[~is_test_block]
test = df[is_test_block]

In [101]:
# Inspect the monthly training frame (blocks other than 34), with the
# `item_cnt_month` target and `item_revenue_month` columns.
train

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_revenue_month
0,0,2,27,1.0,2499.0
1,0,2,33,1.0,499.0
2,0,2,317,1.0,299.0
3,0,2,438,1.0,299.0
4,0,2,471,2.0,798.0
...,...,...,...,...,...
1527272,33,59,22087,1.0,119.0
1527273,33,59,22088,2.0,238.0
1527274,33,59,22091,1.0,179.0
1527275,33,59,22100,1.0,629.0


In [103]:
# tscv = TimeSeriesSplit(n_splits=n_fold, test_size=1, gap=0)

In [104]:
# NOTE(review): disabled experiment. This is a walk-forward cross-validation
# loop that fits one CatBoost model per fold and prints R2 / RMSE and timings.
# It depends on the `tscv` splitter and the `train_cv_model()` call, which are
# commented out as well; re-enable all three cells together.
# def train_cv_model():
#     template = 'Fold: [{:2} out of {:2}]\tR2-score: [{:3.3f}]\tRMSE: [{:3.3f}]\tIter time: [{:5}]\tTotal time: [{:5}]'
# 
#     start_time = time.time()
# 
#     for idx, (train_index, val_index) in enumerate(tscv.split(train['date_block_num'].unique())):
#         start_iter = time.time()
#         train_df = train[train['date_block_num'].isin(train_index)]
#         val_df = train[train['date_block_num'].isin(val_index)]
# 
#         train_data = cb.Pool(train_df[in_features], train_df[target], cat_features=cat_cols)
#         val_data = cb.Pool(val_df[in_features], val_df[target], cat_features=cat_cols)
# 
#         model = cb.CatBoostRegressor(cat_features=cat_cols, task_type="GPU", random_seed=42)
#         model.fit(train_data, eval_set=val_data, use_best_model=True, verbose=False)
#         preds = model.predict(val_df[in_features])
# 
#         rmse = (np.sqrt(mean_squared_error(val_df[target], preds)))
#         r2 = r2_score(val_df[target], preds)
# 
#         print(template.format(idx + 1, n_fold, r2, rmse, asMinutes(time.time() - start_iter),
#                               asMinutes(time.time() - start_time)))

In [105]:
# train_cv_model()

In [106]:
def train_model(train: pd.DataFrame, test: pd.DataFrame, val_block: int = 33) -> np.ndarray:
    """Fit a CatBoost regressor on the monthly data and predict the test rows.

    Rows whose ``date_block_num`` is below ``val_block`` are used for fitting.
    Rows of block ``val_block`` form the evaluation set that drives
    best-iteration selection and early stopping (50 rounds).

    The column lists ``in_features``, ``target`` and ``cat_cols`` are read from
    the notebook's configuration cell.

    Args:
        train: Frame containing the ``in_features`` columns and the ``target``
            column.
        test: Frame containing at least the ``in_features`` columns.
        val_block: ``date_block_num`` held out for validation. Defaults to 33,
            the block this notebook has always validated on.

    Returns:
        The output of ``CatBoostRegressor.predict`` for ``test`` (a NumPy
        array with one prediction per test row).

    Raises:
        ValueError: If there are no rows before ``val_block`` or no rows in
            ``val_block``, since the model could not be fitted or validated.
    """
    # Build each subset once instead of re-filtering for features and labels.
    blocks = train['date_block_num']
    fit_rows = train[blocks < val_block]
    val_rows = train[blocks == val_block]
    if fit_rows.empty or val_rows.empty:
        raise ValueError(
            f"train_model needs rows both before and at date_block_num == {val_block}"
        )

    train_data = cb.Pool(fit_rows[in_features], fit_rows[target], cat_features=cat_cols)
    val_data = cb.Pool(val_rows[in_features], val_rows[target], cat_features=cat_cols)

    # NOTE: training is pinned to the GPU; a CUDA-capable device is required.
    model = cb.CatBoostRegressor(cat_features=cat_cols, task_type="GPU", random_seed=42)
    model.fit(train_data, eval_set=val_data, use_best_model=True, verbose=True, early_stopping_rounds=50)
    return model.predict(test[in_features])


In [ ]:
# Fit on the baseline monthly frame and predict the block-34 rows
# (`train_model` trains with task_type="GPU").
preds = train_model(train, test)

In [75]:
# Wrap the predictions in a single-column frame; its default integer index
# is written out as the "ID" column when the frame is saved.
kaggle = pd.DataFrame({'item_cnt_month': preds})

In [76]:
# Write the baseline submission; the frame index becomes the "ID" column.
submission_path = save_final_to + 'kaggle' + '.csv'
kaggle.to_csv(submission_path, index=True, index_label="ID")



**Baseline: without any feature-extraction methods and without the `make_big` option (which includes all possible combinations of `date_block_num`, `shop_id`, and `item_id`), the public Kaggle score is 1.52 — surprisingly poor.**


<img src="../data/images/kaggle_score_1.png">


In [114]:
# Reload the raw CSVs: `train` and `test` were rebound to the derived monthly
# frames earlier in the notebook, so the second run starts from the files again.
train, test = (pd.read_csv(csv_path) for csv_path in (sales_fix_path, test_fix_path))

In [115]:
# Separate feature-extraction helper for the second run, which enables `make_big`.
fe_with_comb = FeatureExtraction()

In [116]:
# make_big=True: per the notes in this notebook, this includes all possible
# combinations of date_block_num, shop_id and item_id in the final data.
fe_with_comb.create_final_data(train=train, test=test, make_big=True)

In [117]:
# Fetch the assembled frame for the `make_big` run (rebinds `df` from the baseline run).
df = fe_with_comb.get_data()

In [118]:
# Same split as the baseline run: block 34 is the test set, the rest is training data.
is_test_block = df['date_block_num'].isin([34])
train = df[~is_test_block]
test = df[is_test_block]

In [ ]:
# Retrain on the `make_big` frame and predict its block-34 rows.
preds = train_model(train, test)

In [ ]:
# Wrap the second run's predictions in a single-column submission frame.
kaggle = pd.DataFrame({'item_cnt_month':preds})

In [ ]:
# Save under a distinct name: the baseline run above already writes
# 'kaggle.csv', and reusing that name would silently overwrite its submission.
kaggle.to_csv(save_final_to + 'kaggle_make_big' + '.csv',
              index=True, index_label="ID")



**Still without feature-extraction methods, but with the `make_big` option enabled (all possible combinations of `date_block_num`, `shop_id`, and `item_id` are included), the public Kaggle score is 1.16 — a surprisingly large change from 1.52.**


<img src="../data/images/kaggle_score_2.png">
