In [None]:
#from google.colab import drive
#drive.mount('/content/drive/')

In [None]:
#!pip install catboost --upgrade

## Requirements

In [1]:
import numpy as np
import pandas as pd
import pickle

from catboost import Pool, CatBoostRegressor

## Params

In [2]:
SUBMISSION_TYPE = 'validation' # 'evaluation'
DATA_PATH = '../data/' # '/content/drive/My Drive/'

if SUBMISSION_TYPE == 'validation':
    END_TRAIN = 1913
    END_VALID = 1941
elif SUBMISSION_TYPE == 'evaluation':
    END_TRAIN = 1941
    END_VALID = 1969

## Load data

In [3]:
df = pd.read_pickle(DATA_PATH + '/refined/top_down_df.pkl')

# Split train & valid df

In [5]:
train_df = df[df['d'] <= END_TRAIN]
valid_df = df[(df['d'] > END_TRAIN) & (df['d'] <= END_VALID)]

del df

In [11]:
train_df['sales_ratio'].astype(float21)

0           0.000000
1           0.002012
2           0.000000
3           0.004025
4           0.000000
              ...   
33183743    0.000448
33183744    0.000000
33183745    0.000000
33183746    0.001344
33183747    0.000000
Name: sales_ratio, Length: 31522396, dtype: float16

In [8]:
train_df.describe()['sales_ratio']

count    3.152240e+07
mean              NaN
std      0.000000e+00
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      2.243042e-03
max      1.000000e+00
Name: sales_ratio, dtype: float64

## Prepare Pool params

In [None]:
def process_data(df):
    '''List data type (category vs numeric) & convert all non-category data to float32'''
    cf = []
    for col in df:
        if str(df[col].dtype) == 'category':
            cf.append(col)
        else:
            df[col] = df[col].astype(np.float32)
    return df, cf

In [None]:
data_train, cat_features_train = process_data(train_df.drop(columns=['id', 'sales', 'sales_ratio']))
data_valid, cat_features_valid = process_data(valid_df.drop(columns=['id', 'sales', 'sales_ratio']))

label_train = np.array(train_df['sales_ratio'], dtype=np.float32)

output_id = valid_df['id'].unique()

## Create Catboost Pools 

In [None]:
train_pool = Pool(
    data=data_train,
    label=label_train,
    cat_features=cat_features_train
)

valid_pool = Pool(
    data=data_valid,
    cat_features=cat_features_valid
)

del data_train, label_train, cat_features_train
del data_valid, cat_features_valid

## Catboost Params

In [None]:
catboost_params = {
    'depth': 15,
    'learning_rate': 0.6647197585933009,
    'l2_leaf_reg': 0.9817791173327812,
    'bootstrap_type': 'No',
    'border_count': 218,
    'grow_policy': 'SymmetricTree',
    'eval_metric': 'RMSE',
    'use_best_model': False,
    'task_type': 'GPU',
    'random_seed': 666,
    'verbose': 10,
    'loss_function': 'RMSE',
    'iterations': 300,
    'early_stopping_rounds': 20
}

cbr = CatBoostRegressor(**catboost_params)

## Fit & Predict

In [None]:
cbr.fit(train_pool)

In [None]:
forecast = cbr.predict(valid_pool)

## Export

In [None]:
valid_df['sales_ratio'] = forecast # Overwrite with forecast
res = valid_df[['id', 'dept_id', 'store_id', 'd', 'sales_ratio']]
res.to_csv(DATA_PATH + '/refined/top_down_ratio_' + SUBMISSION_TYPE + '.csv', index=False)