In [None]:
#from google.colab import drive
#drive.mount('/content/drive/')

In [None]:
#!pip install catboost optuna --upgrade

## Requirements

In [1]:
import numpy as np
import pandas as pd
import pickle
import pprint

from catboost import Pool, CatBoostRegressor
import optuna

## Params

In [2]:
TUNING_TYPE = 'validation' # 'evaluation'
DATA_PATH = '../data/' # '/content/drive/My Drive/'

if TUNING_TYPE == 'validation': # Tuning for validation (1914 --> 1941) using past month to validate
    END_TRAIN = 1885
    END_VALID = 1913
elif TUNING_TYPE == 'evaluation': # Tuning for evaluation (1941 --> 1969) using past month to validate
    END_TRAIN = 1913
    END_VALID = 1941

## Load data

In [25]:
df = pd.read_pickle(DATA_PATH + 'refined/top_down_df.pkl')

In [26]:
a = df[df['d'] <= END_VALID].copy()
a['sales_ratio'] = a['sales_ratio'].astype(np.float32) 
a.describe()[['sales_ratio']]

Unnamed: 0,sales_ratio
count,31522400.0
mean,0.002467637
std,0.007731112
min,0.0
25%,0.0
50%,0.0
75%,0.002243042
max,1.0


## Split train & valid df

In [4]:
train_df = df[df['d'] <= END_TRAIN]
valid_df = df[(df['d'] > END_TRAIN) & (df['d'] <= END_VALID)]

del df

In [14]:
a = train_df[train_df.d >= 1885 - 27]
a.describe()[['sales_ratio']]

In [19]:
valid_df.describe()[['sales_ratio']]

Unnamed: 0,sales_ratio
count,853720.0
mean,0.0
std,0.0
min,0.0
25%,0.0
50%,0.0
75%,0.002319
max,0.688965


## Prepare Pool params

In [None]:
def process_data(df):
    '''List data type (category vs numeric) & convert all non-category data to float32'''
    cf = []
    for col in df:
        if str(df[col].dtype) == 'category':
            cf.append(col)
        else:
            df[col] = df[col].astype(np.float32)
    return df, cf

In [None]:
data_train, cat_features_train = process_data(train_df.drop(columns=['id', 'sales', 'sales_ratio']))
data_valid, cat_features_valid = process_data(valid_df.drop(columns=['id', 'sales', 'sales_ratio']))

label_train = np.array(train_df['sales_ratio'], dtype=np.float32)
label_valid = np.array(valid_df['sales_ratio'], dtype=np.float32)

output_id = valid_df['id'].unique()

del train_df, valid_df

## Create Catboost Pools 

In [None]:
train_pool = Pool(
    data=data_train,
    label=label_train,
    cat_features=cat_features_train
)

valid_pool = Pool(
    data=data_valid,
    label=label_valid,
    cat_features=cat_features_valid
)

del data_train, label_train, cat_features_train
del data_valid, label_valid, cat_features_valid

## Define Objective function

In [None]:
def objective(trial):

    # Define Space 
    params = {
        #'loss_function': trial.suggest_categorical('loss_function', ['RMSE', 'Poisson', 'Tweedie:variance_power=1.5']),
        'depth': trial.suggest_int('depth', 5, 16),
        #'iterations': int(trial.suggest_discrete_uniform('iterations', 100, 1000, 100)),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.01, 10),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Poisson', 'Bernoulli', 'No']),
        'border_count': trial.suggest_int('border_count', 1, 255),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide'])
    }

    if params['bootstrap_type'] == 'Bayesian':
        params['bagging_temperature'] = trial.suggest_uniform('bagging_temperature', 0, 10)

    if params['bootstrap_type'] in ['Poisson', 'Bernoulli']:
        params["subsample"] = trial.suggest_uniform("subsample", 0.1, 1)

    if params['grow_policy'] in ['Depthwise', 'Lossguide']:
        params["min_data_in_leaf"] = trial.suggest_int('min_data_in_leaf', 1, 10)

    if params['grow_policy'] == 'Lossguide':
        params["max_leaves"] = trial.suggest_int('max_leaves', 16, 64)

    # Fixed Params
    params['eval_metric'] = 'RMSE'
    params['use_best_model'] = False
    params['task_type'] = 'GPU'
    params['random_seed'] = 666
    params['verbose'] = 10

    params['loss_function'] = 'RMSE'
    params['iterations'] = 300
    params['early_stopping_rounds'] = 20

    # some print
    print('\n--------------------------------\n')
    pprint.pprint(params)
    print('')
    
    # Define catboost regressor
    cbr = CatBoostRegressor(**params)
    
    # Fit/predict
    cbr.fit(train_pool, eval_set=valid_pool)
    
    forecast = np.array(cbr.get_test_eval())
    best_score = cbr.get_best_score()
    
    # Print metrics
    print('')
    print('Loss:', str(best_score))
    print('')

    # Write logs
    f = open(DATA_PATH + 'external/catboost_top_down_tuning.txt', 'a+')
    f.write(str(params) + '\n\n')
    f.write('Loss: ' + str(best_score) + '\n')
    f.write('\n\n--------------------------------\n\n')

    f.close()
    
    return best_score['validation']['RMSE']

## Optimize

In [None]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30)

In [None]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))