In [1]:
## Version futuresales lib required = 0.2.0

!pip install -i https://test.pypi.org/simple/ futuresales_denissimo==0.2.0
!pip install plotly==5.3.1
!pip install neptune-client
!pip install neptune-xgboost
!pip install sklearn

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# `tqdm._tqdm_notebook` is a private, deprecated alias; `tqdm.notebook` is the public module.
from tqdm.notebook import tqdm_notebook
# Registers the `progress_apply` family of methods on pandas objects.
tqdm_notebook.pandas()

import futuresales as fs

from futuresales.distribution import from_pickle, to_pickle
from futuresales.utils import make_scaled, load_credentials
from futuresales.validation import Validator, make_hpo_dataset, get_statistics

from neptune.new.types import File

# Base Neptune configuration

In [3]:
import os

import neptune.new as neptune
import xgboost as xgb
from neptune.new.integrations.xgboost import NeptuneCallback

# SECURITY: the API token must not live in the notebook. It is read from the
# NEPTUNE_API_TOKEN environment variable (on Kaggle, export it from a secret before
# running this cell). The token that used to be hard-coded here has been exposed and
# should be revoked in the Neptune UI.
cred = {
    "project": "denissimo/FS-Results",
    "api_token": os.environ.get("NEPTUNE_API_TOKEN"),
}
if not cred["api_token"]:
    raise RuntimeError(
        "NEPTUNE_API_TOKEN is not set; export your Neptune API token before running this cell."
    )

# `run='FSRES-22'` targets one specific, already existing run id in the project.
run = neptune.init(
    api_token=cred['api_token'],
    project=cred['project'],
    tags=['xgboost', 'baseline', 'boruta'],
    run='FSRES-22'
)

# Dataset preparation

In [4]:
# Task-level frames keyed by role; `task_df['idx']` is later used for its
# `shop_id`, `item_id` and `id` columns.
task_df = {
    'test': pd.read_csv('/kaggle/input/fsfeaturesbaseline/task_df_test.csv'),
    'train': pd.read_csv('/kaggle/input/fsfeaturesbaseline/task_df_train.csv'),
    'idx': pd.read_csv('/kaggle/input/fsfeaturesbaseline/idx.csv'),
}

# Baseline feature sets; each one carries the `target` and `valid_target` columns
# that the next cell splits off.
train, test, validation = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/fsfeaturesbaseline/baseline_train_set.csv',
        '/kaggle/input/fsfeaturesbaseline/baseline_test_set.csv',
        '/kaggle/input/fsfeaturesbaseline/baseline_validation_set.csv',
    )
)

# Hard-coded boolean column mask (74 flags). It is applied positionally with
# `.loc[:, selector]` to the scaled feature frames, so its length and order must match
# their columns exactly.
# NOTE(review): presumably the output of a Boruta feature-selection run (the Neptune run
# is tagged 'boruta') — confirm and record where the mask came from.
selector = [ True,  True, False, False,  True,  True,  True, False,  True,
        True,  True, False,  True, False,  True, False, False, False,
        True, False, False, False,  True,  True, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
        True, False, False, False,  True, False,  True, False,  True,
       False,  True, False,  True, False, False, False, False, False,
       False, False]

In [5]:
def _split_and_scale(frame):
    """Split one baseline set into scaled features and its two target frames.

    Parameters
    ----------
    frame : pandas.DataFrame
        A baseline set containing the `target` and `valid_target` columns.

    Returns
    -------
    tuple
        `(features, target, valid_target)`: `features` is `frame` without the two
        target columns, passed through `make_scaled`, with its `id` column replaced
        by the positional index (named `id`); `target` and `valid_target` are
        single-column DataFrames.
    """
    features = frame.drop(['valid_target', 'target'], axis=1)
    target = frame.loc[:, ['target']]
    valid_target = frame.loc[:, ['valid_target']]

    # Drop the `id` column returned by `make_scaled` and rebuild it from the row index.
    features = (
        make_scaled(features)
        .drop('id', axis=1)
        .reset_index()
        .rename({'index': 'id'}, axis=1)
        .set_index('id')
    )
    return features, target, valid_target


# The same preparation is applied to all three sets (previously copy-pasted three times).
train_x, train_y, train_y_valid = _split_and_scale(train)
test_x, test_y, test_y_valid = _split_and_scale(test)
validation_x, validation_y, validation_y_valid = _split_and_scale(validation)

In [6]:
def _select_and_attach_keys(features):
    """Keep the columns flagged in `selector` and join the shop/item keys by `id`.

    Parameters
    ----------
    features : pandas.DataFrame
        Scaled feature frame whose columns line up one-to-one with `selector`.

    Returns
    -------
    pandas.DataFrame
        The selected columns plus `shop_id` and `item_id` from `task_df['idx']`,
        indexed by `id`. The merge uses pandas' default inner join on `id`.
    """
    id_keys = task_df['idx'][['shop_id', 'item_id', 'id']]
    return (
        features.loc[:, selector]
        .reset_index()
        .rename({'index': 'id'}, axis=1)
        .merge(id_keys, on='id')
        .set_index('id')
    )


# NOTE: this cell overwrites its own inputs, so it can only be run once per kernel
# session; after the first run the frames have two extra columns and no longer match
# the length of `selector`.
train_x = _select_and_attach_keys(train_x)
test_x = _select_and_attach_keys(test_x)
validation_x = _select_and_attach_keys(validation_x)

# Model tuning

In [8]:
# define a search space
import seaborn as sns
from xgboost import XGBRegressor
from futuresales.utils import hyperopt_objective
from hyperopt import hp
from hyperopt import fmin, tpe, Trials, space_eval

# NOTE(review): with q=1 and bounds (-20, 20) the regularisation terms are rounded to
# whole numbers ranging from 0 up to roughly exp(20) — confirm this range is intended.
searching_space = {
    'n_estimators': hp.choice('n_estimators', np.arange(1, 150, dtype=int)),
    # The hyperopt label is 'eta' while the dictionary key is 'learning_rate'.
    'learning_rate': hp.quniform('eta', 0.1, 1, 0.05),
    'max_depth':  hp.choice('max_depth', np.arange(1, 15, dtype=int)),
    'reg_alpha': hp.qloguniform('reg_alpha', -20, 20, 1),
    'reg_lambda': hp.qloguniform('reg_lambda', -20, 20, 1),
}


trials = Trials()
xgb_obj = hyperopt_objective(
    XGBRegressor,
    (train_x, train_y),
    (test_x, test_y)
)
# minimize the objective over the space
best_assignment = trials.fmin(xgb_obj, searching_space, algo=tpe.suggest, max_evals=100, verbose=2)

# `fmin` reports the optimum keyed by hyperopt *labels*, and for `hp.choice` it gives
# the *index* of the chosen option, not its value. `space_eval` maps that assignment
# back onto the search space, so `hpo` holds real parameter values under the
# `searching_space` keys and can be passed straight to `XGBRegressor(**hpo)`.
# NumPy scalars are converted to plain Python numbers.
hpo = {
    name: value.item() if isinstance(value, np.generic) else value
    for name, value in space_eval(searching_space, best_assignment).items()
}

In [10]:
# Convert the finished hyperopt trials with the project helper. `hpo_df` is later logged
# to Neptune and plotted with `hue='loss_type'`; `min_hyp` is logged as the run's params.
# NOTE(review): presumably `min_hyp` is the minimum-loss hyper-parameter set — confirm
# in futuresales.validation.make_hpo_dataset.
hpo_df, min_hyp = make_hpo_dataset(trials)

In [11]:
# Display the second value returned by make_hpo_dataset as the cell output.
min_hyp

In [12]:
# Collect everything about the hyper-parameter search under the run's 'hpo' namespace:
# the parameters from `fmin`, the trials table as HTML and as a pickle, and a pair plot
# of the trials coloured by `loss_type`.
hpo_artifacts = {
    'min_params': hpo,
    'trials': File.as_html(hpo_df),
    'trials_pkl': File.as_pickle(hpo_df),
    'cross-dependency': File.as_image(
        sns.pairplot(data=hpo_df, hue='loss_type').figure
    ),
}
run['hpo'] = hpo_artifacts

In [13]:
# Validation results per model, keyed by a short model name (filled in below).
errors = dict()

In [14]:
# Callback that reports XGBoost training to the Neptune run; `log_tree` is given the
# tree indices 0 through 7.
neptune_callback = NeptuneCallback(run=run, log_tree=list(range(8)))

In [15]:
# Build the project Validator around XGBRegressor with the tuned parameters.
xgb_validator = Validator(XGBRegressor, **hpo)

# NOTE(review): the validator is fitted on the *test* split (test_x / test_y) and
# validated against `validation_y_valid` — confirm this is intended rather than the
# train split.
xgb_validator.fit(test_x, test_y)
xgb_errors = xgb_validator.validate(validation_x, validation_y_valid)
errors['xgb'] = xgb_errors
residuals = get_statistics(xgb_errors)

# Log parameters, scores and artefacts to Neptune. Element 0 of the validation result
# is stored as the test RMSE and element 1 as the train RMSE.
run['params'] = min_hyp

run['score/rmse/test'] = xgb_errors[0]
run['score/rmse/train'] = xgb_errors[1]
run['model'] = 'Xgboost'
run['dump/residuals'].upload(File.as_pickle(residuals))
run['dump/model'].upload(File.as_pickle(xgb_validator))

In [16]:
# Evaluation targets as 1-D arrays taken from the single-column target frames.
# NOTE(review): the train eval set uses `train_y` while the test eval set uses
# `test_y_valid` — confirm the mix of `target` and `valid_target` is intended.
train_eval_target = train_y.to_numpy()[:, 0]
test_eval_target = test_y_valid.to_numpy()[:, 0]

reg = xgb.XGBRegressor(**hpo)

# Fit the model and log metadata to the run in Neptune.
# NOTE(review): passing `eval_metric` and `callbacks` to `fit()` is deprecated in newer
# xgboost releases — check this against the installed xgboost version.
reg.fit(
    train_x,
    train_y,
    eval_metric=['rmse'],
    eval_set=[
        (train_x, train_eval_target),
        (test_x, test_eval_target),
    ],
    callbacks=[neptune_callback],
)

In [17]:
# Close the Neptune run now that all logging for this notebook is done.
run.stop()