In [1]:
import catboost as cb
import lightgbm as lgb
import numpy as np
import pandas as pd
from bayes_opt import BayesianOptimization
from bayes_opt.event import Events
from bayes_opt.observer import JSONLogger
from catboost.utils import create_cd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

import config

In [2]:
# Names of the columns that are treated as categorical when training:
# the two identifiers, the seven product_N columns, and three
# customer-attribute columns.
_id_columns = ['customer_id', 'story_id']
_product_columns = ['product_{}'.format(slot) for slot in range(7)]
_attribute_columns = ['marital_status_cd', 'job_position_cd', 'prod_not_nan']

cat_features = _id_columns + _product_columns + _attribute_columns

In [3]:
%load_ext autoreload
%autoreload 2

DATA_PATH = config.get_data_path()

TRAIN_FEATURES = DATA_PATH / 'train_features.csv'
TEST_FEATURES = DATA_PATH / 'test_features.csv'

train_df = pd.read_csv(TRAIN_FEATURES, index_col=0, parse_dates=['event_dttm'])
test_df = pd.read_csv(TEST_FEATURES, index_col=0, parse_dates=['event_dttm'])

train_df['nearest_story_seconds_before'] = train_df['nearest_story_seconds_before'].abs()
test_df['nearest_story_seconds_before'] = test_df['nearest_story_seconds_before'].abs()

train_df = train_df.fillna(-999)
test_df = test_df.fillna(-999)

train_df

train_df = train_df.sort_values('event_dttm')

In [4]:
# Columns removed from both frames before modelling: the target column and
# the event_dttm / first_session_dttm columns.
NON_FEATURE_COLUMNS = ['event', 'event_dttm', 'first_session_dttm']

# Take the target out of the training frame before its column is dropped.
y = train_df['event']

train_df = train_df.drop(columns=NON_FEATURE_COLUMNS)
test_df = test_df.drop(columns=NON_FEATURE_COLUMNS)

In [5]:
# NOTE(review): disabled experiment, kept for reference only. For each column
# of `multi_target` it fits a TargetEncoder on the given categorical columns,
# suffixes the encoded column names with '_target_encode_<i>', and finally
# concatenates all encoded frames with the remaining (non-categorical) columns.
# `TargetEncoder` is not imported in the visible import cell — presumably
# category_encoders.TargetEncoder; confirm before re-enabling.
# def muiltiEncoder(df,cat_features_init,multi_target,sm = 5):
#     encoder = TargetEncoder(cols = cat_features_init,smoothing=sm)
#     vals = []
#     for i in range(multi_target.shape[1]):
#         val = encoder.fit_transform(df[cat_features_init],multi_target[:,i])
#         val.columns = [column + '_target_encode_' + str(i) for column in val.columns]
#         vals += [val]
#     df_drop = df.drop(cat_features_init,axis=1)
#     return pd.concat(vals + [df_drop],axis=1)

In [6]:
# NOTE(review): disabled calls to the commented-out muiltiEncoder helper.
# They could not run at this point as written: X_train / X_val / y_train are
# only created by the train/validation split in a later cell, and
# `cat_features_init` is not defined in the visible code. The second call also
# pairs X_val with y_train — TODO confirm what target was intended there.
# X_train_enc = muiltiEncoder(X_train,cat_features_init,y_train)
# X_val_enc = muiltiEncoder(X_val,cat_features_init,y_train)

In [7]:
# Hold out 20% of the rows for validation. shuffle=False makes this a plain
# head/tail cut, so the current row order of train_df decides which rows
# end up in the validation part.
X_train, X_val, y_train, y_val = train_test_split(
    train_df,
    y,
    test_size=0.2,
    shuffle=False,
)


In [8]:
# Weight assigned to each integer class label; looked up per row to build
# the sample weights of the LightGBM datasets.
mapa = {0: 10, 1: 0.1, 2: 0.1, 3: 0.5}

In [None]:
# Per-row sample weights: every label is cast to int and looked up in the
# class-weight table `mapa`.
train_weights = y_train.astype(int).map(mapa)
val_weights = y_val.astype(int).map(mapa)

# Series.map leaves NaN for any label that has no entry in `mapa`; stop here
# with a clear message instead of handing NaN weights to LightGBM.
for split_name, split_weights in (('train', train_weights), ('validation', val_weights)):
    if split_weights.isna().any():
        raise ValueError(split_name + ' labels contain classes with no weight in mapa')

# The validation set references the training set so both share one binning.
# free_raw_data=False is kept from the original call for both datasets.
lgb_train = lgb.Dataset(X_train, y_train,
                        weight=train_weights, free_raw_data=False)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train,
                       weight=val_weights, free_raw_data=False)

In [11]:
# LightGBM settings: gradient-boosted trees for a 4-class problem, scored
# with multi-class log loss on the validation set.
params = dict(
    boosting_type='gbdt',
    objective='multiclass',
    metric='multi_logloss',
    num_class=4,
    num_leaves=20,
    learning_rate=0.1,
    feature_fraction=0.9,
    bagging_fraction=0.5,
    bagging_freq=5,
    verbose=1,
)

# Column names in training order; categorical columns are passed to LightGBM
# as positions within this list.
feature_name = X_train.columns.tolist()

print('Starting training...')
categorical_positions = [feature_name.index(column) for column in cat_features]

gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=lgb_eval,  # validation dataset built from X_val / y_val
    feature_name=feature_name,
    categorical_feature=categorical_positions,
)

Starting training...
[1]	valid_0's multi_logloss: 1.32839
[2]	valid_0's multi_logloss: 1.29622
[3]	valid_0's multi_logloss: 1.2688
[4]	valid_0's multi_logloss: 1.24883
[5]	valid_0's multi_logloss: 1.22752
[6]	valid_0's multi_logloss: 1.2071
[7]	valid_0's multi_logloss: 1.19033
[8]	valid_0's multi_logloss: 1.17454
[9]	valid_0's multi_logloss: 1.16031
[10]	valid_0's multi_logloss: 1.14822
[11]	valid_0's multi_logloss: 1.13724
[12]	valid_0's multi_logloss: 1.12913
[13]	valid_0's multi_logloss: 1.12014
[14]	valid_0's multi_logloss: 1.1122
[15]	valid_0's multi_logloss: 1.10633
[16]	valid_0's multi_logloss: 1.09987
[17]	valid_0's multi_logloss: 1.09382
[18]	valid_0's multi_logloss: 1.08757
[19]	valid_0's multi_logloss: 1.08168
[20]	valid_0's multi_logloss: 1.07722
[21]	valid_0's multi_logloss: 1.0729
[22]	valid_0's multi_logloss: 1.06942
[23]	valid_0's multi_logloss: 1.06521
[24]	valid_0's multi_logloss: 1.06315
[25]	valid_0's multi_logloss: 1.06135
[26]	valid_0's multi_logloss: 1.05747
[27]

KeyboardInterrupt: 