In [778]:
import pandas as pd
import warnings
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import optuna
import catboost
from functools import partial
from catboost import utils

warnings.filterwarnings("ignore")

# Data preparing

In [53]:
data = pd.read_csv('train.csv').sort_values(by=['user_id', 'item_id'])
items = pd.read_csv('item-features.csv').sort_values(by=['item_id'])
users = pd.read_csv('user_features.csv').sort_values(by=['user_id'])

In [54]:
f'{len(data["user_id"].unique())} unique users'

'497 unique users'

In [260]:
f'{len(data["item_id"].unique())} unique items'

'444 unique items'

In [55]:
data.head(3)

Unnamed: 0,user_id,item_id,like,timestamp
3347,0,1,1,1491039949
1187,0,29,0,1490973374
7729,0,48,0,1491181446


In [56]:
items.head(3)

Unnamed: 0,item_id,0,1,2,3,4,5,6,7,8,...,22,23,24,25,26,27,28,29,30,31
388,0,0.001433,-0.003243,-0.00303,0.004299,-0.001026,0.001412,0.001671,0.001373,-0.006249,...,-0.008651,-0.00144,0.002312,-0.002225,-0.004108,-0.004108,0.000871,-0.002408,-0.002408,0.000613
169,1,0.002482,-0.005617,-0.005248,0.007446,-0.001777,0.002446,0.002895,0.002378,-0.010824,...,-0.014983,-0.002493,0.004004,-0.003855,-0.007115,-0.007115,0.001508,-0.004171,-0.004171,0.001062
239,2,0.001871,-0.004236,-0.003958,0.005615,-0.00134,0.001845,0.002183,0.001793,-0.008162,...,-0.011299,-0.00188,0.00302,-0.002907,-0.005365,-0.005365,0.001137,-0.003145,-0.003145,0.000801


In [57]:
users.head(3)

Unnamed: 0,user_id,0,1,2,3,4,5,6,7,8,...,22,23,24,25,26,27,28,29,30,31
0,0,0.000695,-0.001573,-0.00147,0.002085,-0.000498,0.000685,0.000811,0.000666,-0.003031,...,-0.004196,-0.000698,0.001121,-0.001079,-0.001993,-0.001993,0.000422,-0.001168,-0.001168,0.000297
1,1,0.001204,-0.002725,-0.002546,0.003612,-0.000862,0.001187,0.001404,0.001154,-0.005251,...,-0.007268,-0.001209,0.001942,-0.00187,-0.003451,-0.003451,0.000732,-0.002023,-0.002023,0.000515
2,2,0.000491,-0.001112,-0.001039,0.001475,-0.000352,0.000484,0.000573,0.000471,-0.002144,...,-0.002967,-0.000494,0.000793,-0.000763,-0.001409,-0.001409,0.000299,-0.000826,-0.000826,0.00021


In [110]:
data_big = data.merge(users, left_on='user_id', right_on='user_id'). \
                merge(items, left_on='item_id', right_on='item_id'). \
                drop(columns=['item_id', 'timestamp']). \
                sort_values(by=['user_id']).reset_index(drop=True)

In [111]:
data_big.head(3)

Unnamed: 0,user_id,like,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,...,22_y,23_y,24_y,25_y,26_y,27_y,28_y,29_y,30_y,31_y
0,0,1,0.000695,-0.001573,-0.00147,0.002085,-0.000498,0.000685,0.000811,0.000666,...,-0.014983,-0.002493,0.004004,-0.003855,-0.007115,-0.007115,0.001508,-0.004171,-0.004171,0.001062
1,0,0,0.000695,-0.001573,-0.00147,0.002085,-0.000498,0.000685,0.000811,0.000666,...,-0.002967,-0.000494,0.000793,-0.000763,-0.001409,-0.001409,0.000299,-0.000826,-0.000826,0.00021
2,0,0,0.000695,-0.001573,-0.00147,0.002085,-0.000498,0.000685,0.000811,0.000666,...,-0.003634,-0.000605,0.000971,-0.000935,-0.001726,-0.001726,0.000366,-0.001012,-0.001012,0.000258


In [112]:
user400_index = data_big[data_big['user_id'] == 401].index[0] - 1
df_train = data_big.loc[:user400_index]
df_test = data_big[user400_index+1:]
df_train.shape, df_test.shape

((7067, 66), (1607, 66))

In [113]:
df_train.tail(3)

Unnamed: 0,user_id,like,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,...,22_y,23_y,24_y,25_y,26_y,27_y,28_y,29_y,30_y,31_y
7064,400,0,0.001554,-0.003517,-0.003287,0.004663,-0.001113,0.001532,0.001813,0.001489,...,-0.006635,-0.001104,0.001773,-0.001707,-0.003151,-0.003151,0.000668,-0.001847,-0.001847,0.00047
7065,400,0,0.001554,-0.003517,-0.003287,0.004663,-0.001113,0.001532,0.001813,0.001489,...,-0.007565,-0.001259,0.002022,-0.001946,-0.003592,-0.003592,0.000762,-0.002106,-0.002106,0.000536
7066,400,0,0.001554,-0.003517,-0.003287,0.004663,-0.001113,0.001532,0.001813,0.001489,...,-0.002098,-0.000349,0.000561,-0.00054,-0.000996,-0.000996,0.000211,-0.000584,-0.000584,0.000149


In [114]:
f'{len(df_train["user_id"].unique())} unique users'

'401 unique users'

In [115]:
df_test.head(3)

Unnamed: 0,user_id,like,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,...,22_y,23_y,24_y,25_y,26_y,27_y,28_y,29_y,30_y,31_y
7067,401,0,0.001153,-0.002609,-0.002437,0.003458,-0.000825,0.001136,0.001345,0.001104,...,-0.008392,-0.001397,0.002243,-0.002159,-0.003985,-0.003985,0.000845,-0.002336,-0.002336,0.000595
7068,401,0,0.001153,-0.002609,-0.002437,0.003458,-0.000825,0.001136,0.001345,0.001104,...,-0.002098,-0.000349,0.000561,-0.00054,-0.000996,-0.000996,0.000211,-0.000584,-0.000584,0.000149
7069,401,0,0.001153,-0.002609,-0.002437,0.003458,-0.000825,0.001136,0.001345,0.001104,...,-0.002967,-0.000494,0.000793,-0.000763,-0.001409,-0.001409,0.000299,-0.000826,-0.000826,0.00021


In [116]:
f'{len(df_test["user_id"].unique())} unique users'

'96 unique users'

In [884]:
def to_catboost_dataset(df):
    y = df['like'].to_numpy()
    q = df['user_id'].to_numpy().astype('uint32')        
    X = df.drop(columns=['like', 'user_id']).to_numpy() 
    return (X, y, q)

X_train, y_train, q_train = to_catboost_dataset(df_train)
X_valid, y_valid, q_valid = to_catboost_dataset(df_test)
X_train.shape, y_train.shape, q_train.shape, X_valid.shape, y_valid.shape, q_valid.shape

((7067, 64), (7067,), (7067,), (1607, 64), (1607,), (1607,))

In [885]:
pool_train = catboost.Pool(data=X_train, label=y_train, group_id=q_train)
pool_valid = catboost.Pool(data=X_valid, label=y_valid,group_id=q_valid)

# Training

In [886]:
class Objective:
    
    def __init__(self):
        self.best_catboost = None
        self._catboost = None
        
    def __call__(self, trial, pool_train, pool_valid):
        param = {
            "loss_function": "CrossEntropy",
            "colsample_bylevel": trial.suggest_float("colsample_bylevel", 1e-3, 1e-1),
            "depth": trial.suggest_int("depth", 1, 12),
            "bootstrap_type": trial.suggest_categorical(
                "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
            ),
            "grow_policy": trial.suggest_categorical("grow_policy", ['SymmetricTree', 'Depthwise', 'Lossguide']),
            "learning_rate": trial.suggest_float("colsample_bylevel", 1e-3, 1e-1),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-1, 1e1),
            "eval_metric": 'F1',
            "use_best_model": True,
            "early_stopping_rounds": 200,
            "iterations": 10000,
            "random_seed": 0,
            "verbose": False
        }
    
        if param["bootstrap_type"] == "Bayesian":
            param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0.33, 3)
        elif param["bootstrap_type"] == "Bernoulli":
            param["subsample"] = trial.suggest_float("subsample", 0.1, 1)
            
        if param["grow_policy"] == "SymmetricTree":
            param["boosting_type"] = trial.suggest_categorical("boosting_type", ["Ordered", "Plain"])
    
        ranker = catboost.CatBoost(param)
    
        ranker.fit(pool_train, eval_set=pool_valid)
        
        y_valid, q_valid = pool_valid.get_label(), pool_valid.get_group_id_hash()
        
        score = utils.eval_metric(y_valid, ranker.predict(pool_valid), 
                                  'CrossEntropy', group_id=q_valid)[0]
        self._catboost = ranker
        
        return score

    def callback(self, study, trial):
        if study.best_trial == trial:
            self.best_catboost = self._catboost

In [887]:
def start_optimization(
    objective_func, 
    n_trials,
    n_jobs,
    pool_train, 
    pool_valid, 
    study_direction=None,
    sampler=None,
    features=None,
    **other_objective_kwargs
):

    obj_func = partial(objective_func, pool_train=pool_train, pool_valid=pool_valid)

    study = optuna.create_study(sampler=sampler, direction='minimize')
    study.optimize(obj_func, n_trials=n_trials, n_jobs=n_jobs, callbacks=[objective.callback])
    return study

In [888]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

objective = Objective()


tpe_sampler = optuna.samplers.TPESampler(
        n_startup_trials=500, 
        n_ei_candidates=100, 
)
study = start_optimization(objective, n_trials=1000, n_jobs=8, tpe_sampler=tpe_sampler, 
                           pool_train=pool_train, pool_valid=pool_valid, callback=callback)


print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))


Number of finished trials: 1000
Best trial:
  Value: 0.21052875247546257
  Params: 
    colsample_bylevel: 0.04248289582448138
    depth: 4
    bootstrap_type: Bernoulli
    grow_policy: Depthwise
    l2_leaf_reg: 4.697338772536601
    subsample: 0.36466589659122406


In [889]:
model = objective.best_catboost

In [890]:
y_train_pred = (model.predict(X_train) > 0).astype(int)
f'Train f1 is {f1_score(y_train, y_train_pred)}'

'Train f1 is 0.6931340495095749'

In [891]:
y_valid_pred = (model.predict(X_valid) > 0).astype(int)
f'Valid f1 is {f1_score(y_valid, y_valid_pred)}'

'Valid f1 is 0.7423167848699764'

# Sosamba (submission)

In [897]:
X_all, y_all, q_all = to_catboost_dataset(data_big)
pool_all = catboost.Pool(data=X_all, label=y_all, group_id=q_all)

In [899]:
best_params = model.get_params()
best_iters = model.get_best_iteration()
best_params, best_iters

({'loss_function': 'CrossEntropy',
  'colsample_bylevel': 0.04248289582448138,
  'depth': 4,
  'bootstrap_type': 'Bernoulli',
  'grow_policy': 'Depthwise',
  'learning_rate': 0.04248289582448138,
  'l2_leaf_reg': 4.697338772536601,
  'eval_metric': 'F1',
  'use_best_model': True,
  'early_stopping_rounds': 200,
  'iterations': 10000,
  'random_seed': 0,
  'verbose': False,
  'subsample': 0.36466589659122406},
 158)

In [900]:
best_params['iterations'] = best_iters
best_params['use_best_model'] = False

In [902]:
model = catboost.CatBoost(best_params)

In [903]:
model.fit(pool_all)

<catboost.core.CatBoost at 0x169993d00>

In [904]:
users['key'] = 1
items['key'] = 1
bigger_data = pd.merge(users, items, on='key')
bigger_data.drop(columns=['key'], inplace=True)
users.drop(columns=['key'], inplace=True)
items.drop(columns=['key'], inplace=True)

In [905]:
preds = model.predict(bigger_data).reshape(len(users), -1)

In [908]:
for i in range(len(users)):
    preds[i, data[(data['like'] == 1) & (data['user_id'] == i)]['item_id'].to_numpy()] = 100

In [909]:
final = pd.DataFrame(np.sort(np.argsort(preds, axis=1)[:, -20:], axis=1))
final.index.name = 'user_id'
final

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,1,7,11,22,35,37,40,60,65,72,76,80,284,286,287,289,292,424,425,441
1,3,4,5,11,22,35,37,60,65,72,76,80,284,286,287,289,292,424,425,441
2,6,11,22,35,37,40,60,65,72,76,80,284,286,287,289,292,383,424,425,441
3,8,9,11,22,35,37,60,65,72,76,80,284,286,287,289,292,383,424,425,441
4,11,12,14,22,35,37,60,65,72,76,80,284,286,287,289,292,383,424,425,441
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,7,11,22,35,37,40,60,65,72,76,80,284,286,287,289,292,383,424,425,441
493,7,11,22,35,37,60,65,71,72,76,80,119,284,286,287,289,292,424,425,441
494,7,11,22,35,37,40,60,65,72,76,80,284,286,287,289,292,383,424,425,441
495,7,11,22,35,37,40,60,65,72,76,80,284,286,287,289,292,383,424,425,441


In [911]:
name = 'best_vk4'
final.to_csv(f'{name}.csv', index=True, header=True)