In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRanker,CatBoostClassifier, Pool, MetricVisualizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle
from tools import compute_metrics
from copy import deepcopy

In [None]:
# Load the interaction frame (used further down as ground truth when
# labelling candidates) and the per-user candidate lists, in that order.
local_test, candidates_als = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'files/local_test.parquet.gzip',
        'files/candidates_als_train.parquet.gzip',
    )
)

In [None]:
# Pre-computed per-user and per-item feature tables.
user_features, item_features = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'files/user_features.parquet.gzip',
        'files/item_features.parquet.gzip',
    )
)

In [None]:
# Preview the first rows of the user feature table.
user_features.head(n=5)

In [None]:
# Preview the first rows of the item feature table.
item_features.head(n=5)

In [None]:
# Compact dtypes: `source_id` becomes a categorical, the id and the reaction
# counters become 32-bit integers. All other columns keep their dtype.
item_features = item_features.astype({
    'source_id': 'category',
    'item_id': 'int32',
    'likes': 'int32',
    'no_likes': 'int32',
    'dislikes': 'int32',
})

## Подготовка данных для ранжирования

In [None]:
# One row per (user, candidate item): unpack the per-user item lists, cast
# every column to int32, then number each user's candidates 1..k in the
# order they appear in the list.
df_predict_als = (
    candidates_als
    .explode('item_id')
    .astype('int32')
    .assign(
        rank=lambda frame: (frame.groupby('user_id').cumcount() + 1).astype('int32')
    )
)
df_predict_als.head()

In [None]:
def make_target(predict, true_data, frac=0.07, random_state=42):
    """Build a labelled training frame from candidate pairs and observed interactions.

    Candidates that also occur in ``true_data`` (matched on ``user_id`` and
    ``item_id``) become positives (``target == 1``). Candidates with no match
    form the negative pool, from which a fraction is sampled per user
    (``target == 0``).

    Parameters
    ----------
    predict : pandas.DataFrame
        Candidate pairs; must contain ``user_id`` and ``item_id``.
    true_data : pandas.DataFrame
        Observed interactions; must contain ``user_id``, ``item_id``,
        ``timespent`` and ``reaction``.
    frac : float, default 0.07
        Share of each user's unmatched candidates kept as negatives.
    random_state : int or None, default 42
        Seed for the negative sampling so repeated runs produce the same
        frame. Pass ``None`` for unseeded sampling.

    Returns
    -------
    pandas.DataFrame
        Positives followed by the sampled negatives, with an ``int8``
        ``target`` column. The index is not reset.
    """
    keys = ['user_id', 'item_id']

    # Positives: candidates the user actually interacted with.
    positives = predict.merge(true_data, on=keys, how='inner')
    positives['target'] = 1
    positives['target'] = positives['target'].astype('int8')

    # Negatives: candidates with no matching interaction. The merge indicator
    # is used instead of "timespent is null" because a real interaction whose
    # timespent is missing would otherwise be labelled both 1 and 0.
    merged = predict.merge(true_data, on=keys, how='left', indicator=True)
    negatives = merged.loc[merged['_merge'] == 'left_only'].drop(columns='_merge')
    negatives = negatives.groupby('user_id').sample(frac=frac, random_state=random_state)
    negatives['target'] = 0
    # These columns are all-NaN for negatives; a narrow float keeps them small.
    negatives['timespent'] = negatives['timespent'].astype('float16')
    negatives['reaction'] = negatives['reaction'].astype('float16')
    negatives['target'] = negatives['target'].astype('int8')

    return pd.concat([positives, negatives])

In [None]:
# Label the candidates against the loaded interactions.
df_train = make_target(predict=df_predict_als, true_data=local_test)
df_train.head(n=5)

In [None]:
# Fixed seed so the user-level split is identical on every Restart & Run All.
SPLIT_SEED = 42

# Split by user id (not by row) so all candidates of a user land in one subset.
cat_train_id, cat_test_id = train_test_split(
    local_test['user_id'].unique(),
    test_size=0.2,
    random_state=SPLIT_SEED,
)

# Carve a validation subset of users out of the training users.
cat_train_id, cat_eval_id = train_test_split(
    cat_train_id,
    test_size=0.1,
    random_state=SPLIT_SEED,
)

In [None]:
cols = ['user_id', 'item_id', 'rank', 'target']

# Fixed seed so the row order handed to the model is reproducible.
SHUFFLE_SEED = 42


def _rows_for_users(user_ids):
    """Return the shuffled `cols` slice of `df_train` for the given user ids."""
    subset = df_train.loc[df_train['user_id'].isin(user_ids), cols]
    return shuffle(subset, random_state=SHUFFLE_SEED)


# Train / test / validation frames, one per user subset.
ctb_train = _rows_for_users(cat_train_id)
ctb_test = _rows_for_users(cat_test_id)
ctb_val = _rows_for_users(cat_eval_id)


In [None]:
# Item-feature columns joined onto the candidate pairs (`item_id` is the join key).
item_col = [
    'item_id',
    'likes',
    'dislikes',
    'sum_time',
    'mean_time',
    'source_id',
]

In [None]:
def _attach_features(pairs):
    """Left-join the selected item features and all user features onto `pairs`.

    `pairs` must contain `item_id` and `user_id`. Row order of `pairs` is kept
    because both joins are left joins.
    """
    with_items = pairs.merge(item_features[item_col], on=['item_id'], how='left')
    return with_items.merge(user_features, on=['user_id'], how='left')


# Same feature join for every subset, so the three frames cannot drift apart.
train_feature = _attach_features(ctb_train)
val_feature = _attach_features(ctb_val)
test_feature = _attach_features(ctb_test)

# Разделение на признаки (X) и целевую переменную (y)

In [None]:
# Column roles for modelling.
target_col = ['target']                 # label column
drop_col = ['user_id', 'item_id']       # identifiers, removed from the feature matrix
cat_col = ['source_id']                 # passed to CatBoost as a categorical feature

In [None]:
# Columns that never enter the feature matrix: identifiers plus the label.
_non_feature_cols = drop_col + target_col

# Features are everything except ids and label; the label is kept as a
# one-column frame.
X_train = train_feature.drop(columns=_non_feature_cols)
y_train = train_feature[target_col]

X_val = val_feature.drop(columns=_non_feature_cols)
y_val = val_feature[target_col]

X_test = test_feature.drop(columns=_non_feature_cols)
y_test = test_feature[target_col]

In [None]:
# Hyper-parameters handed to the classifier constructor.
est_param = dict(
    subsample=0.9,
    max_depth=4,
    n_estimators=150,
    learning_rate=0.1,
    thread_count=-1,
    random_state=42,
    verbose=100,
)

ctb_model = CatBoostClassifier(**est_param)

In [None]:
# Fit on the training users, monitor the validation users, and stop early
# after 20 rounds without improvement on the eval set.
ctb_model.fit(
    X=X_train,
    y=y_train,
    cat_features=cat_col,
    eval_set=(X_val, y_val),
    early_stopping_rounds=20,
    plot=True,
)

# Persist the fitted model; it is reloaded from this path further down.
ctb_model.save_model('ctb_model')

In [None]:
# Score the held-out users; column 1 of predict_proba is the positive class.
y_pred = ctb_model.predict_proba(X_test)
test_auc = roc_auc_score(y_test, y_pred[:, 1])
'ROC_AUC = {:.2f}'.format(test_auc)

# Метрика на глобальном тесте

In [None]:
# Build the inference feature matrix for every candidate pair.
# Only the columns the model was trained on are taken from `df_predict_als`.
# Later cells add `ctb_pred` / `rank_ctb` to that frame, and without this
# selection a re-run of this cell would pass them to the model as features.
candidate_pairs = df_predict_als[['user_id', 'item_id', 'rank']]

als_ctb_data = (
    candidate_pairs
    .merge(item_features[item_col], on=['item_id'], how='left')
    .merge(user_features, on=['user_id'], how='left')
    .drop(columns=['user_id', 'item_id'])
)
als_ctb_data.head()

In [None]:
# Start from an empty classifier and restore the weights saved by the training cell.
ctb_model = CatBoostClassifier()
ctb_model.load_model(fname='ctb_model')

In [None]:
# Positive-class probability for every candidate row. The assignment is
# positional, so `als_ctb_data` must have the same row order as `df_predict_als`.
candidate_proba = ctb_model.predict_proba(als_ctb_data)
df_predict_als['ctb_pred'] = candidate_proba[:, 1].astype('float32')
df_predict_als.head()

In [None]:
# Re-order each user's candidates by descending model score and number them
# 1..k; the result is written to disk for the evaluation cells.
df_predict_als = df_predict_als.sort_values(
    by=['user_id', 'ctb_pred'],
    ascending=[True, False],
)
ctb_position = df_predict_als.groupby('user_id').cumcount() + 1
df_predict_als['rank_ctb'] = ctb_position.astype('int32')
df_predict_als.to_parquet('files/als_ctb_predicted.parquet.gzip')
df_predict_als.head()

In [None]:
# Read the re-ranked candidates back from disk.
df_predic = pd.read_parquet(path='files/als_ctb_predicted.parquet.gzip')
df_predic.head(n=5)

In [None]:
# Interactions used as ground truth for the final evaluation.
df_test_full = pd.read_parquet(path='files/test_full.parquet.gzip')

In [None]:
# Baseline: evaluate the candidate order given by the `rank` column
# (top_N=20 is presumably the cut-off used by compute_metrics; see tools).
true_pairs_als = df_test_full[['user_id', 'item_id']]
metrics_als = compute_metrics(
    true_pairs_als,
    df_predic,
    top_N=20,
    rank_col='rank',
)
metrics_als

In [None]:
# Re-ranked: same evaluation, but ordered by the `rank_ctb` column.
true_pairs_ctb = df_test_full[['user_id', 'item_id']]
metrics_als_ctb = compute_metrics(
    true_pairs_ctb,
    df_predic,
    top_N=20,
    rank_col='rank_ctb',
)
metrics_als_ctb

In [None]:
# Pair the two results per metric name: [value for `rank`, value for `rank_ctb`].
full_metrics = {
    metric_name: [als_value, metrics_als_ctb[metric_name]]
    for metric_name, als_value in metrics_als.items()
}
full_metrics

In [None]:
# One column per metric; row 0 holds the `rank` result, row 1 the `rank_ctb` result.
metrics_table = pd.DataFrame(full_metrics)
metrics_table.to_csv('files/metrics.csv')