In [1]:
import pandas as pd
import numpy as np
#from prepare_data import make_features
from catboost import CatBoostRanker,CatBoostClassifier, Pool, MetricVisualizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle
from tools import compute_metrics
from copy import deepcopy

In [6]:
df_test = pd.read_parquet('files/df_test.parquet.gzip')

df_predict_als = pd.read_parquet('files/df_predict_train.parquet.gzip')

In [4]:
#make_features() #Генерация признаков. Сохранение в файл user_features,csv item_features.csv

In [10]:
user_features = pd.read_parquet('files/user_features.parquet.gzip')
item_features = pd.read_parquet('files/item_features.parquet.gzip')

In [11]:
user_features.head()

Unnamed: 0_level_0,timespent_sum,time_spent_mean,reaction_sum,reaction_mean,cou_posts
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,10.0,0.294118,0.0,0.0,34
1,75.0,0.949367,0.0,0.0,79
2,31.0,0.607843,1.0,0.019608,51
3,124.0,0.512397,0.0,0.0,242
4,29.0,0.208633,0.0,0.0,139


In [12]:
item_features.head()

Unnamed: 0,item_id,likes,no_likes,dislikes,sum_time,mean_time,source_id,embeddings
0,0,0,13,2,10.0,0.666667,7340,"[0.10458118, 0.047880154, 0.030944156, -0.0351..."
1,1,0,1415,4,565.0,0.398168,6284,"[0.035625108, -0.039264094, -0.03310334, -0.04..."
2,2,0,146,13,251.0,1.578616,12766,"[0.08418761, 0.006732465, -0.0037112322, -0.02..."
3,3,0,230,0,69.0,0.3,14734,"[0.049901545, 0.039079394, -0.03890682, -0.053..."
4,4,0,1114,1,239.0,0.21435,22557,"[0.09303163, 0.023448057, 0.0029488814, -0.017..."


In [13]:
item_features['source_id'] = item_features['source_id'].astype('category')
item_features['item_id'] = item_features['item_id'].astype('int32')
item_features['likes'] = item_features['likes'].astype('int32')
item_features['no_likes'] = item_features['no_likes'].astype('int32')
item_features['dislikes'] = item_features['dislikes'].astype('int32')

## Подготовка данных для ранжирования

In [14]:
df_predict_als = df_predict_als.explode('item_id').astype('int32')
df_predict_als['rank'] = (df_predict_als.groupby('user_id').cumcount() + 1).astype('int32')
df_predict_als.head()

Unnamed: 0,user_id,item_id,rank
0,0,39846,1
0,0,55041,2
0,0,165339,3
0,0,54559,4
0,0,7286,5


In [13]:
#positive
df_train = df_predict_als.merge(df_test, on=['user_id', 'item_id'], how='inner')
df_train['target'] = 1
df_train['target'] = df_train['target'].astype('int8')
df_train

Unnamed: 0,user_id,item_id,rank,timespent,reaction,target
0,1,107288,11,1,0,1
1,7,157919,26,3,0,1
2,19,125258,34,2,0,1
3,21,65403,30,4,0,1
4,28,152712,9,1,0,1
...,...,...,...,...,...,...
388473,1000167,225171,3,1,0,1
388474,1000170,77435,72,18,0,1
388475,1000172,111648,11,2,1,1
388476,1000172,42481,76,3,0,1


In [14]:
# negative
neg = df_predict_als.merge(df_test, on=['user_id', 'item_id'], how='left')
neg = neg[neg['timespent'].isnull()]
neg = neg.groupby('user_id').sample(frac=0.07)
neg['target'] = 0
neg['timespent'] = neg['timespent'].astype('float16')
neg['reaction'] = neg['reaction'].astype('float16')
neg['target'] = neg['target'].astype('int8')
neg.head()

Unnamed: 0,user_id,item_id,rank,timespent,reaction,target
24,0,58347,25,,,0
1,0,55041,2,,,0
53,0,198449,54,,,0
87,0,3675,88,,,0
95,0,196454,96,,,0


In [15]:
df_train = pd.concat([df_train,neg])
df_train.head()

Unnamed: 0,user_id,item_id,rank,timespent,reaction,target
0,1,107288,11,1.0,0.0,1
1,7,157919,26,3.0,0.0,1
2,19,125258,34,2.0,0.0,1
3,21,65403,30,4.0,0.0,1
4,28,152712,9,1.0,0.0,1


In [16]:
#split train/test user_ids
cat_train_id, cat_test_id = train_test_split(df_test['user_id'].unique(), test_size=0.2)

#split train for validate user_ids
cat_train_id, cat_eval_id =  train_test_split(cat_train_id, test_size=0.1)

In [17]:
cols = ['user_id', 'item_id', 'rank', 'target']

#Train data
ctb_train = shuffle(
    df_train.loc[df_train['user_id'].isin(cat_train_id)][cols]
)

#Test data
ctb_test = shuffle(
    df_train.loc[df_train['user_id'].isin(cat_test_id)][cols]
)

#Val data
ctb_val = shuffle(
    df_train.loc[df_train['user_id'].isin(cat_eval_id)][cols]
)


In [16]:
item_col = ['item_id', 'likes', 'dislikes', 'sum_time', 'mean_time', 'source_id']

In [19]:
train_feature = ctb_train.merge(item_features[item_col], 
                                on=['item_id'], 
                                how='left'
                               ).merge(user_features, on=['user_id'], how='left')

val_feature = ctb_val.merge(item_features[item_col], 
                                on=['item_id'], 
                                how='left'
                               ).merge(user_features, on=['user_id'], how='left')

test_feature = ctb_test.merge(item_features[item_col], 
                                on=['item_id'], 
                                how='left'
                               ).merge(user_features, on=['user_id'], how='left')

# X,y

In [20]:
drop_col = ['user_id', 'item_id']
target_col = ['target']
cat_col = ['source_id']

In [21]:
X_train, y_train = train_feature.drop(columns=drop_col + target_col, axis=1), train_feature[target_col]
X_val, y_val = val_feature.drop(columns=drop_col + target_col, axis=1), val_feature[target_col]
X_test, y_test = test_feature.drop(columns=drop_col + target_col, axis=1), test_feature[target_col]

In [22]:
est_param = {
    'subsample': 0.9,
    'max_depth': 4,
    'n_estimators': 150,
    'learning_rate':0.1,
    'thread_count': -1,
    'random_state': 42,
    'verbose': 100,
}

ctb_model = CatBoostClassifier(**est_param)

In [23]:
ctb_model.fit(X_train, y_train, 
              eval_set=(X_val, y_val), 
              early_stopping_rounds=20, 
              cat_features=cat_col, 
              plot=True)
ctb_model.save_model('ctb_model')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6157866	test: 0.6158290	best: 0.6158290 (0)	total: 1.23s	remaining: 3m 3s
100:	learn: 0.1653109	test: 0.1656580	best: 0.1656580 (100)	total: 1m 20s	remaining: 39s
149:	learn: 0.1628326	test: 0.1631872	best: 0.1631872 (149)	total: 1m 57s	remaining: 0us

bestTest = 0.1631872018
bestIteration = 149



In [24]:
y_pred = ctb_model.predict_proba(X_test)
'ROC_AUC = {:.2f}'.format(roc_auc_score(y_test, y_pred[:,1]))

'ROC_AUC = 0.83'

# Метрика на глобальном тесте

In [17]:
als_ctb_data = df_predict_als.merge(item_features[item_col], 
                                on=['item_id'], 
                                how='left'
                               ).merge(user_features, on=['user_id'], how='left')
als_ctb_data = als_ctb_data.drop(columns=['user_id', 'item_id'])
als_ctb_data .head()

Unnamed: 0,rank,likes,dislikes,sum_time,mean_time,source_id,timespent_sum,time_spent_mean,reaction_sum,reaction_mean,cou_posts
0,1,37,34,77729.0,1.277408,12893,10.0,0.294118,0.0,0.0,34
1,2,25,94,98267.0,4.83312,13985,10.0,0.294118,0.0,0.0,34
2,3,19,20,26402.0,0.884163,854,10.0,0.294118,0.0,0.0,34
3,4,18,75,26420.0,0.584229,1807,10.0,0.294118,0.0,0.0,34
4,5,21,8,47610.0,1.108034,6629,10.0,0.294118,0.0,0.0,34


In [20]:
ctb_model = CatBoostClassifier()
ctb_model.load_model('ctb_model')

<catboost.core.CatBoostClassifier at 0x29189ee7a60>

In [21]:
df_predict_als['ctb_pred'] = ctb_model.predict_proba(als_ctb_data)[:,1].astype('float32')
df_predict_als.head()

Unnamed: 0,user_id,item_id,rank,ctb_pred
0,0,39846,1,0.082326
0,0,55041,2,0.024407
0,0,165339,3,0.038186
0,0,54559,4,0.025324
0,0,7286,5,0.019586


In [22]:
df_predict_als = df_predict_als.sort_values(by=['user_id', 'ctb_pred'], ascending=[True, False])
df_predict_als['rank_ctb'] = (df_predict_als.groupby('user_id').cumcount() + 1).astype('int32')
#df_predict_als.to_parquet('files/als_ctb_predicted.parquet.gzip')
df_predict_als.head()

Unnamed: 0,user_id,item_id,rank,ctb_pred,rank_ctb
0,0,31995,100,0.096122,1
0,0,39846,1,0.082326,2
0,0,93615,12,0.04425,3
0,0,1537,7,0.040119,4
0,0,19011,17,0.038933,5


In [2]:
df_predic = pd.read_parquet('files/als_ctb_predicted.parquet.gzip')
df_predic.head()

Unnamed: 0,user_id,item_id,rank,ctb_pred,rank_ctb
0,0,39846,1,0.149182,1
0,0,31995,100,0.126424,2
0,0,1537,7,0.044547,3
0,0,93615,12,0.043901,4
0,0,165339,3,0.035277,5


In [None]:
df_test_full = pd.read_parquet('files/test_full.parquet.gzip')

In [5]:
metrics_als = compute_metrics(df_test_full[['user_id', 'item_id']], df_predic, top_N=20, rank_col='rank')
metrics_als

Precision@20    0.007081
Recall@20       0.024146
MAP@20          0.007839
MRR             0.033904
dtype: float64

In [6]:
metrics_als_ctb = compute_metrics(df_test_full[['user_id', 'item_id']], df_predic, top_N=20, rank_col='rank_ctb')
metrics_als_ctb

Precision@20    0.010329
Recall@20       0.034523
MAP@20          0.011479
MRR             0.033904
dtype: float64