In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRanker, Pool, MetricVisualizer
from sklearn.model_selection import train_test_split
from tools import compute_metrics
from copy import deepcopy

In [None]:
# Interactions that later cells use as ground truth when building ranking targets.
local_test = pd.read_parquet('files/local_test.parquet.gzip')

# Per-user candidate items; presumably ALS output (judging by the file name) — TODO confirm.
candidates_als = pd.read_parquet('files/candidates_als_train.parquet.gzip')

In [None]:
# reset_index() moves the stored index into a regular column. Later cells merge on
# `user_id`, so presumably that is what the index holds — TODO confirm.
user_features = pd.read_parquet('files/user_features.parquet.gzip').reset_index()
# Item-level features; later cells read `item_id`, `source_id`, `likes`, `no_likes`,
# `dislikes`, `sum_time` and `mean_time` from this frame.
item_features = pd.read_parquet('files/item_features.parquet.gzip')

In [None]:
# Preview the first rows of the user features.
user_features.head()

In [None]:
# Preview the first rows of the item features.
item_features.head()

In [None]:
# Set compact dtypes once: `source_id` is categorical, the id and the
# reaction counters are 32-bit integers.
item_features = item_features.astype({
    'source_id': 'category',
    'item_id': 'int32',
    'likes': 'int32',
    'no_likes': 'int32',
    'dislikes': 'int32',
})

## Подготовка данных для ранжирования

In [None]:
def make_target_rank(predict, true_data, neg_frac=0.07, random_state=None):
    """Build a ranking training set with positives and sampled negatives.

    Candidate pairs that also occur in ``true_data`` are kept as positives.
    From the candidates that do not occur in ``true_data``, a random fraction
    per user is kept as negatives with ``timespent`` set to -1. Rows are then
    ordered per user by ``timespent`` (descending) and numbered 1, 2, ... in
    the new ``rank_targ`` column.

    Parameters
    ----------
    predict : pd.DataFrame
        Candidate pairs; must contain ``user_id`` and ``item_id``.
    true_data : pd.DataFrame
        Observed interactions; must contain ``user_id``, ``item_id``,
        ``timespent`` and ``reaction``.
    neg_frac : float, default 0.07
        Fraction of each user's unmatched candidates kept as negatives.
    random_state : int, numpy Generator/RandomState or None, default None
        Seed passed to the negative sampling; set it for reproducible output.

    Returns
    -------
    pd.DataFrame
        Positives and sampled negatives with the extra ``rank_targ`` (int32)
        column, sorted by ``user_id`` and descending ``timespent``.
    """
    keys = ['user_id', 'item_id']
    df_train = predict.merge(true_data, on=keys, how='inner')

    # Negatives are candidates with no matching row in `true_data`. The merge
    # indicator is used instead of `timespent.isnull()` so that a matched row
    # whose timespent is itself missing is not counted as a negative as well.
    candidates = predict.merge(true_data, on=keys, how='left', indicator=True)
    neg = candidates[candidates['_merge'] == 'left_only'].drop(columns='_merge')
    neg = neg.groupby('user_id').sample(frac=neg_frac, random_state=random_state)
    neg['timespent'] = neg['timespent'].fillna(-1).astype('float16')
    neg['reaction'] = neg['reaction'].astype('float16')

    df_train = pd.concat([df_train, neg]).sort_values(
        by=['user_id', 'timespent'], ascending=[True, False]
    )
    # 1-based position inside each user's block: 1 = largest timespent.
    df_train['rank_targ'] = (df_train.groupby('user_id').cumcount() + 1).astype('int32')
    return df_train

In [None]:
def make_target_rank_2(predict, true_data):
    """Build a ranking training set from observed candidates only.

    Joins ``true_data`` onto the candidate pairs in ``predict`` and keeps the
    rows whose ``timespent`` is >= 0 (unmatched candidates have a missing
    ``timespent`` and are dropped). The remaining rows are ordered per user by
    descending ``timespent`` and numbered 1, 2, ... in ``rank_targ`` (int32).
    """
    joined = predict.merge(true_data, how='left', on=['user_id', 'item_id'])
    has_time = joined['timespent'].ge(0)
    ranked = joined.loc[has_time].sort_values(
        by=['user_id', 'timespent'],
        ascending=[True, False],
    )
    position = ranked.groupby('user_id').cumcount() + 1
    ranked['rank_targ'] = position.astype('int32')
    return ranked

In [None]:
# One row per (user, item): every element of the list-like `item_id` column becomes
# its own row, then all columns are cast to int32.
df_predict_als = candidates_als.explode('item_id').astype('int32')
# `rank` is the 1-based position of the row within its user, in exploded order.
# Presumably the candidate lists are stored best-first — TODO confirm.
df_predict_als['rank'] = (df_predict_als.groupby('user_id').cumcount() + 1).astype('int32')
df_predict_als.head()

In [None]:
#first dataset
#df_train = make_target_rank(df_predict_als, local_test)
#df_train.head()

In [None]:
# Second dataset variant: only candidates with an observed timespent >= 0
# (no sampled negatives).
df_train = make_target_rank_2(df_predict_als, local_test)
df_train.head()

In [None]:
# Upper (96%) and lower (4%) quantiles of the total time per item and per user.
q_1_item = item_features['sum_time'].quantile(0.96)
q_2_item = item_features['sum_time'].quantile(0.04)

q_1_user = user_features['timespent_sum'].quantile(0.96)
q_2_user = user_features['timespent_sum'].quantile(0.04)

# Keep only rows strictly inside the two quantiles (both tails are cut off).
mask_user = (user_features['timespent_sum'] > q_2_user) & (user_features['timespent_sum'] < q_1_user)
mask_item = (item_features['sum_time'] > q_2_item) & (item_features['sum_time'] < q_1_item)

user_features_f = user_features.loc[mask_user]
item_features_f = item_features.loc[mask_item]

# Restrict the training pairs to the retained users and items, then group by user.
user_kept = df_train['user_id'].isin(user_features_f['user_id'])
item_kept = df_train['item_id'].isin(item_features_f['item_id'])
df_train = df_train.loc[user_kept & item_kept].sort_values(by='user_id')

In [None]:
# Split user ids (not rows) so that every user's rows end up in exactly one subset.
# A fixed random_state makes the split identical on every Restart & Run All;
# without it train/val/test membership changed from run to run.
cat_train_id, cat_test_id = train_test_split(
    local_test['user_id'].unique(), test_size=0.2, random_state=0
)

# Carve a validation subset (10% of the remaining users) out of the training users.
cat_train_id, cat_eval_id = train_test_split(cat_train_id, test_size=0.1, random_state=0)

In [None]:
# Columns carried into the ranking datasets: ids, candidate rank and target rank.
cols = ['user_id', 'item_id', 'rank', 'rank_targ']

# Select each subset's rows by user id membership.
ctb_train = df_train.loc[df_train['user_id'].isin(cat_train_id), cols]  # train
ctb_test = df_train.loc[df_train['user_id'].isin(cat_test_id), cols]    # test
ctb_val = df_train.loc[df_train['user_id'].isin(cat_eval_id), cols]     # validation


In [None]:
# Item feature columns that are joined onto the candidate rows (`no_likes` is not used).
item_col = ['item_id', 'likes', 'dislikes', 'sum_time', 'mean_time', 'source_id']

In [None]:
def _attach_features(pairs):
    """Left-join the selected item features and all user features onto ``pairs``."""
    with_items = pairs.merge(item_features[item_col], on=['item_id'], how='left')
    return with_items.merge(user_features, on=['user_id'], how='left')


# Same join for every subset; left joins keep the row order of the input frame.
train_feature = _attach_features(ctb_train)
val_feature = _attach_features(ctb_val)
test_feature = _attach_features(ctb_test)

In [None]:
# Query (group) ids for the ranker: one id per row, taken from `user_id`.
qid_train = train_feature['user_id'].to_numpy()
qid_val = val_feature['user_id'].to_numpy()
qied_test = test_feature['user_id'].to_numpy()

In [None]:
# Inspect the training group ids.
qid_train

# X,y

In [None]:
# Identifier columns that are removed from the feature matrices.
drop_col = ['user_id', 'item_id']
# Label column.
target_col = ['rank_targ']
# Categorical feature. NOTE(review): the Pool cells pass the literal ['source_id']
# rather than this name.
cat_col = ['source_id']

In [None]:
# Features are everything except the identifiers and the label; labels stay
# one-column DataFrames because `target_col` is a list.
non_feature_cols = drop_col + target_col

X_train = train_feature.drop(columns=non_feature_cols)
y_train = train_feature[target_col]

X_val = val_feature.drop(columns=non_feature_cols)
y_val = val_feature[target_col]

X_test = test_feature.drop(columns=non_feature_cols)
y_test = test_feature[target_col]

In [None]:
# Divide all labels by the largest training rank; validation and test use the
# same divisor as train.
# NOTE(review): rank_targ == 1 is the row with the largest timespent, so a smaller
# label means a better item here. CatBoost ranking objectives and metrics (NDCG,
# PrecisionAt) are understood to treat larger labels as more relevant — confirm
# that this direction is intended before trusting the reported ranking metrics.
max_relevance = np.max(y_train)
y_train /= max_relevance
y_test /= max_relevance
y_val /= max_relevance

In [None]:
# CatBoost pools for the three subsets. Rows are grouped per user through
# `group_id`; `source_id` is the only categorical feature.
# (Embedding features are not passed: embedding_features=['embeddings'] stays disabled.)
train_p = Pool(data=X_train, label=y_train, group_id=qid_train, cat_features=['source_id'])

val_p = Pool(data=X_val, label=y_val, group_id=qid_val, cat_features=['source_id'])

test_p = Pool(data=X_test, label=y_test, group_id=qied_test, cat_features=['source_id'])

In [None]:
# Base CatBoost settings shared by every fit; fit_model copies this dict and adds
# the loss function and train_dir on top.
default_parameters = {
    'iterations': 50,
    'custom_metric': ['PrecisionAt:top=20','NDCG:top=20'],
    'verbose': False,
    'random_seed': 0,
    'thread_count':-1,
}

# NOTE(review): fit_model builds its own local `parameters`, so this module-level
# dict is not read by it.
parameters = {}

In [None]:
def fit_model(loss_function, additional_params=None, train_pool=train_p, test_pool=val_p):
    """Train a CatBoostRanker with ``loss_function`` on top of ``default_parameters``.

    Parameters
    ----------
    loss_function : str
        CatBoost loss name; also used as the training directory (``train_dir``).
    additional_params : dict, optional
        Extra CatBoost parameters; they override the defaults and the two
        values derived from ``loss_function``.
    train_pool, test_pool : Pool
        Training pool and evaluation pool. The defaults are bound to the
        ``train_p`` / ``val_p`` objects that exist when this cell is executed.

    Returns
    -------
    CatBoostRanker
        The fitted model.
    """
    # Deep copy so the shared defaults (including the metric list) stay untouched.
    params = deepcopy(default_parameters)
    params.update(loss_function=loss_function, train_dir=loss_function)
    if additional_params is not None:
        params.update(additional_params)

    ranker = CatBoostRanker(**params)
    ranker.fit(train_pool, eval_set=test_pool, plot=True)
    return ranker

In [None]:
# Fit with the RMSE loss; training output goes to the 'RMSE' train_dir.
model = fit_model('RMSE',)

In [None]:
# Fit with the QueryRMSE loss; this replaces the previous `model`.
model = fit_model('QueryRMSE')

In [None]:
# Fit with the YetiRank loss; this replaces the previous `model`, so when the
# notebook is run top to bottom this is the model used by the cells below.
model = fit_model('YetiRank')

In [None]:
# Compare the training curves stored in the listed train dirs (fit_model uses the
# loss name as train_dir).
# NOTE(review): only RMSE, QueryRMSE and YetiRank are fitted in this notebook; the
# PairLogit* directories presumably come from earlier runs — confirm they exist.
widget = MetricVisualizer(['RMSE', 'QueryRMSE', 'PairLogit', 'PairLogitPairwise', 'YetiRank'])
widget.start()

In [None]:
# Persist the current `model`. The commented lines are the output paths of
# earlier experiment variants.
#model.save_model('files/ctb_rank_split_data')
#model.save_model('files/ctb_rank_2split_data')
model.save_model('files/ctb_rank_2split_data_quant')

In [None]:
def predict_rank(model, X, data):
    """Score ``X`` with ``model`` and rank the rows of ``data`` per user.

    ``data`` is not modified. The returned copy has a float32 ``score`` column
    (``model.predict(X)``, assigned row by row) and an int32 ``rank_pred``
    column holding the 1-based position of each row within its ``user_id``
    after sorting by ascending score.
    """
    scored = data.assign(score=model.predict(X))
    scored['score'] = scored['score'].astype('float32')
    # Ascending order: the lowest score gets rank_pred == 1.
    # (Original author's note, translated: it is unclear which direction means
    # "more relevant".)
    ordered = scored.sort_values(by=['user_id', 'score'], ascending=[True, True])
    position = ordered.groupby('user_id').cumcount() + 1
    ordered['rank_pred'] = position.astype('int32')
    return ordered

In [None]:
# Rank the held-out test users. `test_feature` is the frame `test_p` was built
# from, so its rows line up one-to-one with the pool's predictions. The previous
# `cat_data` argument is not defined anywhere in this notebook and raised a
# NameError on a fresh kernel.
pred_test = predict_rank(model, test_p, test_feature)
pred_test

# Метрика на глобальном тесте

In [None]:
# Feature matrix for every candidate pair: item and user features are left-joined,
# rows are ordered by user, and the identifier columns are removed.
als_ctb_data = (
    df_predict_als
    .merge(item_features[item_col], on=['item_id'], how='left')
    .merge(user_features, on=['user_id'], how='left')
    .sort_values(by=['user_id'])
    .drop(columns=['user_id', 'item_id'])
)
#als_ctb_data.to_parquet('files/als_ctb_data_full_test.parquet.gzip')
als_ctb_data.head()

In [None]:
# Replace the frame computed in the previous cell with a cached copy from disk.
# NOTE(review): the line that writes this file is commented out above, so the
# cache may be missing or stale relative to `df_predict_als` — confirm that its
# rows match the group ids built in the next cell.
als_ctb_data = pd.read_parquet('files/als_ctb_data_full_test.parquet.gzip')

In [None]:
# Group ids for the full candidate set, in user-sorted order (the same sort that
# is applied when the feature frame is built from `df_predict_als`).
qied_full_test = df_predict_als.sort_values(by=['user_id'])['user_id'].values

In [None]:
# Unlabelled pool used for inference only: features plus per-row group ids.
full_test_p = Pool(
    data=als_ctb_data,
    group_id=qied_full_test,
    cat_features=['source_id'],
    #embedding_features=['embeddings']
)


In [None]:
# Reload the ranker from disk; this replaces the in-memory `model`. The commented
# lines are the paths of earlier experiment variants.
model = CatBoostRanker()
#model.load_model('files/ctb_rank')
#model.load_model('files/ctb_rank_2split_data')
#model.load_model('files/ctb_rank_2split_data')
model.load_model('files/ctb_rank_2split_data_quant')

In [None]:
# `full_test_p` follows the row order of `df_predict_als.sort_values(by=['user_id'])`
# (that is how its group ids were built). predict_rank assigns scores row by row,
# so the id frame must be passed in that same order; passing the unsorted
# `df_predict_als` can attach scores to the wrong (user_id, item_id) rows.
df_predict = predict_rank(model, full_test_p, df_predict_als.sort_values(by=['user_id']))

#df_predict.to_parquet('files/als_ctb_rank_predicted.parquet.gzip')
#df_predict.to_parquet('files/als_ctb_rank_predicted2.parquet.gzip')
df_predict.to_parquet('files/als_ctb_rank_predicted2_quant.parquet.gzip')

In [None]:
# Read back the predictions written by the previous cell so that the metrics
# below describe the model that was just loaded. The earlier path
# ('..._predicted2') belongs to a different experiment variant.
#df_predic = pd.read_parquet('files/als_ctb_rank_predicted.parquet.gzip')
#df_predic = pd.read_parquet('files/als_ctb_rank_predicted2.parquet.gzip')
df_predic = pd.read_parquet('files/als_ctb_rank_predicted2_quant.parquet.gzip')
df_predic.head()

In [None]:
# Ground-truth interactions for the global test; only `user_id` and `item_id`
# are read from it below.
df_test_full = pd.read_parquet('files/test_full.parquet.gzip')

In [None]:
# Evaluate the re-ranked candidates against the global test pairs, ranking by
# `rank_pred`. compute_metrics comes from the project's `tools` module;
# presumably top_N=20 means metrics at cutoff 20 — TODO confirm.
metrics_als_ctb = compute_metrics(df_test_full[['user_id', 'item_id']], df_predic, top_N=20, rank_col='rank_pred')
metrics_als_ctb

In [None]:
# Load the stored metrics table (first CSV column is the index). The commented
# lines appended this run's metrics, named the models and wrote the table back;
# they are disabled, so `metrics_als_ctb` is not part of the table shown here.
metrics = pd.read_csv('files/metrics.csv',index_col=[0])
#metrics_als_ctb = metrics_als_ctb.to_frame().T
#metrics = pd.concat([metrics,metrics_als_ctb])
#metrics['name_model'] = ['implicit_als', 'implicit_als_ctb_class', 'implicit_als_ctb_rank', 'implicit_als_rank2split']
#metrics.to_csv('files/metrics.csv')
metrics

In [None]:
# Bar chart of the stored metrics table.
metrics.plot(kind='bar')