In [1]:
import pandas as pd

from utils import *
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score
from lightfm import LightFM
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
from recbole.utils import init_logger, get_model, init_seed
import pickle
from logging import getLogger
import torch
from recbole.data import create_dataset, data_preparation
from models import *
from implicit.nearest_neighbours import CosineRecommender, BM25Recommender, TFIDFRecommender
import scipy.sparse as sp



In [2]:
class COSIN:

    def __init__(self, data, top_N=10, mdl_params={}, filter_already_liked_items=True):
        self.data=data
        self.model = CosineRecommender(**mdl_params)
        self.filter_already_liked_items = filter_already_liked_items
        self.top_N = top_N
        self.users_inv_mapping = dict(enumerate(data['user_id'].unique()))
        self.users_mapping = {v: k for k, v in self.users_inv_mapping.items()}
        self.items_inv_mapping = dict(enumerate(data['item_id'].unique()))
        self.items_mapping = {v: k for k, v in self.items_inv_mapping.items()}

    def get_coo_matrix(self, df,
                       user_col='user_id',
                       item_col='item_id',
                       weight_col=None):
        if weight_col is None:
            weights = np.ones(len(df), dtype=np.float32)
        else:
            weights = df[weight_col].astype(np.float32)
        interaction_matrix = sp.coo_matrix((
            weights,
            (
                df[user_col].map(self.users_mapping),
                df[item_col].map(self.items_mapping)
            )
        ))
        return interaction_matrix

    def train(self):
        # preprocess matrix
        self.train_mat = self.get_coo_matrix(df = self.data).tocsr()
        self.model.fit(self.train_mat)
        return

 

    def predict(self, users):
        userids = [self.users_mapping[u] for u in users]
        recs, scores = self.model.recommend(userids, self.train_mat[userids], self.top_N,
                               filter_already_liked_items=self.filter_already_liked_items)

        a = pd.DataFrame({'users':users, 'item_id':(r for r in recs)})
        a = a.explode('item_id').reset_index(drop=True)
        a['rnk'] = a.groupby('users').cumcount()+1
        a['items'] = a['item_id'].map(self.items_inv_mapping)
        return a

In [3]:
with open('trained_models/gru_supermarket_mapping.pickle', 'rb') as f:
    mapper = pickle.load(f)

with open('trained_models/gru_supermarket_embeddings.pickle', 'rb') as f:
    embeddings = pickle.load(f)

In [4]:
def add_embeddings(df):
    def embeddings_giver(row):
        if str(row) in mapper['item_id']:
            internal_index = mapper['item_id'][str(row)]
            emb = embeddings[:][internal_index]
        else:
            emb = np.ones(64)
        return emb
    column_names = []
    df['embedding'] = df['item_id'].apply(embeddings_giver)
    for i in range(64):
        column_names.append(f"column_{i + 1}")

    # Now you can assign values to these new columns based on your needs, for example:
    # Let's say you want to fill the new columns with values from the 'embedding' column
    df[column_names] = pd.DataFrame(df.embedding.tolist(), index= df.index)
    df.drop('embedding', axis=1, inplace=True)
    return df, column_names

In [5]:
interactions, interactions_val, interactions_val_target = read_data_supermarket()

In [60]:
all_data = pd.read_csv('train_val_supermarket_vov_feats.csv', index_col='Unnamed: 0')

In [7]:
train, test = split_data(interactions)
items_df = pd.read_csv('../content/items_supermarket.csv').drop('Unnamed: 0', axis=1)
X_test, y_test = split_last_element(test)

100%|██████████| 65319/65319 [00:00<00:00, 521843.32it/s]


In [8]:
devices_dict = {}
devices = interactions.device_id.unique()
for i in range(0, len(devices)):
    devices_dict[devices[i]] = i

In [9]:
def add_devices(df, father_df):
    def device_mapper(user_id):
        device = father_df.loc[father_df['user_id'] == user_id, 'device_id'].values[0]
        if device in devices_dict:
            return devices_dict[device]
        else:
            return -1
    df['device_number'] = df['user_id'].apply(device_mapper)
    return df


In [10]:
cosin_model = COSIN(X_test, top_N=50)

In [11]:
cosin_model.train()



  0%|          | 0/12080 [00:00<?, ?it/s]

In [12]:
cos_candidates = cosin_model.predict(X_test['user_id'].unique())

In [13]:
cos_candidates = cos_candidates[['users', 'rnk', 'items']]

In [14]:
cos_candidates.columns = ['user_id', 'rank_cos', 'item_id']

In [15]:
cos_candidates['item_id'] = cos_candidates['item_id']

In [16]:
cos_candidates['item_id']

0          116834.0
1          101390.0
2          102557.0
3          115050.0
4          113001.0
             ...   
3265945         NaN
3265946         NaN
3265947         NaN
3265948         NaN
3265949         NaN
Name: item_id, Length: 3265950, dtype: float64

In [17]:
popular_model = train_popular_model(train)
popular_candidates = popularmakePreds(popular_model, X_test, 50)

cooc_models = {}
cooc_reccomender = train_cooc_model(train)
for device_id in train['device_id'].unique():
    cooc_models[device_id] = CoocurenceRecommender(cooc_reccomender[device_id])

100%|██████████| 25/25 [00:01<00:00, 20.55it/s]


In [18]:
X_test['user_id'].nunique(), X_test['user_id'].nunique() * 50

(65319, 3265950)

In [19]:
coon_candidates = coocmakePreds(cooc_models, X_test, 50)

In [20]:
candidates = pd.merge(coon_candidates, popular_candidates, on=['user_id', 'item_id'], how='outer')
candidates = pd.merge(candidates, cos_candidates, on=['user_id', 'item_id'], how='outer')

In [21]:
candidates

Unnamed: 0,user_id,item_id,rank_cooc,rank_popular,rank_cos
0,9127023800,113745,1.0,,
1,9127023800,100826,2.0,39.0,
2,9127023800,109607,3.0,11.0,
3,9127023800,103569,4.0,,
4,9127023800,111425,5.0,,
...,...,...,...,...,...
7225565,16209974490,,,,46.0
7225566,16209974490,,,,47.0
7225567,16209974490,,,,48.0
7225568,16209974490,,,,49.0


In [61]:
all_data.drop('local_date', axis=1, inplace=True)

In [62]:
all_data = all_data.rename(columns={'local_dt':'local_date', 'receipt_id':'user_id'})

In [24]:
y_test['target'] = 1

In [25]:
items_nan = items_df[~items_df['item_id'].isin(candidates['item_id'])]

In [26]:
user_fake = {}
for user in X_test['user_id'].unique():
    user_fake[user] = list(items_nan.sample(15)['item_id'].values)

In [27]:
user_fake_df = pd.DataFrame(list(user_fake.items()))

In [28]:
user_fake_df.columns = ['user_id', 'item_id']
user_fake_df = user_fake_df.explode('item_id')

In [29]:
user_fake_df['rank_cooc'] = np.nan
user_fake_df['rank_popular'] = np.nan
user_fake_df['rank_cos'] = np.nan

In [30]:
user_fake_df

Unnamed: 0,user_id,item_id,rank_cooc,rank_popular,rank_cos
0,9127023800,113349,,,
0,9127023800,113227,,,
0,9127023800,105217,,,
0,9127023800,109547,,,
0,9127023800,113564,,,
...,...,...,...,...,...
65318,16209974490,110479,,,
65318,16209974490,111208,,,
65318,16209974490,103867,,,
65318,16209974490,101459,,,


In [31]:
candidates = pd.concat([candidates, user_fake_df])

In [32]:
candidates = pd.merge(candidates, y_test, on=['user_id', 'item_id'], how='outer')
candidates

Unnamed: 0,user_id,item_id,rank_cooc,rank_popular,rank_cos,target
0,9127023800,113745,1.0,,,
1,9127023800,100826,2.0,39.0,,
2,9127023800,109607,3.0,11.0,,
3,9127023800,103569,4.0,,,
4,9127023800,111425,5.0,,,
...,...,...,...,...,...,...
8246235,16209321711,101472,,,,1.0
8246236,16209706543,101519,,,,1.0
8246237,16209827576,114607,,,,1.0
8246238,16209947725,110974,,,,1.0


In [33]:
candidates = pd.merge(candidates, items_df, on='item_id', how="left")

In [34]:
# Remove any remaining non-Russian characters (if needed)
import re
def keep_only_russian(text):
    russian_pattern = re.compile('[А-Яа-я]+')
    try:
        answer = ' '.join(russian_pattern.findall(text))
    except:
        return 'Не опознан'
    return answer



In [35]:
candidates

Unnamed: 0,user_id,item_id,rank_cooc,rank_popular,rank_cos,target,name
0,9127023800,113745,1.0,,,,Сиг-ты Winston Compact Plus Blue МТ
1,9127023800,100826,2.0,39.0,,,Вермишель б/п Экспресс 50гр курица м/уп.
2,9127023800,109607,3.0,11.0,,,Пакет ПНД 340(+2*90)*600мм 23мкм Майка белый с...
3,9127023800,103569,4.0,,,,К-са Докторская по-Стародворски вар./Стародвор...
4,9127023800,111425,5.0,,,,Пирожок жар.с капустой 75гр /ПО Переслегино/
...,...,...,...,...,...,...,...
8246235,16209321711,101472,,,,1.0,Водка Пшеничная 40% 0.1л стакан /ООО Чебоксарс...
8246236,16209706543,101519,,,,1.0,Водка Сормовская люкс 40% 0.7л /Сордис/
8246237,16209827576,114607,,,,1.0,Сухарики Три корочки 40гр ржаные томат с зелен...
8246238,16209947725,110974,,,,1.0,Пиво Арсенальное ледяное светлое пастер. 4.7% ...


In [36]:
candidates['target'].fillna(0, inplace=True)

In [37]:
candidates.target.value_counts()

target
0.0    8180921
1.0      65319
Name: count, dtype: int64

In [66]:
pos = candidates[candidates['target']==1]
pos

Unnamed: 0,user_id,item_id,rank_cooc,rank_popular,rank_cos,target,name
289,9128151500,117146,31.0,22.0,,1.0,Яйцо куриное столовое 1кат /Птицефабрика Борки/
482,9129418371,112734,6.0,10.0,3.0,1.0,Салат Местный /ПО Переслегино/
531,9129533247,115873,1.0,1.0,,1.0,Хлеб Дарницкий 600гр нарезка м/уп. /Пореченски...
556,9129542359,107560,24.0,,,1.0,Минтай жар. в тесте/ПО Переслегино/
635,9129658696,100316,9.0,,9.0,1.0,Батон Нарезной 1с 300гр нарезка м/уп. / /
...,...,...,...,...,...,...,...
8246235,16209321711,101472,,,,1.0,Водка Пшеничная 40% 0.1л стакан /ООО Чебоксарс...
8246236,16209706543,101519,,,,1.0,Водка Сормовская люкс 40% 0.7л /Сордис/
8246237,16209827576,114607,,,,1.0,Сухарики Три корочки 40гр ржаные томат с зелен...
8246238,16209947725,110974,,,,1.0,Пиво Арсенальное ледяное светлое пастер. 4.7% ...


In [67]:
neg = candidates[candidates['target']==0].sample(int(100_000))
# neg = pd.merge(neg, items_df, on='item_id', how="left")
# neg['name'] = neg['name'].apply(keep_only_russian)
neg

Unnamed: 0,user_id,item_id,rank_cooc,rank_popular,rank_cos,target,name
2474816,11446568269,101090,,34.0,,0.0,Вино Густаре стол.кр. п/сл.11% 0.75л бут/Вилаш...
2954830,12482031146,106299,,12.0,,0.0,Крупа ячневая
3657264,14528312335,111066,,38.0,,0.0,Пиво Криница Крепкое свет.паст.6.5% 1.4л ПЭТ /...
1107199,14107628223,115879,22.0,18.0,,0.0,Хлеб Подовый ржаной простой 450гр нарезка м/уп...
1771959,9699466264,105784,,22.0,,0.0,Косточка куриная суповая зам. пакет /Птицефабр...
...,...,...,...,...,...,...,...
416709,11064835239,104892,23.0,,,0.0,Квас Хлебный 2.5л ПЭТ/Брянскпиво/
2429654,11343712959,106664,,38.0,,0.0,Лимоны
2507582,11535046963,107737,,13.0,,0.0,Молоко разливное 4% 1л /Зеленые фермы/
7098628,15831221851,105645.0,,,42.0,0.0,Корзина ритуальная К20


In [68]:
import warnings
from pandas.errors import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [69]:
neg, embedding_columns = add_embeddings(neg)

In [70]:
pos, _ = add_embeddings(pos)

In [71]:
neg = add_devices(neg, test)

In [72]:
pos = add_devices(pos, test)

In [73]:
all_data.drop(columns=['device_id', 'user_id'], inplace=True)

KeyError: "['device_id', 'user_id'] not found in axis"

In [None]:
pos = pos.merge(interactions[['user_id', 'local_date']], on='user_id', how="inner").drop_duplicates()

In [None]:
all_data['item_id'] = all_data['item_id'].astype(np.int64)
pos['item_id'] = pos['item_id'].astype(np.int64)

In [None]:
pos['local_date'] = pd.to_datetime(pos['local_date'], format='%Y-%m-%d', errors='ignore')
all_data['local_date'] = pd.to_datetime(all_data['local_date'], format='%Y-%m-%d', errors='ignore')
pos = pos.sort_values(by=['local_date'])
all_data = all_data.sort_values(by=['local_date'])

In [None]:
pos = pos.merge(all_data, on=['local_date', 'item_id'], how='inner').drop_duplicates(subset=['user_id', 'local_date', 'item_id'])

In [74]:
columns = ['name_firstword', 'name_fstsecword',
       'price', 'quantity', 'count_grp_x_x', 'quantity_sum_grp_x_x',
       'price_mean_x_x', 'count_grp_7dago_x_x', 'quantity_sum_7dago_x_x',
       'price_mean_7dago_x_x', 'count_grp_1week_x_x', 'quantity_sum_1week_x_x',
       'price_mean_1week_x_x', 'count_grp_2week_x_x', 'quantity_sum_2week_x_x',
       'price_mean_2week_x_x', 'count_grp_31days_x_x',
       'quantity_sum_31days_x_x', 'price_mean_31days_x_x', 'item_trend_q7d',
       'item_trend_c7d', 'item_trend_qg7d', 'item_trend_c1w2w',
       'item_trend_c2w31d', 'item_trend_p7d', 'item_trend_p1w',
       'item_trend_p2w', 'item_trend_p31d', 'item_trend_p1w31d',
       'count_grp_x_y', 'quantity_sum_grp_x_y',
       'price_mean_x_y', 'count_grp_7dago_x_y', 'quantity_sum_7dago_x_y',
       'price_mean_7dago_x_y', 'count_grp_1week_x_y', 'quantity_sum_1week_x_y',
       'price_mean_1week_x_y', 'count_grp_2week_x_y', 'quantity_sum_2week_x_y',
       'price_mean_2week_x_y', 'count_grp_31days_x_y',
       'quantity_sum_31days_x_y', 'price_mean_31days_x_y', 'ctgr_trend_c1w',
       'ctgr_trend_c2w', 'ctgr_trend_c31d', 'ctgr_trend_q1w', 'ctgr_trend_q2w',
       'ctgr_trend_q31d', 'ctgr_share_c', 'ctgr_share_q', 'ctgr_share_c7d',
       'ctgr_share_q7d', 'ctgr_share_c1w', 'ctgr_share_q1w', 'ctgr_share_c2w',
       'ctgr_share_q2w', 'ctgr_share_c31d', 'ctgr_share_q31d', 'count_grp_x',
       'quantity_sum_grp_x', 'price_mean_x', 'count_grp_7dago_x',
       'quantity_sum_7dago_x', 'price_mean_7dago_x', 'count_grp_1week_x',
       'quantity_sum_1week_x', 'price_mean_1week_x', 'count_grp_2week_x',
       'quantity_sum_2week_x', 'price_mean_2week_x', 'count_grp_31days_x',
       'quantity_sum_31days_x', 'price_mean_31days_x', 'ctgr2_trend_c1w',
       'ctgr2_trend_c2w', 'ctgr2_trend_c31d', 'ctgr2_trend_q1w',
       'ctgr2_trend_q2w', 'ctgr2_trend_q31d', 'ctgr2_share_c', 'ctgr2_share_q',
       'ctgr2_share_c7d', 'ctgr2_share_q7d', 'ctgr2_share_c1w',
       'ctgr2_share_q1w', 'ctgr2_share_c2w', 'ctgr2_share_q2w',
       'ctgr2_share_c31d', 'ctgr2_share_q31d']

In [75]:
neg = neg.merge(interactions[['user_id', 'local_date']], on='user_id', how="inner").drop_duplicates()

In [76]:
neg = neg[neg['item_id']>=0.0]

In [77]:
neg['local_date'] = pd.to_datetime(neg['local_date'], format='%Y-%m-%d', errors='ignore')
neg['item_id'] = neg['item_id'].astype(np.int64)

In [78]:
neg = neg.sort_values(by=['local_date'])

In [79]:
neg = pd.merge_asof(neg, all_data, on="local_date", by='item_id', direction='backward')

In [80]:
ctb_train_users, ctb_test_users = train_test_split(candidates,
                                                       random_state=1,
                                                       test_size=0.2)

ctb_train_users, ctb_eval_users = train_test_split(ctb_train_users,
                                                       random_state=1,
                                                       test_size=0.1)

In [81]:
select_col = ['user_id', 'item_id', 'rank_popular', 'rank_cos', 'device_number', 'rank_cooc', 'name_y', \
'target'] + embedding_columns + columns

    # Catboost train
ctb_train = shuffle(
        pd.concat([
            pos[pos['user_id'].isin(ctb_train_users['user_id'])],
            neg[neg['user_id'].isin(ctb_train_users['user_id'])]
        ])[select_col]
    )

    # Catboost test
ctb_test = shuffle(
        pd.concat([
            pos[pos['user_id'].isin(ctb_test_users['user_id'])],
            neg[neg['user_id'].isin(ctb_test_users['user_id'])]
        ])[select_col]
    )

    # for early stopping
ctb_eval = shuffle(
        pd.concat([
            pos[pos['user_id'].isin(ctb_eval_users['user_id'])],
            neg[neg['user_id'].isin(ctb_eval_users['user_id'])]
        ])[select_col]
    )


In [82]:
drop_cols = ['target', 'item_id', 'user_id']
X_train_catboost, y_train_catboost = ctb_train.drop(drop_cols, axis=1), ctb_train['target']
X_eval_catboost, y_eval_catboost = ctb_eval.drop(drop_cols, axis=1), ctb_eval['target']
X_test_catboost, y_test_catboost = ctb_test.drop(drop_cols, axis=1), ctb_test['target']

est_params = {
        'subsample': 0.9,
        'max_depth': 5,
        'n_estimators': 2000,
        'learning_rate': 0.1,
        'thread_count': 20,
        'random_state': 42,
        'verbose': 200,
    }

In [83]:
X_train_catboost[['name_y', 'name_firstword', 'name_fstsecword']] = X_train_catboost[['name_y', 'name_firstword', 'name_fstsecword']].astype(str)

In [84]:
X_train_catboost[['name_y', 'name_firstword', 'name_fstsecword']] = X_train_catboost[['name_y', 'name_firstword', 'name_fstsecword']].fillna(' ')

In [85]:
X_train_catboost['name_y']

88122     Сметана Молочный гостинец гр пл ст ГП Молочный...
1691      Морож Фабрика мороженого Славестино гр эскимо ...
45016                                                   nan
997684                                                  nan
79775                                     Хлеб Дарницкий гр
                                ...                        
7799                                                    nan
12724                 Вермишель б п Экспресс гр курица м уп
23546                                                   nan
74045                          Мак изд Знатные Рожки рифл В
77679                К ты Желейные глаз апельсиновые Яшкино
Name: name_y, Length: 154552, dtype: object

In [86]:
X_eval_catboost[['name_y', 'name_firstword', 'name_fstsecword']] = X_eval_catboost[['name_y', 'name_firstword', 'name_fstsecword']].astype(str)

In [87]:
X_train_catboost.drop(columns=['name_firstword', 'name_fstsecword'], inplace=True)
X_eval_catboost.drop(columns=['name_firstword', 'name_fstsecword'], inplace=True)

In [88]:
ctb_model = CatBoostClassifier(**est_params)
cat_col = ['device_number']
ctb_model.fit(X_train_catboost,
                y_train_catboost,
                eval_set=(X_eval_catboost, y_eval_catboost),
                early_stopping_rounds=100,
                cat_features=cat_col,
                text_features=['name_y'],
                plot=False)

preds = ctb_model.predict(X_test_catboost)
print(accuracy_score(y_test_catboost, preds))




0:	learn: 0.4654153	test: 0.4654151	best: 0.4654151 (0)	total: 333ms	remaining: 11m 4s


In [150]:
ctb_prediction = ctb_model.predict_proba(X_test_catboost.drop(drop_cols, axis=1, errors='ignore'))
X_test_catboost['ctb_pred'] = ctb_prediction[:, 1]

CatBoostError: There is no trained model to use predict_proba(). Use fit() to train model. Then use this method.

In [None]:
candidates = candidates.sort_values(
     by=['user_id', 'ctb_pred'], ascending=[True, False])
candidates['rank_ctb'] = candidates.groupby('user_id').cumcount() + 1

In [None]:
def compute_metrics(df_true, df_pred, top_N, rank_col):
    result = {}
    test_recs = df_true.set_index(['user_id', 'item_id']).join(df_pred.set_index(['user_id', 'item_id']))
    test_recs = test_recs.sort_values(by=['user_id', rank_col])

    test_recs['users_item_count'] = test_recs.groupby(level='user_id')[rank_col].transform(np.size)
    test_recs['reciprocal_rank'] = (1 / test_recs[rank_col]).fillna(0)
    test_recs['cumulative_rank'] = test_recs.groupby(level='user_id').cumcount() + 1
    test_recs['cumulative_rank'] = test_recs['cumulative_rank'] / test_recs[rank_col]

    users_count = test_recs.index.get_level_values('user_id').nunique()
    for k in range(1, top_N + 1):
        hit_k = f'hit@{k}'
        test_recs[hit_k] = test_recs[rank_col] <= k
        result[f'Precision@{k}'] = (test_recs[hit_k] / k).sum() / users_count
        result[f'Recall@{k}'] = (test_recs[hit_k] / test_recs['users_item_count']).sum() / users_count

    result[f'MAP@{top_N}'] = (test_recs["cumulative_rank"] / test_recs["users_item_count"]).sum() / users_count
    result[f'MRR'] = test_recs.groupby(level='user_id')['reciprocal_rank'].max().mean()
    return pd.Series(result)

In [None]:
compute_metrics(y_test, candidates[['user_id', 'item_id', 'rank_ctb']], 10, rank_col='rank_ctb')

In [175]:
cosin_model_val = COSIN(interactions_val, top_N=50)

In [176]:
cosin_model_val.train()



  0%|          | 0/11298 [00:00<?, ?it/s]

In [177]:
pops_table_test = popularmakePreds(popular_model, interactions_val, 50)

In [178]:
coon_candidates_test = coocmakePreds(cooc_models, interactions_val, 50)

In [179]:
preds = cosin_model_val.predict(interactions_val['user_id'].unique())
preds = preds[['users', 'rnk', 'items']]
preds.columns = ['user_id', 'rank_cos', 'item_id']

In [180]:
candidates_test = pd.merge(coon_candidates_test, pops_table_test, on=['user_id', 'item_id'], how='outer')
candidates_test = pd.merge(candidates_test, preds, on=['user_id', 'item_id'], how='outer')

In [181]:
candidates_test, _ = add_embeddings(candidates_test)

In [182]:
candidates_test = add_devices(candidates_test, interactions_val)

In [183]:
candidates_test = candidates_test.merge(interactions_val[['user_id', 'local_date']], on='user_id', how="inner").drop_duplicates()

In [185]:
candidates_test = candidates_test[candidates_test['item_id']>=0.0]

In [186]:
candidates_test['local_date'] = pd.to_datetime(candidates_test['local_date'], format='%Y-%m-%d', errors='ignore')
candidates_test['item_id'] = candidates_test['item_id'].astype(np.int64)
candidates_test = candidates_test.sort_values(by=['local_date'])

In [187]:
candidates_test = pd.merge_asof(candidates_test, all_data, on="local_date", by='item_id', direction='backward')

In [192]:
candidates_test.rename(columns={'name':'name_y'}, inplace=True)

In [193]:
candidates_test_2 = candidates_test[list(X_train_catboost.columns) + ['item_id', 'user_id']]

In [195]:
candidates_test_2['name_y'] = candidates_test_2['name_y'].astype(str)

In [196]:
ctb_prediction_test = ctb_model.predict_proba(candidates_test_2.drop(['item_id', 'user_id'], axis=1))
candidates_test_2['ctb_pred'] = ctb_prediction_test[:, 1]

: 

In [None]:
candidates_test_2 = candidates_test_2.sort_values(
     by=['user_id', 'ctb_pred'], ascending=[True, False])
candidates_test_2['rank_ctb'] = candidates_test_2.groupby('user_id').cumcount() + 1

In [None]:
interactions_val_target.columns = ['user_id', 'item_id']

In [None]:
interactions_val.to_csv('val_dataset.csv')

In [None]:
compute_metrics(interactions_val_target, candidates_test_2[['user_id', 'item_id', 'rank_ctb']], 10, rank_col='rank_ctb')

In [None]:
candidates_test_2[['user_id', 'item_id', 'rank_ctb']].loc[candidates_test_2['rank_ctb'] == 1].to_csv("val_predictions.csv")

In [None]:
firsts = candidates_test_2[['user_id', 'item_id', 'rank_ctb']].loc[candidates_test_2['rank_ctb'] == 1].user_id

In [None]:
interactions_val_target.user_id.values

In [None]:
for i in firsts:
    real = interactions_val_target.loc[interactions_val_target['receipt_id'] == i]
    pred = candidates_test_2.loc[candidates_test_2['user_id'] == i]
    if real['item_id'].values[0] == pred['item_id'].values[0]:
        print(i)