In [1]:
import pandas as pd

from utils import *
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score
from lightfm import LightFM
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
from recbole.utils import init_logger, get_model, init_seed
import pickle
from logging import getLogger
import torch
from recbole.data import create_dataset, data_preparation
from models import *
from implicit.nearest_neighbours import CosineRecommender, BM25Recommender, TFIDFRecommender
import scipy.sparse as sp



In [2]:
class COSIN:

    def __init__(self, data, top_N=10, mdl_params={}, filter_already_liked_items=True):
        self.data=data
        self.model = CosineRecommender(**mdl_params)
        self.filter_already_liked_items = filter_already_liked_items
        self.top_N = top_N
        self.users_inv_mapping = dict(enumerate(data['user_id'].unique()))
        self.users_mapping = {v: k for k, v in self.users_inv_mapping.items()}
        self.items_inv_mapping = dict(enumerate(data['item_id'].unique()))
        self.items_mapping = {v: k for k, v in self.items_inv_mapping.items()}

    def get_coo_matrix(self, df,
                       user_col='user_id',
                       item_col='item_id',
                       weight_col=None):
        if weight_col is None:
            weights = np.ones(len(df), dtype=np.float32)
        else:
            weights = df[weight_col].astype(np.float32)
        interaction_matrix = sp.coo_matrix((
            weights,
            (
                df[user_col].map(self.users_mapping),
                df[item_col].map(self.items_mapping)
            )
        ))
        return interaction_matrix

    def train(self):
        # preprocess matrix
        self.train_mat = self.get_coo_matrix(df = self.data).tocsr()
        self.model.fit(self.train_mat)
        return

 

    def predict(self, users):
        userids = [self.users_mapping[u] for u in users]
        recs, scores = self.model.recommend(userids, self.train_mat[userids], self.top_N,
                               filter_already_liked_items=self.filter_already_liked_items)

        a = pd.DataFrame({'users':users, 'item_id':(r for r in recs)})
        a = a.explode('item_id').reset_index(drop=True)
        a['rnk'] = a.groupby('users').cumcount()+1
        a['items'] = a['item_id'].map(self.items_inv_mapping)
        return a

In [3]:
def add_embeddings(df):
    def embeddings_giver(row):
        if str(row) in mapper['item_id']:
            internal_index = mapper['item_id'][str(row)]
            emb = embeddings[:][internal_index]
        else:
            emb = np.ones(64)
        return emb
    column_names = []
    df['embedding'] = df['item_id'].apply(embeddings_giver)
    for i in range(64):
        column_names.append(f"column_{i + 1}")

    # Now you can assign values to these new columns based on your needs, for example:
    # Let's say you want to fill the new columns with values from the 'embedding' column
    df[column_names] = pd.DataFrame(df.embedding.tolist(), index= df.index)
    df.drop('embedding', axis=1, inplace=True)
    return df, column_names

In [6]:
interactions = pd.read_table('leaderboard/gardening_train.tsv')
interactions_val = pd.read_table('leaderboard/gardening_test.tsv')

In [177]:
all_data = pd.read_csv('leaderboard_new_feats.csv', index_col='Unnamed: 0')

In [180]:
interactions.rename(columns={'receipt_id':'user_id'}, inplace=True)

In [181]:
devices_dict = {}
devices = interactions.device_id.unique()
for i in range(0, len(devices)):
    devices_dict[devices[i]] = i

In [182]:
def add_devices(df, father_df):
    def device_mapper(user_id):
        device = father_df.loc[father_df['user_id'] == user_id, 'device_id'].values[0]
        if device in devices_dict:
            return devices_dict[device]
        else:
            return -1
    df['device_number'] = df['user_id'].apply(device_mapper)
    return df


In [183]:
cosin_model = COSIN(interactions, top_N=50)

In [184]:
cosin_model.train()



  0%|          | 0/12767 [00:00<?, ?it/s]

In [185]:
cos_candidates = cosin_model.predict(interactions['user_id'].unique())

In [186]:
cos_candidates = cos_candidates[['users', 'rnk', 'items']]

In [187]:
cos_candidates.columns = ['user_id', 'rank_cos', 'item_id']

In [188]:
cos_candidates['item_id'] = cos_candidates['item_id']

In [189]:
cos_candidates['item_id']

0         308788
1         300482
2         312268
3         308264
4         309245
           ...  
458545    305594
458546    313827
458547    309583
458548    305474
458549    308826
Name: item_id, Length: 458550, dtype: object

In [190]:
popular_model = train_popular_model(interactions)
popular_candidates = popularmakePreds(popular_model, interactions, 50)

cooc_models = {}
cooc_reccomender = train_cooc_model(train)
for device_id in train['device_id'].unique():
    cooc_models[device_id] = CoocurenceRecommender(cooc_reccomender[device_id])

100%|██████████| 15/15 [00:00<00:00, 76.40it/s]


In [48]:
X_test['user_id'].nunique(), X_test['user_id'].nunique() * 50

(3669, 183450)

In [191]:
coon_candidates = coocmakePreds(cooc_models, interactions, 50)

In [53]:
popular_candidates.dtypes

user_id         object
item_id         object
rank_popular     int64
dtype: object

In [192]:
coon_candidates['item_id'] = coon_candidates['item_id'].astype(str)

In [193]:
candidates = pd.merge(coon_candidates, popular_candidates, on=['user_id', 'item_id'], how='outer')
candidates = pd.merge(candidates, cos_candidates, on=['user_id', 'item_id'], how='outer')

In [194]:
candidates

Unnamed: 0,user_id,item_id,rank_cooc,rank_popular,rank_cos
0,10429502812,302930,1.0,,
1,10827475736,302710,1.0,,
2,10827475736,303178,2.0,,
3,10827475736,302357,3.0,,
4,10827475736,303870,4.0,12.0,
...,...,...,...,...,...
915367,16206741706,305594,,,46.0
915368,16206741706,313827,,,47.0
915369,16206741706,309583,,,48.0
915370,16206741706,305474,,,49.0


In [195]:
all_data.drop('local_date', axis=1, inplace=True)

In [196]:
all_data = all_data.rename(columns={'local_dt':'local_date', 'receipt_id':'user_id'})

In [197]:
y_test['target'] = 1

In [198]:
items_df = pd.DataFrame({'item_id': interactions['item_id'].unique()})

In [199]:
items_nan = items_df[~items_df['item_id'].isin(candidates['item_id'])]

In [200]:
user_fake = {}
for user in X_test['user_id'].unique():
    user_fake[user] = list(items_nan.sample(15)['item_id'].values)

In [201]:
user_fake_df = pd.DataFrame(list(user_fake.items()))

In [202]:
user_fake_df.columns = ['user_id', 'item_id']
user_fake_df = user_fake_df.explode('item_id')

In [203]:
user_fake_df['rank_cooc'] = np.nan
user_fake_df['rank_popular'] = np.nan
user_fake_df['rank_cos'] = np.nan

In [204]:
user_fake_df

Unnamed: 0,user_id,item_id,rank_cooc,rank_popular,rank_cos
0,10007227728,308734,,,
0,10007227728,303737,,,
0,10007227728,309368,,,
0,10007227728,301741,,,
0,10007227728,309862,,,
...,...,...,...,...,...
3668,9995440514,310808,,,
3668,9995440514,309645,,,
3668,9995440514,310313,,,
3668,9995440514,310689,,,


In [205]:
candidates = pd.concat([candidates, user_fake_df])

In [206]:
candidates = pd.merge(candidates, y_test, on=['user_id', 'item_id'], how='outer')
candidates

Unnamed: 0,user_id,item_id,rank_cooc,rank_popular,rank_cos,target
0,10429502812,302930,1.0,,,
1,10827475736,302710,1.0,,,
2,10827475736,303178,2.0,,,
3,10827475736,302357,3.0,,,
4,10827475736,303870,4.0,12.0,,
...,...,...,...,...,...,...
973096,9642569548,307814,,,,1.0
973097,9691433735,305139,,,,1.0
973098,9824154515,308414,,,,1.0
973099,9849397170,301961,,,,1.0


In [207]:
# Remove any remaining non-Russian characters (if needed)
import re
def keep_only_russian(text):
    russian_pattern = re.compile('[А-Яа-я]+')
    try:
        answer = ' '.join(russian_pattern.findall(text))
    except:
        return 'Не опознан'
    return answer



In [208]:
candidates

Unnamed: 0,user_id,item_id,rank_cooc,rank_popular,rank_cos,target
0,10429502812,302930,1.0,,,
1,10827475736,302710,1.0,,,
2,10827475736,303178,2.0,,,
3,10827475736,302357,3.0,,,
4,10827475736,303870,4.0,12.0,,
...,...,...,...,...,...,...
973096,9642569548,307814,,,,1.0
973097,9691433735,305139,,,,1.0
973098,9824154515,308414,,,,1.0
973099,9849397170,301961,,,,1.0


In [209]:
candidates['target'].fillna(0, inplace=True)

In [210]:
candidates.target.value_counts()

target
0.0    969432
1.0      3669
Name: count, dtype: int64

In [211]:
pos = candidates[candidates['target']==1]
pos

Unnamed: 0,user_id,item_id,rank_cooc,rank_popular,rank_cos,target
467800,11188696314,311914,,,2.0,1.0
467932,11199755052,303207,,,35.0,1.0
469167,11853229509,311235,,,38.0,1.0
469858,11984346378,304603,,,37.0,1.0
470182,13003621397,305808,,,20.0,1.0
...,...,...,...,...,...,...
973096,9642569548,307814,,,,1.0
973097,9691433735,305139,,,,1.0
973098,9824154515,308414,,,,1.0
973099,9849397170,301961,,,,1.0


In [212]:
neg = candidates[candidates['target']==0].sample(int(50_000))
# neg = pd.merge(neg, items_df, on='item_id', how="left")
# neg['name'] = neg['name'].apply(keep_only_russian)
neg

Unnamed: 0,user_id,item_id,rank_cooc,rank_popular,rank_cos,target
431303,15949514733,305832,,37.0,,0.0
403433,14646963180,307136,,40.0,,0.0
207680,11790262328,301058,,13.0,,0.0
865032,12863257021,305492,,,41.0,0.0
317475,11715537128,300590,,11.0,,0.0
...,...,...,...,...,...,...
333844,14877351842,312333,,14.0,,0.0
382064,14963050789,305803,,47.0,,0.0
719104,14617516774,301018,,,28.0,0.0
660287,11612286005,,,,19.0,0.0


In [213]:
import warnings
from pandas.errors import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [82]:
neg, embedding_columns = add_embeddings(neg)

In [85]:
pos, _ = add_embeddings(pos)

In [217]:
neg = add_devices(neg, interactions)

In [None]:
pos = add_devices(pos, interactions)

IndexError: index 0 is out of bounds for axis 0 with size 0

In [85]:
all_data.drop(columns=['device_id', 'user_id'], inplace=True)

In [86]:
pos = pos.merge(interactions[['user_id', 'local_date']], on='user_id', how="inner").drop_duplicates()

In [87]:
all_data['item_id'] = all_data['item_id'].astype(np.int64)
pos['item_id'] = pos['item_id'].astype(np.int64)

In [90]:
pos['local_date'] = pd.to_datetime(pos['local_date'], format='%Y-%m-%d', errors='ignore')
all_data['local_date'] = pd.to_datetime(all_data['local_date'], format='%Y-%m-%d', errors='ignore')
pos = pos.sort_values(by=['local_date'])
all_data = all_data.sort_values(by=['local_date'])

In [103]:
pos['local_date'].str[:10]

31645    2021-09-01
4095     2021-09-03
31652    2021-09-07
31662    2021-09-08
31666    2021-09-10
            ...    
31601    2023-08-31
31603    2023-08-31
31608    2023-08-31
31614    2023-08-31
31622    2023-08-31
Name: local_date, Length: 3669, dtype: object

In [104]:
pos['local_date'] = pd.to_datetime(pos['local_date'].str[:10], format='%Y-%m-%d')

In [105]:
pos = pos.merge(all_data, on=['local_date', 'item_id'], how='inner').drop_duplicates(subset=['user_id', 'local_date', 'item_id'])

In [122]:
columns = ['name_firstword', 'name_fstsecword',
       'price', 'quantity', 'count_grp_x_x', 'quantity_sum_grp_x_x',
       'price_mean_x_x', 'count_grp_7dago_x_x', 'quantity_sum_7dago_x_x',
       'price_mean_7dago_x_x', 'count_grp_1week_x_x', 'quantity_sum_1week_x_x',
       'price_mean_1week_x_x', 'count_grp_2week_x_x', 'quantity_sum_2week_x_x',
       'price_mean_2week_x_x', 'count_grp_31days_x_x',
       'quantity_sum_31days_x_x', 'price_mean_31days_x_x', 'item_trend_q7d',
       'item_trend_c7d', 'item_trend_qg7d', 'item_trend_c1w2w',
       'item_trend_c2w31d', 'item_trend_p7d', 'item_trend_p1w',
       'item_trend_p2w', 'item_trend_p31d', 'item_trend_p1w31d','count_grp_x_y', 'quantity_sum_grp_x_y',
       'price_mean_x_y', 'count_grp_7dago_x_y', 'quantity_sum_7dago_x_y',
       'price_mean_7dago_x_y', 'count_grp_1week_x_y', 'quantity_sum_1week_x_y',
       'price_mean_1week_x_y', 'count_grp_2week_x_y', 'quantity_sum_2week_x_y',
       'price_mean_2week_x_y', 'count_grp_31days_x_y',
       'quantity_sum_31days_x_y', 'price_mean_31days_x_y', 'ctgr_trend_c1w',
       'ctgr_trend_c2w', 'ctgr_trend_c31d', 'ctgr_trend_q1w', 'ctgr_trend_q2w',
       'ctgr_trend_q31d', 'ctgr_share_c', 'ctgr_share_q', 'ctgr_share_c7d',
       'ctgr_share_q7d', 'ctgr_share_c1w', 'ctgr_share_q1w', 'ctgr_share_c2w',
       'ctgr_share_q2w', 'ctgr_share_c31d', 'ctgr_share_q31d', 'count_grp_x',
       'quantity_sum_grp_x', 'price_mean_x', 'count_grp_7dago_x',
       'quantity_sum_7dago_x', 'price_mean_7dago_x', 'count_grp_1week_x',
       'quantity_sum_1week_x', 'price_mean_1week_x', 'count_grp_2week_x',
       'quantity_sum_2week_x', 'price_mean_2week_x', 'count_grp_31days_x',
       'quantity_sum_31days_x', 'price_mean_31days_x', 'ctgr2_trend_c1w',
       'ctgr2_trend_c2w', 'ctgr2_trend_c31d', 'ctgr2_trend_q1w',
       'ctgr2_trend_q2w', 'ctgr2_trend_q31d', 'ctgr2_share_c', 'ctgr2_share_q',
       'ctgr2_share_c7d', 'ctgr2_share_q7d', 'ctgr2_share_c1w',
       'ctgr2_share_q1w', 'ctgr2_share_c2w', 'ctgr2_share_q2w',
       'ctgr2_share_c31d', 'ctgr2_share_q31d']

In [107]:
neg = neg.merge(interactions[['user_id', 'local_date']], on='user_id', how="inner").drop_duplicates()

In [115]:
neg = neg[neg['item_id'].notnull()]

In [116]:
neg['item_id'] = neg['item_id'].astype(np.int64)

In [117]:
neg['local_date'] = pd.to_datetime(neg['local_date'], format='%Y-%m-%d', errors='ignore')

In [118]:
neg = neg.sort_values(by=['local_date'])

In [119]:
neg = pd.merge_asof(neg, all_data, on="local_date", by='item_id', direction='backward')

In [120]:
ctb_train_users, ctb_test_users = train_test_split(candidates,
                                                       random_state=1,
                                                       test_size=0.2)

ctb_train_users, ctb_eval_users = train_test_split(ctb_train_users,
                                                       random_state=1,
                                                       test_size=0.1)

In [123]:
select_col = ['user_id', 'item_id', 'rank_popular', 'rank_cos', 'device_number', 'rank_cooc', 'name', \
'target'] + columns

    # Catboost train
ctb_train = shuffle(
        pd.concat([
            pos[pos['user_id'].isin(ctb_train_users['user_id'])],
            neg[neg['user_id'].isin(ctb_train_users['user_id'])]
        ])[select_col]
    )

    # Catboost test
ctb_test = shuffle(
        pd.concat([
            pos[pos['user_id'].isin(ctb_test_users['user_id'])],
            neg[neg['user_id'].isin(ctb_test_users['user_id'])]
        ])[select_col]
    )

    # for early stopping
ctb_eval = shuffle(
        pd.concat([
            pos[pos['user_id'].isin(ctb_eval_users['user_id'])],
            neg[neg['user_id'].isin(ctb_eval_users['user_id'])]
        ])[select_col]
    )


In [124]:
drop_cols = ['target', 'item_id', 'user_id']
X_train_catboost, y_train_catboost = ctb_train.drop(drop_cols, axis=1), ctb_train['target']
X_eval_catboost, y_eval_catboost = ctb_eval.drop(drop_cols, axis=1), ctb_eval['target']
X_test_catboost, y_test_catboost = ctb_test.drop(drop_cols, axis=1), ctb_test['target']

est_params = {
        'subsample': 0.9,
        'max_depth': 5,
        'n_estimators': 2000,
        'learning_rate': 0.1,
        'thread_count': 20,
        'random_state': 42,
        'verbose': 200,
    }

In [126]:
X_train_catboost[['name', 'name_firstword', 'name_fstsecword']] = X_train_catboost[['name', 'name_firstword', 'name_fstsecword']].astype(str)

In [129]:
X_eval_catboost[['name', 'name_firstword', 'name_fstsecword']] = X_eval_catboost[['name', 'name_firstword', 'name_fstsecword']].fillna(' ')

In [131]:
X_train_catboost.drop(columns=['name_firstword', 'name_fstsecword'], inplace=True)
X_eval_catboost.drop(columns=['name_firstword', 'name_fstsecword'], inplace=True)

In [132]:
ctb_model = CatBoostClassifier(**est_params)
cat_col = ['device_number']
ctb_model.fit(X_train_catboost,
                y_train_catboost,
                eval_set=(X_eval_catboost, y_eval_catboost),
                early_stopping_rounds=100,
                cat_features=cat_col,
                text_features=['name'],
                plot=False)

preds = ctb_model.predict(X_test_catboost)
print(accuracy_score(y_test_catboost, preds))

0:	learn: 0.5922627	test: 0.5922599	best: 0.5922599 (0)	total: 137ms	remaining: 4m 34s
200:	learn: 0.1181761	test: 0.1181897	best: 0.1181897 (200)	total: 18.3s	remaining: 2m 44s
400:	learn: 0.1067482	test: 0.1068078	best: 0.1068078 (400)	total: 35.5s	remaining: 2m 21s
600:	learn: 0.0997039	test: 0.0999478	best: 0.0999478 (600)	total: 51.6s	remaining: 2m
800:	learn: 0.0939722	test: 0.0944418	best: 0.0944418 (800)	total: 1m 8s	remaining: 1m 43s
1000:	learn: 0.0886496	test: 0.0893961	best: 0.0893961 (1000)	total: 1m 26s	remaining: 1m 26s
1200:	learn: 0.0842112	test: 0.0851809	best: 0.0851809 (1200)	total: 1m 44s	remaining: 1m 9s
1400:	learn: 0.0801171	test: 0.0812550	best: 0.0812550 (1400)	total: 2m 3s	remaining: 52.7s
1600:	learn: 0.0762976	test: 0.0776817	best: 0.0776817 (1600)	total: 2m 25s	remaining: 36.3s
1800:	learn: 0.0730303	test: 0.0746218	best: 0.0746005 (1799)	total: 2m 43s	remaining: 18.1s
1999:	learn: 0.0700371	test: 0.0717961	best: 0.0717961 (1999)	total: 3m	remaining: 0us



CatBoostError: Invalid type for text_feature[non-default value idx=1,feature_idx=4]=nan : text_features must have string type

In [150]:
ctb_prediction = ctb_model.predict_proba(X_test_catboost.drop(drop_cols, axis=1, errors='ignore'))
X_test_catboost['ctb_pred'] = ctb_prediction[:, 1]

CatBoostError: There is no trained model to use predict_proba(). Use fit() to train model. Then use this method.

In [134]:
def compute_metrics(df_true, df_pred, top_N, rank_col):
    result = {}
    test_recs = df_true.set_index(['user_id', 'item_id']).join(df_pred.set_index(['user_id', 'item_id']))
    test_recs = test_recs.sort_values(by=['user_id', rank_col])

    test_recs['users_item_count'] = test_recs.groupby(level='user_id')[rank_col].transform(np.size)
    test_recs['reciprocal_rank'] = (1 / test_recs[rank_col]).fillna(0)
    test_recs['cumulative_rank'] = test_recs.groupby(level='user_id').cumcount() + 1
    test_recs['cumulative_rank'] = test_recs['cumulative_rank'] / test_recs[rank_col]

    users_count = test_recs.index.get_level_values('user_id').nunique()
    for k in range(1, top_N + 1):
        hit_k = f'hit@{k}'
        test_recs[hit_k] = test_recs[rank_col] <= k
        result[f'Precision@{k}'] = (test_recs[hit_k] / k).sum() / users_count
        result[f'Recall@{k}'] = (test_recs[hit_k] / test_recs['users_item_count']).sum() / users_count

    result[f'MAP@{top_N}'] = (test_recs["cumulative_rank"] / test_recs["users_item_count"]).sum() / users_count
    result[f'MRR'] = test_recs.groupby(level='user_id')['reciprocal_rank'].max().mean()
    return pd.Series(result)

In [None]:
compute_metrics(y_test, candidates[['user_id', 'item_id', 'rank_ctb']], 10, rank_col='rank_ctb')

In [137]:
interactions_val.rename(columns={'receipt_id':'user_id'}, inplace=True)

In [138]:
cosin_model_val = COSIN(interactions_val, top_N=50)

In [139]:
cosin_model_val.train()



  0%|          | 0/7979 [00:00<?, ?it/s]

In [143]:
popular_model = train_popular_model(interactions_val)

In [144]:
pops_table_test = popularmakePreds(popular_model, interactions_val, 50)

In [149]:
cooc_models = {}
cooc_reccomender = train_cooc_model(interactions_val)
for device_id in interactions_val['device_id'].unique():
    cooc_models[device_id] = CoocurenceRecommender(cooc_reccomender[device_id])

100%|██████████| 15/15 [00:00<00:00, 232.89it/s]


In [150]:
coon_candidates_test = coocmakePreds(cooc_models, interactions_val, 50)

In [151]:
preds = cosin_model_val.predict(interactions_val['user_id'].unique())
preds = preds[['users', 'rnk', 'items']]
preds.columns = ['user_id', 'rank_cos', 'item_id']

In [152]:
candidates_test = pd.merge(coon_candidates_test, pops_table_test, on=['user_id', 'item_id'], how='outer')
candidates_test = pd.merge(candidates_test, preds, on=['user_id', 'item_id'], how='outer')

In [181]:
candidates_test, _ = add_embeddings(candidates_test)

In [153]:
candidates_test = add_devices(candidates_test, interactions_val)

In [154]:
candidates_test = candidates_test.merge(interactions_val[['user_id', 'local_date']], on='user_id', how="inner").drop_duplicates()

In [156]:
candidates_test = candidates_test[candidates_test['item_id'].notnull()]

In [158]:
candidates_test['local_date'] = pd.to_datetime(candidates_test['local_date'], format='%Y-%m-%d', errors='ignore')
candidates_test['item_id'] = candidates_test['item_id'].astype(np.int64)
candidates_test = candidates_test.sort_values(by=['local_date'])

In [159]:
candidates_test = pd.merge_asof(candidates_test, all_data, on="local_date", by='item_id', direction='backward')

In [160]:
candidates_test_2 = candidates_test[list(X_train_catboost.columns) + ['item_id', 'user_id']]

In [162]:
candidates_test_2['name'] = candidates_test_2['name'].astype(str)

In [163]:
ctb_prediction_test = ctb_model.predict_proba(candidates_test_2.drop(['item_id', 'user_id'], axis=1))
candidates_test_2['ctb_pred'] = ctb_prediction_test[:, 1]

In [164]:
candidates_test_2 = candidates_test_2.sort_values(
     by=['user_id', 'ctb_pred'], ascending=[True, False])
candidates_test_2['rank_ctb'] = candidates_test_2.groupby('user_id').cumcount() + 1

In [166]:
candidates_test_2.to_csv('preds.csv')

In [167]:
candidates_test_2

Unnamed: 0,rank_popular,rank_cos,device_number,rank_cooc,name,price,quantity,count_grp_x_x,quantity_sum_grp_x_x,price_mean_x_x,...,ctgr2_share_c1w,ctgr2_share_q1w,ctgr2_share_c2w,ctgr2_share_q2w,ctgr2_share_c31d,ctgr2_share_q31d,item_id,user_id,ctb_pred,rank_ctb
50,1.0,,-1,,,,,,,,...,,,,,,,301962,9140868220,0.000295,1
0,2.0,,-1,,,,,,,,...,,,,,,,301966,9140868220,0.000157,2
51,,1.0,-1,,,,,,,,...,,,,,,,302628,9140868220,0.000155,3
26,3.0,,-1,,,,,,,,...,,,,,,,301963,9140868220,0.000120,4
72,,2.0,-1,,,,,,,,...,,,,,,,308317,9140868220,0.000079,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266294,23.0,,-1,,Абрикос Лель,899.0,1.0,1.0,1.0,899.0,...,,,,,,,300028,16205221758,0.001964,66
266329,,13.0,-1,,Кашпо Сантино ТЕРРА пластик см см л крем,149.0,6.0,1.0,6.0,149.0,...,0.002717,0.001038,0.001451,0.000668,0.001282,0.000624,303306,16205221758,0.001239,67
266332,,16.0,-1,,Вишня Десертная Морозовой,790.0,1.0,1.0,1.0,790.0,...,,,,,,,301069,16205221758,0.001085,68
266330,,14.0,-1,,Кашпо Сантино ТЕРРА пластик см см л крем,199.0,2.0,1.0,2.0,199.0,...,0.008658,0.008520,0.005533,0.003747,0.002342,0.002247,303308,16205221758,0.000982,69


In [171]:
sumb = pd.read_csv('leaderboard/submission.csv', sep=';')
sumb

Unnamed: 0,receipt_id,item_id
0,10829138212,306441
1,10982521349,306441
2,11060104065,306441
3,11207039921,306441
4,11215735350,306441
...,...,...
3050,16154505762,306441
3051,16183067597,306441
3052,16183496280,306441
3053,16193268732,306441


In [174]:
for index, row in sumb.iterrows():
    val = candidates_test_2.loc[(candidates_test_2['user_id'] == row['receipt_id']) & (candidates_test_2['rank_ctb'] == 1), 'item_id'].values[0]
    sumb.iloc[index]['item_id'] = val

In [176]:
sumb

Unnamed: 0,receipt_id,item_id
0,10829138212,311282
1,10982521349,304216
2,11060104065,302601
3,11207039921,312337
4,11215735350,311550
...,...,...
3050,16154505762,313229
3051,16183067597,306548
3052,16183496280,305669
3053,16193268732,313022


In [165]:
interactions_val_target.columns = ['user_id', 'item_id']

NameError: name 'interactions_val_target' is not defined

In [None]:
interactions_val.to_csv('val_dataset.csv')

In [None]:
compute_metrics(interactions_val_target, candidates_test_2[['user_id', 'item_id', 'rank_ctb']], 10, rank_col='rank_ctb')

In [None]:
candidates_test_2[['user_id', 'item_id', 'rank_ctb']].loc[candidates_test_2['rank_ctb'] == 1].to_csv("val_predictions.csv")

In [None]:
firsts = candidates_test_2[['user_id', 'item_id', 'rank_ctb']].loc[candidates_test_2['rank_ctb'] == 1].user_id

In [None]:
interactions_val_target.user_id.values

In [None]:
for i in firsts:
    real = interactions_val_target.loc[interactions_val_target['receipt_id'] == i]
    pred = candidates_test_2.loc[candidates_test_2['user_id'] == i]
    if real['item_id'].values[0] == pred['item_id'].values[0]:
        print(i)