In [1]:
import pandas as pd
import numpy as np
import implicit
import scipy.sparse as sp
from tqdm.auto import tqdm
from tqdm.contrib import tzip
import gc
from implicit.evaluation import mean_average_precision_at_k,train_test_split
from implicit.approximate_als import FaissAlternatingLeastSquares
from implicit.nearest_neighbours import CosineRecommender, BM25Recommender,TFIDFRecommender
from implicit.gpu.bpr import BayesianPersonalizedRanking
from implicit.gpu.als import AlternatingLeastSquares

In [2]:
#df = pd.read_csv('./vseros-rank-otb/train_df.tsv',sep='\t')
df = pd.read_parquet('all_train_df.parquet')
df

Unnamed: 0.1,Unnamed: 0,community_id,description,customers_count,messages_count,type,region_id,themeid,business_category,business_parent,customer_id,status,join_request_date,count_db,pair
15051500,15051500,d501adec049a83d7e968b9836d55d212e952e157fe4750...,False,6429,1,7,1.040205e+10,234.0,CARS_MACHINERY,AUTO,1a4bd984b1a1c6ef217dafdbd4c69734c2ecc785ae87a2...,I,,1.0,1a4bd984b1a1c6ef217dafdbd4c69734c2ecc785ae87a2...
8983316,8983316,2e1b38e5b2798ad629fbcdc65e753ac381a50e7ab0b059...,False,50674,1768,7,,,COOKING_AND_RECIPES,FOOD,fe11157c925c62cd0538220d613a4bc450a064cb83d31a...,A,,2.0,fe11157c925c62cd0538220d613a4bc450a064cb83d31a...
7000208,7000208,992a757cb164d3f767295af497039ad6cc3e3fa091b42a...,False,6030,1,7,,,COOKING_AND_RECIPES,FOOD,1bd41531bace0d2d228797f2e0bfde305ba34f7f5920a7...,A,,2.0,1bd41531bace0d2d228797f2e0bfde305ba34f7f5920a7...
10065277,10065277,5189397d7058464aff8179b792dfda693c8cd9d09bd8b5...,True,322968,91,7,,392.0,COOKING_AND_RECIPES,FOOD,944f31e05522ec9b6ae05f6acac919009af9f2e4266018...,P,,1.0,944f31e05522ec9b6ae05f6acac919009af9f2e4266018...
13824793,13824793,56d54d70c65fae5f81f8db9267a3b58698ad18ab070831...,False,1681,2,7,1.039461e+10,,OTHER_SERVICES,PROFESSIONAL_SERVICES,83d52f013852c463b478507f2397656aba59d666c3884e...,P,,1.0,83d52f013852c463b478507f2397656aba59d666c3884e...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4629626,4629626,7cef175a66c2828408b3494be61859815a5891ff08ad5a...,False,216704,8,7,,360.0,ANIMALS,PETS,bd7f7760f39a786e04944c635be64537e4438f9acae776...,A,,,bd7f7760f39a786e04944c635be64537e4438f9acae776...
12215531,12215531,ec6d4504e84cfe6bc7121ae83024ea984aa7b697270395...,False,3585,25,7,,,,,884c03aa3d91fcedb8ecbdc0e61ede4d5fc8d6f99f28fe...,P,,1.0,884c03aa3d91fcedb8ecbdc0e61ede4d5fc8d6f99f28fe...
7845282,7845282,d68731c0774cf60f22bda8cec34f7f9741a01616b91b85...,False,68018,88,7,,701.0,COOKING_AND_RECIPES,FOOD,6fedcce641b6ee331ffddac65c74c171c383d0cd0c2bc4...,A,,,6fedcce641b6ee331ffddac65c74c171c383d0cd0c2bc4...
5931407,5931407,4136286b625838fb6a5de0983288a924d0a184056865c9...,False,7769,9,7,,705.0,MUSIC,BLOG,2b09ccbbbb1bc3f3724b5018dee42ee0f7c5597ca588d8...,A,,,2b09ccbbbb1bc3f3724b5018dee42ee0f7c5597ca588d8...


In [3]:
users_inv_mapping = dict(enumerate(df['customer_id'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}

items_inv_mapping = dict(enumerate(df['community_id'].unique()))
items_mapping = {v: k for k, v in items_inv_mapping.items()}
len(users_mapping),len(items_mapping)

(1568446, 149114)

In [None]:
test_idx = np.load('./test_index.npy')
train_gbm_idx = np.load('./train_gbm_index.npy')
train_idx = list(set(df.index) - (set(test_idx) | set(train_gbm_idx)) )

In [None]:
test_df = df.loc[test_idx]
train_df_gbm = df.loc[train_gbm_idx]
train_df = df.loc[train_idx]

In [None]:
train_df['weight'] = train_df['customers_count'].map(lambda x:x**(0.33))

In [None]:
del df
gc.collect()

In [22]:
def get_coo_matrix(df,
                   user_col='user_id',
                   item_col='item_id',
                   weight_col=None,
                   users_mapping=None,
                   items_mapping=None):
    if weight_col is None:
        weights = np.ones(len(df), dtype=np.float32)
    else:
        weights = df[weight_col].astype(np.float32)

    interaction_matrix = sp.coo_matrix((
        weights,
        (
            df[user_col].map(users_mapping.get),
            df[item_col].map(items_mapping.get)
        )),
    )
    return interaction_matrix

In [23]:
train_mat = get_coo_matrix(df=train_df,
                           user_col='customer_id',
                           item_col='community_id',
                           weight_col='weight',
                           users_mapping=users_mapping,
                           items_mapping=items_mapping).tocsr()

In [24]:
def set_label(df,group):
    labels = []
    for user,item in tzip(df['user_id'],df['comunity']):
        if item in group.loc[user][0]:
            labels += [1]
        else:
            labels += [0]
    df['label'] = labels
    return df

In [48]:
class MultiImplicitModel():
    def __init__(self,models,maps,weights=[0.7,0.1,0.1,0.1]):
        self.models = models
        self.maps = maps
        self.weights = weights
        
    def fit(self,mat):
        self.mat = mat
        for model in tqdm(self.models):
            model.fit(self.mat)
            
    def predict_model(self,test_users,idx,N=20,falh=True):
        model = self.models[idx]
        recs,scores = [],[]
        for id in tqdm(test_users):
            row_id = self.maps[0][id]
            ranks = model.recommend(row_id,self.mat[row_id], N=N, filter_already_liked_items=falh)
            recs += [[self.maps[1].get(it) for it in ranks[0]]]
            scores += [ranks[1]]
        return recs,scores
    
    def get_model_wraps(self,test_users,idx,N=20,falh=True):
        model = self.models[idx]
        wraps = []
        for id in tqdm(test_users):
            row_id = self.maps[0][id]
            ranks = model.recommend(row_id,self.mat, N=N, filter_already_liked_items=falh)
            wrp = {}
            for i,(cm,sc) in enumerate(zip(ranks[0],ranks[1])):
                wrp[self.maps[1].get(cm)] = [sc,i]
            wraps += [wrp]
        return wraps
                
    def get_rank_df(self,test_users,N=100):
        first_model_recs,first_model_scores = self.predict_model(test_users,0,N=N)
        users_wraps = []
        for i in range(1,len(self.models)):
            users_wraps += [self.get_model_wraps(test_users,i,N=len(self.maps[1]))]
        
        df = pd.DataFrame()
        for i,(user) in tqdm(enumerate(test_users)):
             for rank,(item,scor) in enumerate(zip(first_model_recs[i],first_model_scores[i])):
                            fc = dict()
                            fc['query'] = i
                            fc['user_id'] = user
                            fc['scor_first'] = scor
                            fc['comunity'] = item
                            fc['rank'] = rank
                            for j in range(len(self.models)-1):
                                if item in users_wraps[j][i].keys():
                                    fc[f'score_ui_model_{j}'] = users_wraps[j][i][item]
                                else:
                                    fc[f'score_ui_model_{j}'] = 0
                            df.append(fc,ignore_index=True)
        return df
        

In [40]:
models = [AlternatingLeastSquares(factors = 1024,
                                  iterations = 32,
                                  calculate_training_loss = False,
                                  regularization = 0.1),
         CosineRecommender(K=16),
         TFIDFRecommender(K=16),
         BM25Recommender(K=16,K1=1.2, B=0.75)
        ]

In [49]:
model = MultiImplicitModel(models=models,
                           maps=[users_mapping,items_inv_mapping],
                           )
model.mat = train_mat

In [42]:
model.fit(train_mat)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]



  0%|          | 0/149114 [00:00<?, ?it/s]



  0%|          | 0/149114 [00:00<?, ?it/s]



  0%|          | 0/149114 [00:00<?, ?it/s]

In [29]:
test_group = test_df[['community_id','customer_id']].groupby('customer_id').agg(lambda x:x.tolist())

In [30]:
wraper = model.get_model_wraps(test_group.index,1,N=len(model.maps[1]),falh=False)

  0%|          | 0/30000 [00:00<?, ?it/s]

In [32]:
def apk(actual, predicted, k=10):
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)
    if not actual:
        return 0.0
    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [33]:
act = test_group['community_id'].tolist()
mapk(act,test_preds,k=7)

NameError: name 'test_preds' is not defined

In [37]:
sorted_tr_data = train_df.sort_values(by='customers_count')[::-1]
best_subs = sorted_tr_data.community_id.unique()[:200]
simple_preds = best_subs[:7].tolist()
mapk(act,[simple_preds] * len(act),k=7)

0.007469630952380953

In [43]:
train_gbm_group = train_df_gbm[['community_id','customer_id']].groupby('customer_id').agg(lambda x:x.tolist())

In [None]:
train_gbm_rank_df = model.get_rank_df(train_gbm_group.index,N=50)

  0%|          | 0/120000 [00:00<?, ?it/s]

  0%|          | 0/120000 [00:00<?, ?it/s]

  0%|          | 0/120000 [00:00<?, ?it/s]

  0%|          | 0/120000 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc,ignore_index=True)
  df.append(fc

In [37]:
train_gbm_rank_df[0]

{'query': 0,
 'user_id': '0000079a765e2fd7fbb3d8c6968474d7a40cc9ada4c9ec430e3e145cac132150e34b54ed01393d202987a80999901b2a5d2017aae5d087a3241c1e4f3ceed6db',
 'scor_first': 0.6658635,
 'comunity': 'e9ab4d850c72c29d8bc0ec6f550b9c551cc45cbc94e76a795c2dabed5bb3d9ab588836bb30ef5d1a7d89f2cb7fe3826297b712b470d901695e2334ef8d96fb8f',
 'rank': 0,
 'score_ui_model_0': 0}

In [None]:
test_rank_df = pd.DataFrame(test_rank_df)

In [18]:
train_gbm_group

NameError: name 'train_gbm_group' is not defined

In [94]:
act = train_gbm_group['community_id'].tolist()
mapk(act,gbm_preds,k=7)

0.07382978954081632

In [99]:
act[1]

['dc3e94b70702c3e6f56f2cbc783e6194e6f4cb8418ed659475ceaacef305d973aea0f3ffd88b7aa17dcd8a9a7cbdbc36ae4d5e078bbf3076c602c61ee5c6e92e',
 '05da1bb7e74509a4d0fa79d302d4d4c29ee72af8f345e59c1469cee9011f76eed07c4ec0764f705814de604ff843bb20339fdc3230f4448d2da9e54f712c9bf8',
 'efe3290b2b6953534845c8278309f2e896a5a03d66b0c471be1d186102c06230428f86cfee6dce92348d8bcf3809df603784163c7a0d52e474ebd39438ed2236',
 '71dc7483ccf5aa27308c2f79bdb20afe5b3d4fe2ddecf7bf86167e368c5e7137b14a27ada172cc5145605a75b654fce731ece21d8f30834750925df8e4ab0558',
 '223f53c800c0ff65d3c72982ce6d333fcb2af30f4fbbccc10e3b03bb7e3044fbd59115ced48f1d7c66f0d2c95cdd2728bfa2a660c18cc9079167c9b530cb9f54']

In [97]:
np.save('test_ranks.npy',act)

In [82]:
rank_df = get_df(train_gbm_group.index,gbm_preds,gbm_scores)

In [83]:
rank_df = set_label(rank_df,train_gbm_group)

  0%|          | 0/8000000 [00:00<?, ?it/s]

In [84]:
rank_df

Unnamed: 0,query,user_id,scor,comunity,label
0,0,0000079a765e2fd7fbb3d8c6968474d7a40cc9ada4c9ec...,0.580307,e9f678831a65bad6bb222d9ee85e631278341739692a31...,0
1,0,0000079a765e2fd7fbb3d8c6968474d7a40cc9ada4c9ec...,0.495415,8ae6f3886a45df709a7a7b097d9c158ba2c5c8b50656de...,0
2,0,0000079a765e2fd7fbb3d8c6968474d7a40cc9ada4c9ec...,0.491877,e9ab4d850c72c29d8bc0ec6f550b9c551cc45cbc94e76a...,0
3,0,0000079a765e2fd7fbb3d8c6968474d7a40cc9ada4c9ec...,0.480178,9869c2a4099ce87effd0b7bbde552b05c5a5af117b51d4...,0
4,0,0000079a765e2fd7fbb3d8c6968474d7a40cc9ada4c9ec...,0.464360,e60bff7c68abea9ce1fd93893701182550acf8384640ce...,0
...,...,...,...,...,...
7999995,159999,ffffea67894044381d713871f6cb6d482a45c031b1272f...,0.167747,0706af45dd47778c976237e0b9065543d798d80cef1cfe...,0
7999996,159999,ffffea67894044381d713871f6cb6d482a45c031b1272f...,0.167710,a6bf2dad0641afd7e7a62d0cf82d914756e1448265171a...,0
7999997,159999,ffffea67894044381d713871f6cb6d482a45c031b1272f...,0.164462,a44668e44253081cdbcc3cf280972e0e1603ec3292083d...,0
7999998,159999,ffffea67894044381d713871f6cb6d482a45c031b1272f...,0.163494,0966fc63347a7a310641447a888ee23573ac3418d44713...,0


In [85]:
rank_df.to_parquet('train_gbm.parquet')

In [86]:
test_pr_df = get_df(test_group.index,test_preds,test_scores)
test_pr_df.to_parquet('test_gbm.parquet')