In [1]:
import pandas as pd
import numpy as np
from tools import ItemEncoder, get_rec_als, split_data
from scipy import sparse as sp
from tqdm.notebook import tqdm
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight
from implicit.evaluation import train_test_split, ndcg_at_k, precision_at_k, AUC_at_k

In [None]:
# Split train into global_train and global_test, build features, and save them to files (uncomment the line below to regenerate).
#%run prepare_data.py

In [2]:
# Load the full training interactions from the gzip-compressed parquet file
# and preview the first rows.
global_train = pd.read_parquet('files/train_full.parquet.gzip')
global_train.head()

Unnamed: 0,user_id,item_id,timespent,reaction
23738507,0,6009,1,0
23741945,0,224625,2,0
23754730,0,3814,1,0
32633576,0,201408,2,0
77635843,0,178379,1,0


In [3]:
# Split global_train into local train/test frames with the project helper
# split_data (from tools). test_size=0.2 is presumably a 20% hold-out; the exact
# splitting strategy is defined in tools.split_data -- TODO confirm.
local_train, local_test = split_data(global_train, test_size=0.2)
# Preview the training part.
local_train.head()

100%|████████████████████████████████████████████████████████████████████| 1000183/1000183 [00:02<00:00, 491601.50it/s]


Unnamed: 0,user_id,item_id,timespent,reaction
23738507,0,6009,1,0
23741945,0,224625,2,0
23754730,0,3814,1,0
32633576,0,201408,2,0
1174594,1,93689,2,0


In [4]:
# Preview the first rows of the local test part.
local_test.head()

Unnamed: 0,user_id,item_id,timespent,reaction
77635843,0,178379,1,0
22981493,1,107288,1,0
22981494,1,126834,11,0
22981495,1,86036,3,0
25411473,1,28460,2,0


In [5]:
# Number of distinct item_id values in the train and test parts (train, test).
local_train['item_id'].nunique(), local_test['item_id'].nunique()

(214490, 177366)

In [None]:
# Histogram of the number of distinct items per user in train, followed by the
# same histogram for test drawn semi-transparently (alpha=0.2).
# The trailing semicolon suppresses the textual repr of the plot object.
local_train.groupby('user_id')['item_id'].nunique().hist()
local_test.groupby('user_id')['item_id'].nunique().hist(alpha=0.2);

In [8]:
#local_train.to_parquet('files/local_train.parquet.gzip')
#local_test.to_parquet('files/local_test.parquet.gzip')

# Подготовка кандидатов для модели ранжирования

In [6]:
# Build the interaction matrices from local_train using the project helper
# ItemEncoder (from tools).
coder = ItemEncoder(local_train)
# Matrix built without an explicit weights column
# (presumably plain/binary interactions -- confirm in tools.ItemEncoder).
user_items_interaction = coder.make_csr_data(local_train)
# Same data, but with the 'timespent' column passed as weights.
user_items_weights = coder.make_csr_data(local_train, weights='timespent')

In [7]:
# Apply implicit's BM25 weighting (K1=80, B=0.8) to both matrices and convert
# the results to CSR format.
bm25_int = bm25_weight(user_items_interaction, K1=80, B=0.8).tocsr()
bm25_weights = bm25_weight(user_items_weights, K1=80, B=0.8).tocsr()

In [53]:
def choise_matrix(matrixs, size=250000, metrics_dict=None):
    """Fit a quick ALS model on each candidate matrix and collect ranking metrics.

    For every matrix, the first ``size`` rows are split 70/30 with implicit's
    ``train_test_split`` (fixed ``random_state=42``), an
    ``AlternatingLeastSquares`` model (200 factors, 2 iterations) is fitted on
    the train part, and NDCG@20 and precision@20 are computed against the test
    part and stored rounded to two decimals.

    Parameters
    ----------
    matrixs : iterable
        Candidate matrices to compare. They must support row slicing
        (``matrix[:size]``) and be accepted by implicit's ``train_test_split``
        (presumably sparse user-item matrices -- confirm against the caller).
    size : int, default 250000
        Number of leading rows of each matrix used for the comparison.
    metrics_dict : dict, optional
        Accumulator with list values under the keys ``'model'``,
        ``'ndgc@20'`` and ``'precision@20'``. Missing keys are created.
        When omitted, a fresh dict is created on every call, so the function
        no longer depends on a global ``metrics`` existing at definition time
        and results do not leak between calls.

    Returns
    -------
    dict
        ``metrics_dict`` (the passed-in object, or the new one) with one entry
        appended per matrix; models are labelled ``'matrix_<position>'``.
    """
    # A default of `metrics_dict=metrics` would be evaluated once, when the
    # `def` statement runs: it raises NameError on a fresh kernel (the global
    # is created in a later cell) and shares one mutable dict across all calls.
    if metrics_dict is None:
        metrics_dict = {}
    # NOTE: the 'ndgc@20' spelling is kept as-is because it is a result key
    # that existing callers/outputs rely on.
    for key in ('model', 'ndgc@20', 'precision@20'):
        metrics_dict.setdefault(key, [])

    for k, matrix in enumerate(tqdm(matrixs)):
        train_data, test_data = train_test_split(matrix[:size], train_percentage=0.7, random_state=42)
        model_als = AlternatingLeastSquares(factors=200, iterations=2, num_threads=-1, random_state=42)
        model_als.fit(train_data)
        metrics_dict['model'].append('matrix_{}'.format(k))
        metrics_dict['ndgc@20'].append(round(ndcg_at_k(model_als, train_data, test_data, K=20, num_threads=-1), 2))
        metrics_dict['precision@20'].append(round(precision_at_k(model_als, train_data, test_data, K=20, num_threads=-1), 2))
    return metrics_dict

In [54]:
# Candidate matrices to compare. Their position in this list determines the
# 'matrix_<k>' label that choise_matrix assigns in the results:
# 0 = interactions, 1 = timespent weights, 2 = BM25(interactions), 3 = BM25(timespent weights).
matrixs = [user_items_interaction,user_items_weights,bm25_int, bm25_weights]

In [55]:
# Empty results accumulator: one list per column, using the same keys that
# choise_matrix appends to ('model', 'ndgc@20', 'precision@20').
metrics = {
    'model':[],
    'ndgc@20':[],
    'precision@20':[],
}

In [56]:
# Evaluate every candidate matrix on its first 100000 rows, rebind `metrics`
# to the returned results dict, and display it.
metrics = choise_matrix(matrixs, size=100000)
metrics

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/89667 [00:00<?, ?it/s]

  0%|          | 0/89667 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/89667 [00:00<?, ?it/s]

  0%|          | 0/89667 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/89667 [00:00<?, ?it/s]

  0%|          | 0/89667 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/89667 [00:00<?, ?it/s]

  0%|          | 0/89667 [00:00<?, ?it/s]

{'model': ['matrix_0', 'matrix_1', 'matrix_2', 'matrix_3'],
 'ndgc@20': [0.04388068707041237,
  0.04191068548138226,
  0.045541311124231286,
  0.0453941204598418],
 'precision@20': [0.0648989964534907,
  0.061069332722978216,
  0.06360120529404409,
  0.06474326151435711]}

In [12]:
# Fit the ALS model used for candidate generation on the whole BM25-weighted
# timespent matrix (no row slicing), with the same hyperparameters as in the
# matrix comparison above.
model_als = AlternatingLeastSquares(factors=200, iterations=2, num_threads=-1, random_state=42)
model_als.fit(bm25_weights)

  0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
# Produce ALS recommendations for every distinct user_id in local_train using
# the project helper get_rec_als (from tools). It returns two sequences, bound
# here as `users` and `rec`; presumably rec[i] holds the recommended item ids
# for users[i] -- TODO confirm in tools.get_rec_als.
users, rec = get_rec_als(model_als, local_train['user_id'].unique(), bm25_weights, coder)

100%|█████████████████████████████████████████████████████████████████████| 1000183/1000183 [01:02<00:00, 15938.16it/s]


In [14]:
# Pair the users with their recommendations in a two-column frame
# ('user_id', 'item_id') and preview it.
candidates_als = pd.DataFrame({'user_id':users, 'item_id': rec})
candidates_als.head()

Unnamed: 0,user_id,item_id
0,0,"[185506, 146366, 222176, 209249, 55067, 214894..."
1,1,"[117035, 46502, 136266, 221256, 113908, 218559..."
2,2,"[114561, 125070, 105277, 14760, 21100, 154155,..."
3,3,"[49589, 154155, 226527, 89732, 39846, 105277, ..."
4,4,"[158392, 224569, 20023, 35482, 43635, 155056, ..."


In [15]:
# Save the ALS candidates to a gzip-compressed parquet file.
candidates_als.to_parquet('files/candidates_als_train.parquet.gzip', compression='gzip')