In [4]:
import pandas as pd
import numpy as np
from tools import ItemEncoder, get_rec_als, split_data
from prepare_data import make_data
from scipy import sparse as sp
from tqdm.notebook import tqdm
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight
from implicit.evaluation import train_test_split, ndcg_at_k, precision_at_k, AUC_at_k

In [5]:
make_data()

ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.

In [7]:
global_train = pd.read_parquet('files/global_train.parquet.gzip')
global_train.head()

Unnamed: 0,user_id,item_id,timespent,reaction
12333256,0,35236,0,0
19690502,0,186864,0,0
21816147,0,58724,0,0
21816148,0,155390,0,0
21816149,0,153029,0,0


In [8]:
# Split Global train to local train/test
local_train, local_test = split_data(global_train, test_size=0.2)
local_train.head()

100%|█████████████████████████████| 1000183/1000183 [00:01<00:00, 825479.10it/s]


Unnamed: 0,user_id,item_id,timespent,reaction
12333256,0,35236,0,0
19690502,0,186864,0,0
21816147,0,58724,0,0
21816148,0,155390,0,0
21816149,0,153029,0,0


In [9]:
local_test.head()

Unnamed: 0,user_id,item_id,timespent,reaction
71718873,0,182023,0,0
71720504,0,55382,0,0
71720505,0,158027,0,0
77635843,0,178379,1,0
77643729,0,215675,1,0


In [10]:
local_train['item_id'].nunique(), local_test['item_id'].nunique()

(225156, 221219)

In [11]:
local_train.groupby('user_id')['item_id'].nunique().hist()
local_test.groupby('user_id')['item_id'].nunique().hist(alpha=0.2);

ImportError: matplotlib is required for plotting when the default backend "matplotlib" is selected.

In [None]:
#local_train.to_parquet('files/local_train.parquet.gzip')
#local_test.to_parquet('files/local_test.parquet.gzip')

# Подготовка кандидатов для модели ранжирования

In [12]:
# Make matrix
coder = ItemEncoder(local_train)
user_items_interaction = coder.make_csr_data(local_train)
user_items_weights = coder.make_csr_data(local_train, weights='timespent')

In [14]:
bm25_int = bm25_weight(user_items_interaction, K1=80, B=0.8).tocsr()
bm25_weights = bm25_weight(user_items_weights, K1=80, B=0.8).tocsr()

In [17]:
def choise_matrix(matrixs, metrics_dict, size=250000):
    for k, matrix in enumerate(tqdm(matrixs)):
        train_data, test_data = train_test_split(matrix[:size], train_percentage=0.7, random_state=42)
        model_als = AlternatingLeastSquares(factors=200, iterations=2, num_threads=-1, random_state=42)
        model_als.fit(train_data)
        metrics_dict['model'].append('matrix_{}'.format(k))
        metrics_dict['ndgc@20'].append(round(ndcg_at_k(model_als, train_data, test_data, K=20, num_threads=-1),2))
        metrics_dict['precision@20'].append(round(precision_at_k(model_als, train_data, test_data, K=20, num_threads=-1),2))
    return metrics_dict

In [18]:
matrixs = [user_items_interaction,user_items_weights,bm25_int, bm25_weights]

In [19]:
metrics = {
    'model':[],
    'ndgc@20':[],
    'precision@20':[],
}

In [20]:
metrics = choise_matrix(matrixs, metrics, size=100000)
metrics

  0%|          | 0/4 [00:00<?, ?it/s]



  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/99789 [00:00<?, ?it/s]

  0%|          | 0/99789 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/89998 [00:00<?, ?it/s]

  0%|          | 0/89998 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/99789 [00:00<?, ?it/s]

  0%|          | 0/99789 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/89998 [00:00<?, ?it/s]

  0%|          | 0/89998 [00:00<?, ?it/s]

{'model': ['matrix_0', 'matrix_1', 'matrix_2', 'matrix_3'],
 'ndgc@20': [0.14, 0.06, 0.19, 0.08],
 'precision@20': [0.14, 0.09, 0.18, 0.1]}

In [21]:
model_als = AlternatingLeastSquares(factors=200, iterations=2, num_threads=-1, random_state=42)
model_als.fit(bm25_int)

  0%|          | 0/2 [00:00<?, ?it/s]

In [22]:
users, rec = get_rec_als(model_als, local_train['user_id'].unique(), bm25_weights, coder)

100%|███████████████████████████████| 1000183/1000183 [04:09<00:00, 4014.77it/s]


In [23]:
candidates_als = pd.DataFrame({'user_id':users, 'item_id': rec})
candidates_als.head()

Unnamed: 0,user_id,item_id
0,0,"[32348, 55382, 153923, 219579, 158027, 104133,..."
1,1,"[28460, 113908, 221256, 3358, 218559, 17060, 2..."
2,2,"[227415, 159643, 71334, 128871, 45895, 20540, ..."
3,3,"[32955, 24091, 185695, 32220, 197842, 24228, 1..."
4,4,"[13659, 45739, 221911, 87797, 96645, 215308, 2..."


In [27]:
candidates_als.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000183 entries, 0 to 1000182
Data columns (total 2 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   user_id  1000183 non-null  int64 
 1   item_id  1000183 non-null  object
dtypes: int64(1), object(1)
memory usage: 15.3+ MB


In [29]:
candidates_als['user_id'] = candidates_als['user_id'].astype('int32')

In [30]:
candidates_als = candidates_als.explode('item_id').astype()

AttributeError: 'DataFrame' object has no attribute 'extend'

In [28]:
candidates_als.to_parquet('files/candidates_als_train.parquet.gzip')

ValueError: Error converting column "item_id" to bytes using encoding JSON. Original error: Object of type int32 is not JSON serializable