In [1]:
import itertools

import numpy as np
import pandas as pd
import scipy as sc
from implicit.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix
from sklearn.metrics import ndcg_score
from tqdm import tqdm

# В этом ноутбуке обучаем ALS для рекомендаций

Основная идея ALS — разложение большой разреженной матрицы взаимодействий на две матрицы меньшего размера: users (n_users x n_factors) и songs (n_factors x n_songs). Их произведение аппроксимирует исходную матрицу и позволяет делать предсказания. Однако фичи пользователей и песен использовать мы тут не можем.

In [2]:
# Load the pre-processed train / validation / test tables.
# index_col=0 makes the first CSV column the DataFrame index.
train, val, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("train_processed.csv", "val_processed.csv", "test_processed.csv")
)

In [3]:
# Keep only rows whose song and user ids are both non-negative.
# NOTE(review): negative ids presumably mark entities that could not be
# encoded during preprocessing — confirm against the preprocessing step.
train = train.loc[train["song_id"].ge(0) & train["msno"].ge(0)]
val = val.loc[val["song_id"].ge(0) & val["msno"].ge(0)]
test = test.loc[test["song_id"].ge(0) & test["msno"].ge(0)]

In [4]:
# Song and member lookup tables; their row counts are used as the
# dimensions of the user x song interaction matrices.
songs_data = pd.read_csv("songs_data_processed.csv", index_col=0)
members_data = pd.read_csv("members_data_processed.csv", index_col=0)

In [5]:
# Keep only the positive interactions (target == 1) of the train and validation splits.
train_tg1 = train.loc[train["target"].eq(1)]
val_tg1 = val.loc[val["target"].eq(1)]

In [6]:
# Row (user) indices, column (song) indices and a unit value per positive
# train interaction, in the form expected by the csr_matrix constructor.
msno_idxs_train = list(train_tg1.loc[:, "msno"])
song_id_idxs_train = list(train_tg1.loc[:, "song_id"])
data_train = [1 for _ in msno_idxs_train]

In [7]:
# Sparse user x song matrix of positive train interactions.
# The shape comes from the lookup tables rather than from the observed ids;
# assumes msno / song_id are 0-based positions into those tables — TODO confirm.
user_items_train = csr_matrix(
    (data_train, (msno_idxs_train, song_id_idxs_train)),
    shape=(members_data.shape[0], songs_data.shape[0]),
)

In [8]:
# Row (user) indices, column (song) indices and a unit value per positive
# validation interaction.
msno_idxs_val = list(val_tg1.loc[:, "msno"])
song_id_idxs_val = list(val_tg1.loc[:, "song_id"])
data_val = [1 for _ in msno_idxs_val]

In [9]:
# Sparse user x song matrix of positive validation interactions, built with
# the same shape as the train matrix so the two can be indexed and added together.
user_items_val = csr_matrix(
    (data_val, (msno_idxs_val, song_id_idxs_val)),
    shape=(members_data.shape[0], songs_data.shape[0]),
)

In [10]:
# Initial ALS fit with 64 latent factors and the library-default number of
# iterations. `model` is rebound later by the grid-search loop.
# random_state seeds the random factor initialisation so that re-running
# the cell gives a repeatable fit.
model = AlternatingLeastSquares(factors=64, num_threads=8, random_state=42)
model.fit(user_items_train)

  check_blas_config()


  0%|          | 0/15 [00:00<?, ?it/s]

In [30]:
def eval_als(model, user_list, user_items_train, user_items_test, batch_size=256, k=20):
    """Return the NDCG@k of ``model``'s recommendations, averaged over ``user_list``.

    Users are processed in batches of ``batch_size``. For every user the model
    recommends ``k`` items (items already present in that user's row of
    ``user_items_train`` are filtered out), and the recommendation scores are
    ranked against the values stored in ``user_items_test`` for those same items.

    Relevance is looked up only for the recommended items, so the ideal
    ranking used for normalisation is the best ordering of those ``k`` items,
    not of every item the user has in ``user_items_test``. Every user in
    ``user_list`` counts in the denominator, including users without any entry
    in ``user_items_test``.

    Args:
        model: Fitted recommender exposing
            ``recommend(userids, user_items, N=..., filter_already_liked_items=...)``
            that returns ``(ids, scores)`` with one row per requested user.
        user_list: Sequence of user row indices to evaluate.
        user_items_train: Sparse user x item matrix; its rows for the batch are
            passed to ``recommend`` and used to filter already-seen items.
        user_items_test: Sparse user x item matrix holding the held-out relevance.
        batch_size: Number of users sent to ``recommend`` per call.
        k: Number of recommendations per user and NDCG cut-off (default 20).

    Returns:
        Sum of the per-user NDCG@k values divided by ``len(user_list)``;
        ``0.0`` when ``user_list`` is empty.
    """
    n_users = len(user_list)
    if n_users == 0:
        # Nothing to evaluate; avoids a division by zero below.
        return 0.0

    ndcg_sum = 0.0
    # tqdm takes its total from len(range(...)), i.e. the real number of
    # batches including a final partial one.
    for start in tqdm(range(0, n_users, batch_size)):
        # Slicing clamps at the end of the list, so the last batch may be shorter.
        user_ids = user_list[start:start + batch_size]

        ids, scores = model.recommend(
            user_ids,
            user_items_train[user_ids],
            N=k,
            filter_already_liked_items=True,
        )

        for user_id, rec_ids, rec_scores in zip(user_ids, ids, scores):
            # 1 x k held-out relevance of exactly the recommended items.
            true_relevance = user_items_test[user_id, rec_ids].toarray()
            ndcg_sum += ndcg_score(true_relevance, rec_scores[np.newaxis, :], k=k)

    return ndcg_sum / n_users

In [33]:
# Grid search over the number of ALS iterations and the number of latent factors.
# Every configuration is fitted on the train matrix and scored with
# eval_als on the validation matrix.
iters_arr = [3, 8, 10]
factors_arr = [16, 32, 64]

# Users to evaluate: every value of the members table's msno column.
members_list = list(members_data['msno'])
batch_size = 256

# One (mean validation NDCG, iterations, factors) tuple per configuration.
mean_ndcg_scores = []

for iters, factors in itertools.product(iters_arr, factors_arr):
    # Same random_state for every configuration: runs differ only in their
    # hyper-parameters and the whole search is repeatable.
    model = AlternatingLeastSquares(factors=factors, num_threads=4, iterations=iters, random_state=42)
    model.fit(user_items_train)

    mean_ndcg_score = eval_als(model, members_list, user_items_train, user_items_val, batch_size=batch_size)

    mean_ndcg_scores.append((mean_ndcg_score, iters, factors))

  0%|          | 0/3 [00:00<?, ?it/s]

135it [05:01,  2.23s/it]                                                                                               


  0%|          | 0/3 [00:00<?, ?it/s]

135it [05:09,  2.29s/it]                                                                                               


  0%|          | 0/3 [00:00<?, ?it/s]

135it [05:33,  2.47s/it]                                                                                               


  0%|          | 0/8 [00:00<?, ?it/s]

135it [04:57,  2.20s/it]                                                                                               


  0%|          | 0/8 [00:00<?, ?it/s]

135it [05:04,  2.26s/it]                                                                                               


  0%|          | 0/8 [00:00<?, ?it/s]

135it [05:33,  2.47s/it]                                                                                               


  0%|          | 0/10 [00:00<?, ?it/s]

135it [04:57,  2.21s/it]                                                                                               


  0%|          | 0/10 [00:00<?, ?it/s]

135it [05:03,  2.25s/it]                                                                                               


  0%|          | 0/10 [00:00<?, ?it/s]

135it [05:33,  2.47s/it]                                                                                               


In [34]:
# Grid search results: one (mean validation NDCG@20, iterations, factors) tuple per configuration
mean_ndcg_scores

[(0.15288618981878643, 3, 16),
 (0.1553390233871362, 3, 32),
 (0.15241907491225676, 3, 64),
 (0.15504213540863163, 8, 16),
 (0.15563592772046222, 8, 32),
 (0.15116959413711512, 8, 64),
 (0.15531999027814972, 10, 16),
 (0.15572520158936604, 10, 32),
 (0.15168789602777197, 10, 64)]

In [35]:
# Positive (target == 1) test interactions, split into row (user) indices,
# column (song) indices and a unit value per interaction.
test_tg1 = test.loc[test["target"].eq(1)]
msno_idxs_test = list(test_tg1.loc[:, "msno"])
song_id_idxs_test = list(test_tg1.loc[:, "song_id"])
data_test = [1 for _ in msno_idxs_test]

In [36]:
# Sparse user x song matrix of positive test interactions, with the shape
# taken from the member and song lookup tables.
user_items_test = csr_matrix(
    (data_test, (msno_idxs_test, song_id_idxs_test)),
    shape=(members_data.shape[0], songs_data.shape[0]),
)

In [37]:
# Train + validation data can be obtained as the sum of the two sparse matrices
user_items_train_all = user_items_train + user_items_val

In [38]:
# Make best model based on grid searcg
best_model_params = sorted(mean_ndcg_scores, key=lambda x: -x[0])[0]
best_model = AlternatingLeastSquares(factors=best_model_params[2], iterations=best_model_params[1], num_threads=8)
best_model.fit(user_items_train_all)

ndcg_score = eval_als(best_model, members_list, user_items_train_all, user_items_test)

  0%|          | 0/10 [00:00<?, ?it/s]

135it [05:11,  2.31s/it]                                                                                               


In [43]:
# Total test ndcg@20 score

print(f"Test NDCG@20 score: {ndcg_score}")

Test NDCG@20 score: 0.13228656058119517
