# Конфигурация

In [1]:
import pickle

import pandas as pd
import numpy as np
from scipy import sparse

from tqdm.auto import tqdm

from implicit.cpu.als import AlternatingLeastSquares
from rectools.models.implicit_als import ImplicitALSWrapperModel

import nmslib

import rectools 
from rectools import Columns
from rectools.dataset import Interactions, Dataset
from rectools.model_selection import TimeRangeSplitter
from rectools.metrics import MAP, NDCG, Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics
from rectools.tools.ann import UserToItemAnnRecommender



In [2]:
K_RECOS = 10
RANDOM_SEED = 32

np.random.seed(RANDOM_SEED)

In [3]:
metrics = {'MAP@1': MAP(k=1), 'MAP@5': MAP(k=5), 'MAP@10': MAP(k=10),
           'NDCG@1': NDCG(k=1), 'NDCG@5': NDCG(k=5), 'NDCG@10': NDCG(k=10),
           'Precision@1': Precision(k=1), 'Precision@5': Precision(k=5), 'Precision@10': Precision(k=10),
           'Recall@1': Recall(k=1), 'Recall@5': Recall(k=5), 'Recall@10': Recall(k=10),
           'MeanInvUserFreq@1': MeanInvUserFreq(k=1), 'MeanInvUserFreq@5': MeanInvUserFreq(k=5), 'MeanInvUserFreq@10': MeanInvUserFreq(k=10),
           'Serendipity@1': Serendipity(k=1), 'Serendipity@5': Serendipity(k=5), 'Serendipity@10': Serendipity(k=10)}

# Загрузка данных

In [4]:
def headtail(df):
    return pd.concat([df.head(), df.tail()])

In [5]:
interactions = pd.read_csv('data_original/interactions.csv', parse_dates=["last_watch_dt"])

In [6]:
interactions.rename(
    columns={
        'last_watch_dt': Columns.Datetime,
    }, 
    inplace=True)

In [7]:
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

In [8]:
interactions = Interactions(interactions)

In [9]:
headtail(interactions.df)

Unnamed: 0,user_id,item_id,datetime,total_dur,watched_pct,weight
0,176549,9506,2021-05-11,4250,72.0,3.0
1,699317,1659,2021-05-29,8317,100.0,3.0
2,656683,7107,2021-05-09,10,0.0,1.0
3,864613,7638,2021-07-05,14483,100.0,3.0
4,964868,9506,2021-04-30,6725,100.0,3.0
5476246,648596,12225,2021-08-13,76,0.0,1.0
5476247,546862,9673,2021-04-13,2308,49.0,3.0
5476248,697262,15297,2021-08-20,18307,63.0,3.0
5476249,384202,16197,2021-04-19,6203,100.0,3.0
5476250,319709,4436,2021-08-15,3921,45.0,3.0


In [10]:
interactions.df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 6 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_id      int64         
 1   item_id      int64         
 2   datetime     datetime64[ns]
 3   total_dur    int64         
 4   watched_pct  float64       
 5   weight       float64       
dtypes: datetime64[ns](1), float64(2), int64(3)
memory usage: 250.7 MB


In [11]:
users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')

# Подготовка данных

Удаляем юзеров и айтемы, у которых нет взаимодействий

In [12]:
# users = users.loc[users["user_id"].isin(interactions.df["user_id"])].copy()
# items = items.loc[items["item_id"].isin(interactions.df["item_id"])].copy()

In [13]:
users.fillna('Unknown', inplace=True)

производится Leave-time-out разбиение -- последняя неделя на test, остальное на train

In [14]:
max_date = interactions.df['datetime'].max()

train = interactions.df[(interactions.df['datetime'] < max_date - pd.Timedelta(days=7))]
train.drop(train.query("total_dur < 300").index, inplace=True)
train_users = train[Columns.User].unique()

test = interactions.df[interactions.df['datetime'] >= max_date - pd.Timedelta(days=7)]
test_users = test[Columns.User].unique()

hot_test = test[test['user_id'].isin(train['user_id'].unique())]
hot_test_users = hot_test[Columns.User].unique()

cold_test = test[~test['user_id'].isin(train['user_id'].unique())]
cold_test_users = cold_test[Columns.User].unique()

catalog = train[Columns.Item].unique()

print(f"train: {train.shape}")
print(f"hot test: {hot_test.shape}")
print(f"cold test: {cold_test.shape}")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.drop(train.query("total_dur < 300").index, inplace=True)


train: (3832711, 6)
hot test: (333026, 6)
cold test: (157956, 6)


В качестве фичей юзеров были выбраны: возраст, доход, пол

In [15]:
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()

In [16]:
user_features_names = ["age", "income", "sex"]
cat_user_features_names = [feature for feature in user_features_names if users[feature].dtype == np.dtypes.ObjectDType]
user_features_frames = []
for feature in user_features_names:
    feature_frame = users.reindex(columns=["user_id", feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)

In [17]:
print('Number user features:', users[user_features_names].nunique().sum())

Number user features: 17


В качестве фичей айтемов были выбраны: жанр

In [18]:
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

In [19]:
YEAR_FROM = 1990
STEP_SIZE = 5
bins = [year for year in range(YEAR_FROM, int(items['release_year'].max()) + STEP_SIZE, STEP_SIZE)]
bins = [int(items['release_year'].min())] + bins
items['year_bin'] = pd.cut(items['release_year'],
                           bins=bins, include_lowest=True)
items['year_bin'] = items['year_bin'].astype('str')

In [20]:
item_features_names = []

In [21]:
items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
item_features_names += ["genre"]
item_features = genre_feature

In [22]:
content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"
item_features_names += ["content_type"]

In [23]:
year_feature = items.reindex(columns=[Columns.Item, "year_bin"])
year_feature.columns = ["id", "value"]
year_feature["feature"] = "year_bin"
item_features_names += ["year_bin"]

In [24]:
cat_item_features_names = [feature for feature in item_features_names if items[feature].dtype == np.dtypes.ObjectDType]

In [25]:
item_features = pd.concat((genre_feature, content_feature, year_feature))

In [26]:
genre_feature["value"].value_counts()

драмы                 4923
комедии               3479
зарубежные            3055
мелодрамы             2533
триллеры              2297
                      ... 
токшоу                   2
красота и здоровье       2
передачи                 1
образование              1
рекомендуем              1
Name: value, Length: 94, dtype: int64

In [27]:
print('Number item features:', item_features.value.nunique())

Number item features: 105


In [28]:
dataset_train = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=user_features_names,
    item_features_df=item_features,
    cat_item_features=cat_item_features_names
)

# Обучение модели

## ALS

Гиперпараметры не тюнились, их значения брались из обучающих материалов

In [38]:
N_FACTORS = 128
REG = 0.5
ALPHA = 10
NUM_THREADS = 12
ITERATIONS = 10

In [39]:
als = AlternatingLeastSquares(factors=N_FACTORS,
                              regularization=REG,
                              alpha=ALPHA,
                              num_threads=NUM_THREADS,
                              random_state=RANDOM_SEED,
                              iterations=ITERATIONS)

In [40]:
als_wrapper = ImplicitALSWrapperModel(model=als,
                                      verbose=1,
                                      fit_features_together=True)

In [41]:
%%time
als_wrapper.fit(dataset_train)

  0%|          | 0/10 [00:00<?, ?it/s]

CPU times: total: 4min 15s
Wall time: 24.3 s


<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x1f1979e2b80>

In [42]:
with open("als_wrapper.pkl", "wb") as file:
    pickle.dump(als_wrapper, file)

In [43]:
with open("als_wrapper.pkl", "rb") as file:
    als_wrapper = pickle.load(file)

In [44]:
%%time
recos_als_train = als_wrapper.recommend(users=train_users, dataset=dataset_train, k=10, filter_viewed=False)

CPU times: total: 3min 40s
Wall time: 55.2 s


In [45]:
%%time
recos_als_hot_test = als_wrapper.recommend(users=hot_test_users, dataset=dataset_train, k=10, filter_viewed=True)

CPU times: total: 2min 39s
Wall time: 23.7 s


# Раcчёт метрик

## на обучающей выборке

In [46]:
metric_values_train = calc_metrics(
    metrics,
    reco=recos_als_train,
    interactions=train,
    prev_interactions=train,
    catalog=catalog,
)

In [47]:
pd.DataFrame(metric_values_train, index=['AlternatingLeastSquares'])

Unnamed: 0,Precision@1,Recall@1,Precision@5,Recall@5,Precision@10,Recall@10,NDCG@1,NDCG@5,NDCG@10,MAP@1,MAP@5,MAP@10,MeanInvUserFreq@1,MeanInvUserFreq@5,MeanInvUserFreq@10,Serendipity@1,Serendipity@5,Serendipity@10
AlternatingLeastSquares,0.607888,0.321024,0.260478,0.497448,0.175948,0.588805,0.607888,0.327639,0.245187,0.321024,0.424875,0.45323,3.586888,4.108002,4.790572,0.001484,0.000833,0.000684


## на тёплых пользователях

In [48]:
metric_values_hot_test = calc_metrics(
    metrics,
    reco=recos_als_hot_test,
    interactions=hot_test,
    prev_interactions=train,
    catalog=catalog,
)

In [49]:
pd.DataFrame(metric_values_hot_test, index=['AlternatingLeastSquares'])

Unnamed: 0,Precision@1,Recall@1,Precision@5,Recall@5,Precision@10,Recall@10,NDCG@1,NDCG@5,NDCG@10,MAP@1,MAP@5,MAP@10,MeanInvUserFreq@1,MeanInvUserFreq@5,MeanInvUserFreq@10,Serendipity@1,Serendipity@5,Serendipity@10
AlternatingLeastSquares,0.08392,0.042552,0.049743,0.118161,0.035223,0.161937,0.08392,0.057038,0.044345,0.042552,0.071578,0.078454,3.487663,4.25787,4.99543,3.7e-05,4.3e-05,4.7e-05


# Использовать метод приближенного поиска соседей для выдачи рекомендаций (пункт 2 задание 2) (3 балла)

In [50]:
user_vectors, item_vectors = als_wrapper.get_vectors()

In [51]:
user_id_map = dataset_train.user_id_map
item_id_map = dataset_train.item_id_map

In [52]:
with open("user_id_map.pkl", "wb") as file:
    pickle.dump(dataset_train.user_id_map, file)

In [53]:
with open("item_id_map.pkl", "wb") as file:
    pickle.dump( dataset_train.item_id_map, file)

In [54]:
index_init_params = {"method": "hnsw", "space": "negdotprod", "data_type": nmslib.DataType.DENSE_VECTOR}

In [55]:
ann = UserToItemAnnRecommender(user_vectors=user_vectors,
                               item_vectors=item_vectors,
                               user_id_map=user_id_map,
                               item_id_map=item_id_map,
                               index_init_params= index_init_params)

In [56]:
%%time
ann.fit()

CPU times: total: 234 ms
Wall time: 532 ms


<rectools.tools.ann.UserToItemAnnRecommender at 0x1f19b6f9670>

In [57]:
# ann.index.saveIndex(filename='ann_index.pkl')

In [58]:
ann.index.loadIndex('ann_index.pkl')

In [59]:
ann.get_item_list_for_user(200197, top_n=10).tolist()

[13865, 9728, 3734, 10440, 15297, 12995, 4457, 8636, 11237, 4436]

In [60]:
%%time
als_wrapper.recommend(users=[200197], dataset=dataset_train, k=10, filter_viewed=False).item_id.values

CPU times: total: 375 ms
Wall time: 235 ms


array([13865,  9728,  3734, 10440, 15297, 12995,  4457,  8636, 11237,
        4436], dtype=int64)

In [61]:
%%time
recos_hot_ann = pd.DataFrame([hot_test_users,
                              ann.get_item_list_for_user_batch(user_ids=hot_test_users,
                                                               top_n=K_RECOS)]).T
recos_hot_ann.columns = [Columns.User, Columns.Item]
recos_hot_ann = recos_hot_ann.explode(Columns.Item)
recos_hot_ann['score'] = 0
recos_hot_ann['rank'] = recos_hot_ann.groupby('user_id').cumcount() + 1
recos_hot_ann['score'] = K_RECOS - recos_hot_ann['rank']

CPU times: total: 56.5 s
Wall time: 1min 5s


In [62]:
metric_values_hot_ann = calc_metrics(
    metrics,
    reco=recos_hot_ann,
    interactions=hot_test,
    prev_interactions=train,
    catalog=catalog,
)

In [63]:
pd.DataFrame(metric_values_hot_ann, index=['AlternatingLeastSquares + ANN'])

Unnamed: 0,Precision@1,Recall@1,Precision@5,Recall@5,Precision@10,Recall@10,NDCG@1,NDCG@5,NDCG@10,MAP@1,MAP@5,MAP@10,MeanInvUserFreq@1,MeanInvUserFreq@5,MeanInvUserFreq@10,Serendipity@1,Serendipity@5,Serendipity@10
AlternatingLeastSquares + ANN,0.035007,0.016543,0.038371,0.09349,0.029209,0.136421,0.035007,0.038562,0.032129,0.016543,0.044085,0.050649,3.781641,4.229654,4.860103,2.1e-05,3e-05,3.4e-05


## Сравнение производительности

Рекомендации для тёплых юзеров с помощью ALS выдаются от 30 до 60 секунд

Рекомендации для тёплых юзеров с помощью ALS + ANN выдаются порядка минуты

Данное ухудшение производительности можно объяснить тем, что ALS использует все потоки, а ANN -- один поток, хотя у индекса nsmlib метод knnQueryBatch поумолчанию работает в многопоточном режиме, но при использовании обёртки ANN в rectools почему-то наблюдается использование одного потока (пытался вручную выставить num_threads в обёртке ANN в rectools, но производительность не изменилась)

In [64]:
%%time
recos_hot = als_wrapper.recommend(users=hot_test_users[:100], dataset=dataset_train, k=10, filter_viewed=True)

CPU times: total: 703 ms
Wall time: 525 ms


In [65]:
%%time
recos_hot_ann_test = pd.DataFrame([hot_test_users[:100],
                              ann.get_item_list_for_user_batch(user_ids=hot_test_users[:100],
                                                               top_n=K_RECOS)]).T
recos_hot_ann_test.columns = [Columns.User, Columns.Item]
recos_hot_ann_test = recos_hot_ann_test.explode(Columns.Item)
recos_hot_ann_test['score'] = 0
recos_hot_ann_test['rank'] = recos_hot_ann_test.groupby('user_id').cumcount() + 1
recos_hot_ann_test['score'] = K_RECOS - recos_hot_ann_test['rank']

CPU times: total: 234 ms
Wall time: 110 ms


Несмотря на проигрыш в множественной обработке, ANN быстрее работает с одиночными вызовами и малыми батчами

Выдачу рекомендаций для 100 юзеров ANN отработал примерно в 4 раза быстрее ALS