# Initialization

In [1]:
import logging

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'png'
%config InlineBackend.figure_format = 'retina'

# Загрузка данных

In [3]:
items = pd.read_parquet("items.par")
events = pd.read_parquet("events.par")

# Разбиение с учётом хронологии

Рекомендательные системы на практике работают с учётом хронологии. Поэтому поток событий для тренировки и валидации полезно делить на то, что уже случилось, и что ещё случится. Это позволяет проводить валидацию на тех же пользователях, на которых тренировались, но на их событиях в будущем.

# === Знакомство: "холодный" старт

In [4]:
# зададим точку разбиения
train_test_global_time_split_date = pd.to_datetime("2017-08-01").date()

train_test_global_time_split_idx = events["started_at"] < train_test_global_time_split_date
events_train = events[train_test_global_time_split_idx]
events_test = events[~train_test_global_time_split_idx]

# количество пользователей в train и test
users_train = events_train["user_id"].drop_duplicates()
users_test = events_test["user_id"].drop_duplicates()
# количество пользователей, которые есть и в train, и в test
common_users = set(users_train) & set(users_test)

print(len(users_train), len(users_test), len(common_users)) 

428220 123223 120858


In [5]:
cold_users = users_test[~users_test.isin(users_train)]

print(len(cold_users)) 

2365


In [6]:
top_pop_start_date = pd.to_datetime("2015-01-01").date()

item_popularity = events_train \
    .query("started_at >= @top_pop_start_date") \
    .groupby(["item_id"]).agg(users=("user_id", "nunique"), avg_rating=("rating", "mean")).reset_index()
item_popularity["popularity_weighted"] = item_popularity["users"] * item_popularity["avg_rating"]

# сортируем по убыванию взвешенной популярности
item_popularity = item_popularity.sort_values(["popularity_weighted"], ascending=False)

# выбираем первые 100 айтемов со средней оценкой avg_rating не меньше 4
top_k_pop_items = item_popularity[item_popularity["avg_rating"] > 4].head(100)

In [7]:
top_k_pop_items

Unnamed: 0,item_id,users,avg_rating,popularity_weighted
32387,18007564,20207,4.321275,87320.0
32623,18143977,19462,4.290669,83505.0
30695,16096824,16770,4.301014,72128.0
2,3,15139,4.706057,71245.0
3718,38447,14611,4.232770,61845.0
...,...,...,...,...
19596,2767052,4361,4.413437,19247.0
32835,18293427,4674,4.092640,19129.0
378,3636,4667,4.098564,19128.0
33611,18966819,4361,4.374914,19079.0


In [8]:
# добавляем информацию о книгах
top_k_pop_items = top_k_pop_items.merge(
    items.set_index("item_id")[["author", "title", "genre_and_votes", "publication_year"]], on="item_id")

with pd.option_context('display.max_rows', 100):
    display(top_k_pop_items[["item_id", "author", "title", "publication_year", "users", "avg_rating", "popularity_weighted", "genre_and_votes"]]) 

Unnamed: 0,item_id,author,title,publication_year,users,avg_rating,popularity_weighted,genre_and_votes
0,18007564,Andy Weir,The Martian,2014.0,20207,4.321275,87320.0,"{'Science Fiction': 11966, 'Fiction': 8430}"
1,18143977,Anthony Doerr,All the Light We Cannot See,2014.0,19462,4.290669,83505.0,"{'Historical-Historical Fiction': 13679, 'Fict..."
2,16096824,Sarah J. Maas,A Court of Thorns and Roses (A Court of Thorns...,2015.0,16770,4.301014,72128.0,"{'Fantasy': 14326, 'Young Adult': 4662, 'Roman..."
3,3,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,1997.0,15139,4.706057,71245.0,"{'Fantasy': 59818, 'Fiction': 17918, 'Young Ad..."
4,38447,Margaret Atwood,The Handmaid's Tale,1998.0,14611,4.23277,61845.0,"{'Fiction': 15424, 'Classics': 9937, 'Science ..."
5,15881,"J.K. Rowling, Mary GrandPré",Harry Potter and the Chamber of Secrets (Harry...,1999.0,13043,4.632447,60421.0,"{'Fantasy': 50130, 'Young Adult': 15202, 'Fict..."
6,11235712,Marissa Meyer,"Cinder (The Lunar Chronicles, #1)",2012.0,14348,4.179189,59963.0,"{'Young Adult': 10539, 'Fantasy': 9237, 'Scien..."
7,17927395,Sarah J. Maas,A Court of Mist and Fury (A Court of Thorns an...,2016.0,12177,4.73064,57605.0,"{'Fantasy': 10186, 'Romance': 3346, 'Young Adu..."
8,18692431,"Nicola Yoon, David Yoon","Everything, Everything",2015.0,14121,4.071454,57493.0,"{'Young Adult': 5175, 'Romance': 3234, 'Contem..."
9,5,"J.K. Rowling, Mary GrandPré",Harry Potter and the Prisoner of Azkaban (Harr...,2004.0,11890,4.770143,56717.0,"{'Fantasy': 49784, 'Young Adult': 15393, 'Fict..."


In [9]:
cold_users_events_with_recs = \
    events_test[events_test["user_id"].isin(cold_users)] \
    .merge(top_k_pop_items.set_index("item_id")["avg_rating"], on="item_id", how="left")

cold_user_items_no_avg_rating_idx = cold_users_events_with_recs["avg_rating"].isnull()
cold_user_recs = cold_users_events_with_recs[~cold_user_items_no_avg_rating_idx] \
    [["user_id", "item_id", "rating", "avg_rating"]]

In [10]:
cold_users_events_with_recs[~cold_user_items_no_avg_rating_idx]

Unnamed: 0,user_id,item_id,started_at,read_at,is_read,rating,is_reviewed,started_at_month,user_id_enc,avg_rating
12,00542b28105d345f43732ed99791c5b2,1885,2017-10-04,2017-10-06,True,5,False,2017-10-01,1000504,4.316316
15,00729c108d192a9f146121683ea93d70,13496,2017-08-08,2017-08-12,True,5,False,2017-08-01,1000712,4.440779
22,00edde290a5c3c6994c23e05f8c5710d,1885,2017-09-15,2017-09-19,True,5,False,2017-09-01,1001508,4.316316
24,015b058cd8c3cf8dfd54543036b2552a,18966819,2017-08-08,2017-08-13,True,5,True,2017-08-01,1002222,4.374914
26,015b058cd8c3cf8dfd54543036b2552a,15839976,2017-08-04,2017-08-07,True,4,True,2017-08-01,1002222,4.150180
...,...,...,...,...,...,...,...,...,...,...
9637,ff388b28894b9b1bdc52a63e02676a9f,18584855,2017-10-10,2017-10-14,True,4,True,2017-10-01,1429316,4.071619
9638,ff388b28894b9b1bdc52a63e02676a9f,9361589,2017-10-09,2017-10-16,True,2,True,2017-10-01,1429316,4.085858
9642,ff71a402b1a4895a18f2e17abb201da0,9969571,2017-08-13,2017-08-25,True,4,False,2017-08-01,1429720,4.290044
9643,ff7e01b5844ff89f8965ba33708ba883,11235712,2017-10-22,2017-10-24,True,5,False,2017-10-01,1429798,4.179189


In [11]:
# посчитаем метрики рекомендаций
from sklearn.metrics import root_mean_squared_error, mean_absolute_error

rmse = root_mean_squared_error(cold_user_recs["rating"], cold_user_recs["avg_rating"])
mae = mean_absolute_error(cold_user_recs["rating"], cold_user_recs["avg_rating"])
print(round(rmse, 2), round(mae, 2))

0.78 0.62


In [12]:
# посчитаем покрытие холодных пользователей рекомендациями

cold_users_hit_ratio = cold_users_events_with_recs.groupby("user_id").agg(hits=("avg_rating", lambda x: (~x.isnull()).mean()))

print(f"Доля пользователей без релевантных рекомендаций: {(cold_users_hit_ratio == 0).mean().iat[0]:.2f}")
print(f"Среднее покрытие пользователей: {cold_users_hit_ratio[cold_users_hit_ratio['hits'] != 0].mean().iat[0]:.2f}") 

Доля пользователей без релевантных рекомендаций: 0.59
Среднее покрытие пользователей: 0.44


In [13]:
cold_users_hit_ratio == 0

Unnamed: 0_level_0,hits
user_id,Unnamed: 1_level_1
001ae592ce3cdb7abb6f19b9b4d19638,True
0034bb27bb201328b1781f8c0897e612,True
00542b28105d345f43732ed99791c5b2,False
00729c108d192a9f146121683ea93d70,False
008193fdfca6563d5d3816acd4f1059a,True
...,...
ff83f5ecb45dac6f1946534b07a7c18f,False
ff8ab8e812178091f1d5c30bdca7edd9,True
ffa5a298bacd65ad27e3b495596a8854,True
ffd5d54aadefbc17feb8a6a0e7a89bb4,True


In [14]:
cold_users_hit_ratio[cold_users_hit_ratio["hits"] != 0]

Unnamed: 0_level_0,hits
user_id,Unnamed: 1_level_1
00542b28105d345f43732ed99791c5b2,0.333333
00729c108d192a9f146121683ea93d70,0.333333
00edde290a5c3c6994c23e05f8c5710d,0.333333
015b058cd8c3cf8dfd54543036b2552a,0.500000
017a95d545a555d331a1a866203035e7,0.500000
...,...
ff321092e01b7af9f2222a32bf70beb5,0.250000
ff388b28894b9b1bdc52a63e02676a9f,0.500000
ff71a402b1a4895a18f2e17abb201da0,0.500000
ff7e01b5844ff89f8965ba33708ba883,0.500000


# === Знакомство: первые персональные рекомендации

In [15]:
events_test[['user_id', 'item_id', 'rating']].sample(5)

Unnamed: 0,user_id,item_id,rating
8084869,4ac1d6f5d6f9f2930d13c656ea218026,14383,4
10310190,3e146bd5bd01924b67926305fdcf5691,3,3
6518828,960a479cffb14e601cd13b33677bc066,17165932,3
2028640,3257771ed46776822fb41b898566f4ed,6185,5
10903341,00df903feb9e61aefaa6986340c2d000,58822,3


In [16]:
from surprise import Dataset, Reader
from surprise import SVD

# используем Reader из библиотеки surprise для преобразования событий (events)
# в формат, необходимый surprise
reader = Reader(rating_scale=(1, 5))
surprise_train_set = Dataset.load_from_df(events_train[['user_id', 'item_id', 'rating']], reader)
surprise_train_set = surprise_train_set.build_full_trainset()

# инициализируем модель
svd_model = SVD(n_factors=100, random_state=0)

# обучаем модель
svd_model.fit(surprise_train_set) 

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f15fc2de0b0>

In [17]:
surprise_test_set = list(events_test[['user_id', 'item_id', 'rating']].itertuples(index=False))

# получаем рекомендации для тестовой выборки
svd_predictions = svd_model.test(surprise_test_set) 

In [18]:
from surprise import accuracy

rmse = accuracy.rmse(svd_predictions)
mae = accuracy.mae(svd_predictions)
                     
print(rmse, mae) 

RMSE: 0.8262
MAE:  0.6460
0.8261606596586795 0.645993803519407


In [19]:
from surprise import NormalPredictor

# инициализируем состояние генератора, это необходимо для получения
# одной и той же последовательности случайных чисел, только в учебных целях
np.random.seed(0)

random_model = NormalPredictor()

random_model.fit(surprise_train_set)
random_predictions = random_model.test(surprise_test_set) 

In [20]:
rmse_rand = accuracy.rmse(random_predictions)
mae_rand = accuracy.mae(random_predictions)
                     
print(rmse, mae) 

RMSE: 1.2610
MAE:  1.0004
0.8261606596586795 0.645993803519407


In [21]:
mae_rand / mae

1.5486479201292755

In [22]:
def get_recommendations_svd(user_id, all_items, events, model, include_seen=True, n=5):

    """ возвращает n рекомендаций для user_id """
    
    # получим список идентификаторов всех книг
    all_items = set(events['item_id'].unique())
        
    # учитываем флаг, стоит ли уже прочитанные книги включать в рекомендации
    if include_seen:
        items_to_predict = list(all_items)
    else:
        # получим список книг, которые пользователь уже прочитал ("видел")
        seen_items = set(events[events['rating'].notna()]['item_id'].unique())
        
        # книги, которые пользователь ещё не читал
        # только их и будем включать в рекомендации
        items_to_predict = list(all_items - seen_items)
    
    # получаем скоры для списка книг, т. е. рекомендации
    predictions = [model.predict(user_id, item_id) for item_id in items_to_predict]
    
    # сортируем рекомендации по убыванию скора и берём только n первых
    recommendations = sorted(predictions, key=lambda x: x.est, reverse=True)[:n]
    
    return pd.DataFrame([(pred.iid, pred.est) for pred in recommendations], columns=["item_id", "score"]) 

In [23]:
get_recommendations_svd(1296647, items, events_train, svd_model, include_seen=True) 

Unnamed: 0,item_id,score
0,24812,4.973414
1,22037424,4.927008
2,11221285,4.908364
3,33353628,4.881724
4,54741,4.866328


Выведем последние события для случайного пользователя

In [24]:
# выберем произвольного пользователя из тренировочной выборки ("прошлого")
user_id = events_train['user_id'].sample().iat[0]

print(f"user_id: {user_id}")

print("История (последние события, recent)")
user_history = (
    events_train
    .query("user_id == @user_id")
    .merge(items.set_index("item_id")[["author", "title", "genre_and_votes"]], on="item_id")
)
user_history_to_print = user_history[["author", "title", "started_at", "read_at", "rating", "genre_and_votes"]].tail(10)
display(user_history_to_print)

print("Рекомендации")
user_recommendations = get_recommendations_svd(user_id, items, events_train, svd_model)
user_recommendations = user_recommendations.merge(items[["item_id", "author", "title", "genre_and_votes"]], on="item_id")
display(user_recommendations)

user_id: 415d108afffbba9c2b2599430da64b49
История (последние события, recent)


Unnamed: 0,author,title,started_at,read_at,rating,genre_and_votes
0,Patrick Rothfuss,"The Wise Man's Fear (The Kingkiller Chronicle,...",2015-06-17,2015-08-31,5,"{'Fantasy': 16491, 'Fiction': 2222, 'Fantasy-E..."
1,Ken Follett,"Fall of Giants (The Century Trilogy, #1)",2016-05-12,2016-08-30,4,"{'Historical-Historical Fiction': 4665, 'Ficti..."
2,Brandon Sanderson,"The Way of Kings (The Stormlight Archive, #1)",2015-08-31,2015-10-30,5,"{'Fantasy': 14291, 'Fiction': 1623, 'Fantasy-E..."
3,Andy Weir,The Martian,2014-12-07,2014-12-11,4,"{'Science Fiction': 11966, 'Fiction': 8430}"
4,Brandon Sanderson,"Words of Radiance (The Stormlight Archive, #2)",2015-10-30,2016-03-17,5,"{'Fantasy': 8542, 'Fiction': 872, 'Fantasy-Epi..."


Рекомендации


Unnamed: 0,item_id,score,author,title,genre_and_votes
0,22037424,5.0,"J.K. Rowling, Jonny Duddle, Tomislav Tomić",Harry Potter and the Prisoner of Azkaban (Harr...,"{'Fantasy': 49994, 'Young Adult': 15433, 'Fict..."
1,11221285,5.0,Brandon Sanderson,"The Way of Kings, Part 2 (The Stormlight Archi...","{'Fantasy': 641, 'Fiction': 46, 'Fantasy-Epic ..."
2,19219646,5.0,Wolfgang Herrndorf,Arbeit und Struktur,"{'Nonfiction': 25, 'European Literature-German..."
3,280111,4.986681,Anonymous,Holy Bible: New International Version,"{'Religion': 422, 'Christian': 386, 'Nonfictio..."
4,2168850,4.985542,"محمد بن إدريس الشافعي, إميل بديع يعقوب",ديوان الإمام الشافعي,"{'Poetry': 93, 'Religion': 15, 'Literature': 1..."


# === Базовые подходы: коллаборативная фильтрация

In [7]:
import scipy
import sklearn.preprocessing

# перекодируем идентификаторы пользователей: 
# из имеющихся в последовательность 0, 1, 2, ...
user_encoder = sklearn.preprocessing.LabelEncoder()
user_encoder.fit(events["user_id"])
events_train.loc[:, "user_id_enc"] = user_encoder.transform(events_train["user_id"])
events_test["user_id_enc"] = user_encoder.transform(events_test["user_id"])

# перекодируем идентификаторы объектов: 
# из имеющихся в последовательность 0, 1, 2, ...
item_encoder = sklearn.preprocessing.LabelEncoder()
item_encoder.fit(items["item_id"])
items["item_id_enc"] = item_encoder.transform(items["item_id"])
events_train["item_id_enc"] = item_encoder.transform(events_train["item_id"])
events_test["item_id_enc"] = item_encoder.transform(events_test["item_id"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_test["user_id_enc"] = user_encoder.transform(events_test["user_id"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_train["item_id_enc"] = item_encoder.transform(events_train["item_id"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_test["item_id_enc"] = item_encoder.transform

In [26]:
events_train["item_id_enc"].max()

43304

In [27]:
# создаём sparse-матрицу формата CSR 
user_item_matrix_train = scipy.sparse.csr_matrix((
    events_train["rating"],
    (events_train['user_id_enc'], events_train['item_id_enc'])),
    dtype=np.int8) 

In [28]:
import sys

sum([sys.getsizeof(i) for i in user_item_matrix_train.data])/1024**3 

0.26370687410235405

In [29]:
from implicit.als import AlternatingLeastSquares

als_model = AlternatingLeastSquares(factors=50, iterations=50, regularization=0.05, random_state=0)
als_model.fit(user_item_matrix_train)

  from .autonotebook import tqdm as notebook_tqdm
  check_blas_config()
100%|██████████| 50/50 [03:04<00:00,  3.69s/it]


Получение рекомендаций для конкретного пользователя

In [30]:
def get_recommendations_als(user_item_matrix, model, user_id, user_encoder, item_encoder, include_seen=True, n=5):
    """
    Возвращает отранжированные рекомендации для заданного пользователя
    """
    user_id_enc = user_encoder.transform([user_id])[0]
    recommendations = model.recommend(
         user_id_enc, 
         user_item_matrix[user_id_enc], 
         filter_already_liked_items=not include_seen,
         N=n)
    recommendations = pd.DataFrame({"item_id_enc": recommendations[0], "score": recommendations[1]})
    recommendations["item_id"] = item_encoder.inverse_transform(recommendations["item_id_enc"])
    
    return recommendations

In [31]:
user_item_matrix_test = scipy.sparse.csr_matrix((
    events_test["rating"],
    (events_test['user_id_enc'], events_test['item_id_enc'])),
    dtype=np.int8) 

In [32]:
get_recommendations_als(user_item_matrix_test, als_model, '00000377eea48021d3002730d56aca9a', user_encoder, item_encoder, include_seen=True)

Unnamed: 0,item_id_enc,score,item_id
0,2,0.990941,3
1,1942,0.896617,15881
2,3,0.864404,5
3,4,0.822254,6
4,1,0.774095,2


In [33]:
# получаем список всех возможных user_id (перекодированных)
user_ids_encoded = range(len(user_encoder.classes_))

# получаем рекомендации для всех пользователей
als_recommendations = als_model.recommend(
    user_ids_encoded, 
    user_item_matrix_train[user_ids_encoded], 
    filter_already_liked_items=False, N=100)

In [34]:
als_recommendations

(array([[    2,  1942,     3, ..., 28836, 30688, 10393],
        [31432, 29792, 36956, ...,   533, 32060, 34554],
        [35810, 33276, 37255, ..., 31562, 41459,  1043],
        ...,
        [20997, 20386, 23004, ...,  2293, 28200, 29560],
        [22844, 28025, 37138, ..., 37914,   422,  4112],
        [41809, 34434, 35669, ..., 33675, 28263, 22072]], dtype=int32),
 array([[0.99094146, 0.89661723, 0.8644041 , ..., 0.2261226 , 0.22548363,
         0.22546645],
        [0.674292  , 0.6229848 , 0.49019852, ..., 0.02235501, 0.02226192,
         0.02225844],
        [0.24119437, 0.22116913, 0.18066649, ..., 0.04201685, 0.04178948,
         0.04172034],
        ...,
        [0.23566297, 0.23407641, 0.22276123, ..., 0.02843785, 0.02830932,
         0.02820013],
        [0.05539129, 0.03866215, 0.03835723, ..., 0.01568658, 0.01557466,
         0.01546565],
        [0.47294533, 0.46393558, 0.4604288 , ..., 0.09494869, 0.09492695,
         0.09303415]], dtype=float32))

In [35]:
# преобразуем полученные рекомендации в табличный формат
item_ids_enc = als_recommendations[0]
als_scores = als_recommendations[1]

als_recommendations = pd.DataFrame({
    "user_id_enc": user_ids_encoded,
    "item_id_enc": item_ids_enc.tolist(), 
    "score": als_scores.tolist()})
als_recommendations = als_recommendations.explode(["item_id_enc", "score"], ignore_index=True)

# приводим типы данных
als_recommendations["item_id_enc"] = als_recommendations["item_id_enc"].astype("int")
als_recommendations["score"] = als_recommendations["score"].astype("float")

# получаем изначальные идентификаторы
als_recommendations["user_id"] = user_encoder.inverse_transform(als_recommendations["user_id_enc"])
als_recommendations["item_id"] = item_encoder.inverse_transform(als_recommendations["item_id_enc"])
als_recommendations = als_recommendations.drop(columns=["user_id_enc", "item_id_enc"])

In [36]:
als_recommendations

Unnamed: 0,score,user_id,item_id
0,0.990941,00000377eea48021d3002730d56aca9a,3
1,0.896617,00000377eea48021d3002730d56aca9a,15881
2,0.864404,00000377eea48021d3002730d56aca9a,5
3,0.822254,00000377eea48021d3002730d56aca9a,6
4,0.774095,00000377eea48021d3002730d56aca9a,2
...,...,...,...
43058495,0.096082,fffff8a718843c0e11dfd93fb41c1297,13206900
43058496,0.096065,fffff8a718843c0e11dfd93fb41c1297,5060378
43058497,0.094949,fffff8a718843c0e11dfd93fb41c1297,16071764
43058498,0.094927,fffff8a718843c0e11dfd93fb41c1297,9969571


In [37]:
als_recommendations = als_recommendations[["user_id", "item_id", "score"]]
als_recommendations.to_parquet("als_recommendations.parquet")

Для удобства оценивания добавим истинные оценки пользователей

In [38]:
als_recommendations = (
    als_recommendations
    .merge(events_test[["user_id", "item_id", "rating"]]
               .rename(columns={"rating": "rating_test"}), 
           on=["user_id", "item_id"], how="left")
) 

In [39]:
import sklearn.metrics

def compute_ndcg(rating: pd.Series, score: pd.Series, k):

    """ подсчёт ndcg
    rating: истинные оценки
    score: оценки модели
    k: количество айтемов (по убыванию score) для оценки, остальные - отбрасываются
    """
    
    # если кол-во объектов меньше 2, то NDCG - не определена
    if len(rating) < 2:
        return np.nan

    ndcg = sklearn.metrics.ndcg_score(np.asarray([rating.to_numpy()]), np.asarray([score.to_numpy()]), k=k)

    return ndcg

In [40]:
rating_test_idx = ~als_recommendations["rating_test"].isnull()
ndcg_at_5_scores = als_recommendations[rating_test_idx].groupby("user_id").apply(lambda x: compute_ndcg(x["rating_test"], x["score"], k=5)) 

  ndcg_at_5_scores = als_recommendations[rating_test_idx].groupby("user_id").apply(lambda x: compute_ndcg(x["rating_test"], x["score"], k=5))


In [41]:
print(ndcg_at_5_scores.mean()) 

0.975946709792109


# === Базовые подходы: контентные рекомендации

In [10]:
items["genre_and_votes"] = items["genre_and_votes"].apply(eval)

In [11]:
def get_genres(items):

    """ 
    извлекает список жанров по всем книгам, 
    подсчитывает долю голосов по каждому их них
    """
    
    genres_counter = {}
    
    for k, v, in items.iterrows():
        genre_and_votes = v["genre_and_votes"]
        if genre_and_votes is None or not isinstance(genre_and_votes, dict):
            continue
        for genre, votes in genre_and_votes.items():
            # увеличиваем счётчик жанров
            try:
                genres_counter[genre] += votes
            except KeyError:
                genres_counter[genre] = 0

    genres = pd.Series(genres_counter, name="votes")
    genres = genres.to_frame()
    genres = genres.reset_index().rename(columns={"index": "name"})
    genres.index.name = "genre_id"
    
    return genres
   
genres = get_genres(items) 

In [12]:
genres["score"] = genres["votes"] / genres["votes"].sum()
genres.sort_values(by="score", ascending=False).head(10) 

Unnamed: 0_level_0,name,votes,score
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
25,Fantasy,6850060,0.149651
1,Fiction,6406256,0.139955
38,Classics,3414934,0.074605
18,Young Adult,3296951,0.072027
34,Romance,2422614,0.052926
5,Nonfiction,1737406,0.037957
16,Historical-Historical Fiction,1531205,0.033452
20,Mystery,1371196,0.029956
24,Science Fiction,1218917,0.026629
33,Fantasy-Paranormal,857012,0.018723


In [13]:
def get_item2genre_matrix(genres, items):

    genre_names_to_id = genres.reset_index().set_index("name")["genre_id"].to_dict()
    
    # list to build CSR matrix
    genres_csr_data = []
    genres_csr_row_idx = []
    genres_csr_col_idx = []
    
    for item_idx, (k, v) in enumerate(items.iterrows()):
        if v["genre_and_votes"] is None:
            continue
        for genre_name, votes in v["genre_and_votes"].items():
            genre_idx = genre_names_to_id[genre_name]
            genres_csr_data.append(int(votes))
            genres_csr_row_idx.append(item_idx)
            genres_csr_col_idx.append(genre_idx)

    genres_csr = scipy.sparse.csr_matrix((genres_csr_data, (genres_csr_row_idx, genres_csr_col_idx)), shape=(len(items), len(genres)))
    # нормализуем, чтобы сумма оценок принадлежности к жанру была равна 1
    genres_csr = sklearn.preprocessing.normalize(genres_csr, norm='l1', axis=1)
    
    return genres_csr

In [14]:
items = items.sort_values(by="item_id_enc")
all_items_genres_csr = get_item2genre_matrix(genres, items)

Проба

In [47]:
user_id = 100010
user_events = events_train.query("user_id_enc == @user_id")[["item_id", "rating"]]
user_items = items[items["item_id"].isin(user_events["item_id"])]

user_items_genres_csr = get_item2genre_matrix(genres, items[items["item_id"].isin(events[events["user_id"] == user_id]["item_id"])])
user_items_genres_csr

ValueError: Found array with 0 sample(s) (shape=(0, 815)) while a minimum of 1 is required by the normalize function.

In [15]:
user_id = 10
user_events = events_train.query("user_id_enc == @user_id")[["item_id", "rating"]]
user_items = items[items["item_id"].isin(user_events["item_id"])]

user_items_genres_csr = get_item2genre_matrix(genres, user_items)
user_items_genres_csr

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 149 stored elements and shape (22, 815)>

In [16]:
# вычислим склонность пользователя к жанрам как среднее взвешенное значение популяции на его оценки книг.

# преобразуем пользовательские оценки из списка в вектор-столбец
user_ratings = user_events["rating"].to_numpy() / 5
user_ratings = np.expand_dims(user_ratings, axis=1)

user_items_genres_weighted = user_items_genres_csr.multiply(user_ratings)

user_genres_scores = np.asarray(user_items_genres_weighted.mean(axis=0))

In [17]:
# выведем список жанров, которые предпочитает пользователь

user_genres = genres.copy()
user_genres["score"] = np.ravel(user_genres_scores)
user_genres = user_genres[user_genres["score"] > 0].sort_values(by=["score"], ascending=False)

user_genres.head(5)

Unnamed: 0_level_0,name,votes,score
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Fiction,6406256,0.188691
38,Classics,3414934,0.106022
25,Fantasy,6850060,0.078959
5,Nonfiction,1737406,0.048495
24,Science Fiction,1218917,0.040443


In [21]:
from sklearn.metrics.pairwise import cosine_similarity

# вычисляем сходство между вектором пользователя и векторами по книгам
similarity_scores = cosine_similarity(all_items_genres_csr, user_genres_scores)

# преобразуем в одномерный массив
similarity_scores = similarity_scores.flatten()

# получаем индексы top-k (по убыванию значений), по сути, индексы книг (encoded)
k = 5
top_k_indices = np.argsort(similarity_scores)[::-1][:k]

In [22]:
top_k_indices

array([ 4471, 14087,  4460, 36093,  1120])

In [23]:
selected_items = items[items["item_id_enc"].isin(top_k_indices)]

with pd.option_context("max_colwidth", 100):
   display(selected_items[["author", "title", "genre_and_votes"]])

Unnamed: 0,author,title,genre_and_votes
564712,Ray Bradbury,"Farewell Summer (Green Town, #3)","{'Fiction': 170, 'Fantasy': 72, 'Science Fiction': 72, 'Classics': 52}"
80465,G.K. Chesterton,The Napoleon of Notting Hill,"{'Fiction': 166, 'Classics': 88, 'Fantasy': 44, 'Humor': 22, 'Literature': 20}"
1168335,Ray Bradbury,"Dandelion Wine (Green Town, #1)","{'Fiction': 1438, 'Classics': 914, 'Science Fiction': 529, 'Fantasy': 456, 'Young Adult': 212}"
2244467,Samuel Butler,"Erewhon (Erewhon , #1)","{'Fiction': 162, 'Classics': 139, 'Science Fiction': 60, 'Fantasy': 55}"
39408,"Paulo Coelho, Alan R. Clarke, James Noel Smith",The Alchemist,"{'Fiction': 14023, 'Classics': 5787, 'Fantasy': 3289, 'Philosophy': 2759}"


# === Базовые подходы: валидация

In [48]:
def process_events_recs_for_binary_metrics(events_train, events_test, recs, top_k=None):

    """
    размечает пары <user_id, item_id> для общего множества пользователей признаками
    - gt (ground truth)
    - pr (prediction)
    top_k: расчёт ведётся только для top k-рекомендаций
    """

    events_test["gt"] = True
    common_users = set(events_test["user_id"]) & set(recs["user_id"])

    print(f"Common users: {len(common_users)}")
    
    events_for_common_users = events_test[events_test["user_id"].isin(common_users)].copy()
    recs_for_common_users = recs[recs["user_id"].isin(common_users)].copy()

    recs_for_common_users = recs_for_common_users.sort_values(["user_id", "score"], ascending=[True, False])

    # оставляет только те item_id, которые были в events_train, 
    # т. к. модель не имела никакой возможности давать рекомендации для новых айтемов
    events_for_common_users = events_for_common_users[events_for_common_users["item_id"].isin(events_train["item_id"].unique())]

    if top_k is not None:
        recs_for_common_users = recs_for_common_users.groupby("user_id").head(top_k)
    
    events_recs_common = events_for_common_users[["user_id", "item_id", "gt"]].merge(
        recs_for_common_users[["user_id", "item_id", "score"]], 
        on=["user_id", "item_id"], how="outer")    

    events_recs_common["gt"] = events_recs_common["gt"].fillna(False)
    events_recs_common["pr"] = ~events_recs_common["score"].isnull()
    
    events_recs_common["tp"] = events_recs_common["gt"] & events_recs_common["pr"]
    events_recs_common["fp"] = ~events_recs_common["gt"] & events_recs_common["pr"]
    events_recs_common["fn"] = events_recs_common["gt"] & ~events_recs_common["pr"]

    return events_recs_common

In [55]:
events_recs_for_binary_metrics = process_events_recs_for_binary_metrics(
  events_train,
    events_test, 
    als_recommendations, 
    top_k=5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_test["gt"] = True


Common users: 123223


  events_recs_common["gt"] = events_recs_common["gt"].fillna(False)


In [49]:
def compute_cls_metrics(events_recs_for_binary_metric):
    
    groupper = events_recs_for_binary_metric.groupby("user_id")

    # precision = tp / (tp + fp)
    precision = groupper["tp"].sum()/(groupper["tp"].sum()+groupper["fp"].sum())
    precision = precision.fillna(0).mean()
    
    # recall = tp / (tp + fn)
    recall = groupper["tp"].sum()/(groupper["tp"].sum()+groupper["fn"].sum())
    recall = recall.fillna(0).mean()

    return precision, recall 

In [57]:
compute_cls_metrics(events_recs_for_binary_metrics)

(0.007581376853347184, 0.014121568795222568)

# === Двухстадийный подход: метрики

In [58]:
# расчёт покрытия по объектам
cov_items = als_recommendations["item_id"].nunique()/items["item_id"].nunique()
print(f"{cov_items:.2f}")

0.09


In [24]:
# разметим каждую рекомендацию признаком read
events_train["read"] = True
als_recommendations = als_recommendations.merge(events_train[["read", "user_id", "item_id"]], on=["user_id", "item_id"], how="left")
als_recommendations["read"] = als_recommendations["read"].fillna(False).astype("bool")

# проставим ранги
als_recommendations = als_recommendations.sort_values("score", ascending=False)
als_recommendations["rank"] = als_recommendations.groupby("user_id").cumcount() + 1

# посчитаем novelty по пользователям
novelty_5 = (1-als_recommendations.query("rank <= 5").groupby("user_id")["read"].mean())

# посчитаем средний novelty
novelty_5.mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_train["read"] = True


NameError: name 'als_recommendations' is not defined

# === Двухстадийный подход: модель

In [25]:
# задаём точку разбиения
split_date_for_labels = pd.to_datetime("2017-09-15").date()

split_date_for_labels_idx = events_test["started_at"] < split_date_for_labels
events_labels = events_test[split_date_for_labels_idx].copy()
events_test_2 = events_test[~split_date_for_labels_idx].copy()

In [26]:
events_labels["user_id"].nunique()

99849

In [31]:
# загружаем рекомендации от двух базовых генераторов
als_recommendations = pd.read_parquet("candidates/training/als_recommendations.parquet")
content_recommendations = pd.read_parquet("candidates/training/content_recommendations.parquet")

candidates = pd.merge(
    als_recommendations[["user_id", "item_id", "score"]].rename(columns={"score": "als_score"}),
    content_recommendations[["user_id", "item_id", "score"]].rename(columns={"score": "cnt_score"}),
    on=["user_id", "item_id"],
    how="outer")

In [27]:
events_labels["user_id_enc"] = events_labels["user_id_enc"] + 1000000

In [32]:
candidates.rename(columns={"user_id": "user_id_enc"}, inplace=True)

In [65]:
events_labels

Unnamed: 0,user_id,item_id,started_at,read_at,is_read,rating,is_reviewed,started_at_month,user_id_enc,item_id_enc,gt
2478089,00014c578111090720e20f5705eba051,18812405,2017-08-05,2017-08-19,True,3,False,2017-08-01,1000006,37138,True
2478098,00014c578111090720e20f5705eba051,29868610,2017-08-30,2017-09-16,True,4,False,2017-08-01,1000006,42017,True
2478104,00014c578111090720e20f5705eba051,7445,2017-08-26,2017-08-30,True,4,False,2017-08-01,1000006,868,True
11383749,000157a6f8331e9c9a21252e1fee91d1,168642,2017-09-05,2017-09-17,True,5,True,2017-09-01,1000007,9074,True
11383750,000157a6f8331e9c9a21252e1fee91d1,28257707,2017-08-16,2017-09-06,True,4,True,2017-08-01,1000007,41610,True
...,...,...,...,...,...,...,...,...,...,...,...
1285201,ffff601c0ffa34bd5ffbbf2caee30644,269362,2017-08-30,2017-08-30,True,4,False,2017-08-01,1430578,11072,True
1285200,ffff601c0ffa34bd5ffbbf2caee30644,830502,2017-09-09,2017-10-11,True,5,False,2017-09-01,1430578,16489,True
1941753,ffff7cafdaf5196383cb2efca08fb6fe,27272506,2017-09-03,2017-10-07,True,3,True,2017-09-01,1430579,41386,True
5625380,fffff8a718843c0e11dfd93fb41c1297,23395680,2017-08-09,2017-08-15,True,4,True,2017-08-01,1430584,39645,True


In [66]:
candidates

Unnamed: 0,user_id_enc,item_id,als_score,cnt_score
0,1000000,1,0.756692,0.933434
1,1000000,2,0.792929,0.925806
2,1000000,3,0.972557,0.920225
3,1000000,5,0.865850,0.918026
4,1000000,6,0.834282,0.916345
...,...,...,...,...
82993089,1430584,31327371,,0.786363
82993090,1430584,32841355,,0.784905
82993091,1430584,33828743,,0.784706
82993092,1430584,34037113,,0.784556


In [33]:
# добавляем таргет к кандидатам со значением:
# — 1 для тех item_id, которые пользователь прочитал
# — 0, для всех остальных 

events_labels["target"] = 1
candidates = candidates.merge(events_labels[["user_id_enc", "item_id", "target"]], 
                              on=["user_id_enc", "item_id"], how="left")
candidates["target"] = candidates["target"].fillna(0).astype("int")

# в кандидатах оставляем только тех пользователей, у которых есть хотя бы один положительный таргет
candidates_to_sample = candidates.groupby("user_id_enc").filter(lambda x: x["target"].sum() > 0)

# для каждого пользователя оставляем только 4 негативных примера
negatives_per_user = 4
candidates_for_train = pd.concat([
        candidates_to_sample.query("target == 1"),
        candidates_to_sample.query("target == 0") \
        .groupby("user_id_enc") \
        .apply(lambda x: x.sample(negatives_per_user, random_state=0)),
    ])

  .apply(lambda x: x.sample(negatives_per_user, random_state=0)),


In [68]:
candidates

Unnamed: 0,user_id_enc,item_id,als_score,cnt_score,target
0,1000000,1,0.756692,0.933434,0
1,1000000,2,0.792929,0.925806,0
2,1000000,3,0.972557,0.920225,0
3,1000000,5,0.865850,0.918026,0
4,1000000,6,0.834282,0.916345,0
...,...,...,...,...,...
82993089,1430584,31327371,,0.786363,0
82993090,1430584,32841355,,0.784905,0
82993091,1430584,33828743,,0.784706,0
82993092,1430584,34037113,,0.784556,0


In [69]:
candidates_to_sample

Unnamed: 0,user_id_enc,item_id,als_score,cnt_score,target
1169,1000006,1,0.824286,,0
1170,1000006,2,0.842997,,0
1171,1000006,3,0.929247,,0
1172,1000006,5,0.887194,,0
1173,1000006,6,0.872281,,0
...,...,...,...,...,...
82993089,1430584,31327371,,0.786363,0
82993090,1430584,32841355,,0.784905,0
82993091,1430584,33828743,,0.784706,0
82993092,1430584,34037113,,0.784556,0


In [70]:
candidates_for_train

Unnamed: 0,user_id_enc,item_id,als_score,cnt_score,target
1184,1000006,7445,0.230529,,1
1316,1000006,18812405,0.178382,,1
1360,1000006,29868610,0.286715,,1
3725,1000019,37415,0.043595,,1
4500,1000023,5094,0.082626,,1
...,...,...,...,...,...
"(1430579, 82992097)",1430579,23009402,0.721888,,0
"(1430584, 82992912)",1430584,33555,,0.841958,0
"(1430584, 82993064)",1430584,24378015,0.126203,,0
"(1430584, 82993000)",1430584,9736930,0.116841,,0


In [34]:
len(candidates_for_train)

213708

In [35]:
from catboost import CatBoostClassifier, Pool

# задаём имена колонок признаков и таргета
features = ['als_score', 'cnt_score']
target = 'target'

# Create the Pool object
train_data = Pool(
    data=candidates_for_train[features], 
    label=candidates_for_train[target])

# инициализируем модель CatBoostClassifier
cb_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    verbose=100,
    random_seed=0
)

# тренируем модель
cb_model.fit(train_data)

0:	learn: 0.6490473	total: 78.5ms	remaining: 1m 18s
100:	learn: 0.5023899	total: 1.94s	remaining: 17.2s
200:	learn: 0.5015905	total: 3.91s	remaining: 15.5s
300:	learn: 0.5008853	total: 5.9s	remaining: 13.7s
400:	learn: 0.5002944	total: 7.86s	remaining: 11.7s
500:	learn: 0.4997685	total: 9.84s	remaining: 9.8s
600:	learn: 0.4992607	total: 11.8s	remaining: 7.84s
700:	learn: 0.4988429	total: 13.8s	remaining: 5.9s
800:	learn: 0.4984777	total: 15.8s	remaining: 3.94s
900:	learn: 0.4981234	total: 17.9s	remaining: 1.96s
999:	learn: 0.4977696	total: 19.9s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7faa6c70a890>

In [29]:
events_test_2["user_id_enc"] = events_test_2["user_id_enc"] + 1000000

In [30]:
# загружаем рекомендации от двух базовых генераторов
als_recommendations_2 = pd.read_parquet("candidates/inference/als_recommendations.parquet")
content_recommendations_2 = pd.read_parquet("candidates/inference/content_recommendations.parquet")

candidates_to_rank = pd.merge(
    als_recommendations_2[["user_id", "item_id", "score"]].rename(columns={"score": "als_score"}),
    content_recommendations_2[["user_id", "item_id", "score"]].rename(columns={"score": "cnt_score"}),
    on=["user_id", "item_id"],
    how="outer"
    )

# оставляем только тех пользователей, что есть в тестовой выборке, для экономии ресурсов
candidates_to_rank = candidates_to_rank[candidates_to_rank["user_id"].isin(events_test_2["user_id_enc"].drop_duplicates())]
print(len(candidates_to_rank))

14517152


In [40]:
candidates_to_rank

Unnamed: 0,user_id,item_id,als_score,cnt_score
588,1000003,1232,0.484089,
589,1000003,3636,0.579718,
590,1000003,3985,,0.924399
591,1000003,4214,0.376419,
592,1000003,4588,0.411577,
...,...,...,...,...
83152118,1430580,29844228,0.028825,0.947815
83152119,1430580,30226723,0.020854,
83152120,1430580,32075671,,0.947480
83152121,1430580,33795874,,0.947473


In [46]:
inference_data = Pool(data=candidates_to_rank[features])
predictions = cb_model.predict_proba(inference_data)

candidates_to_rank["cb_score"] = predictions[:, 1]

# для каждого пользователя проставляем rank, начиная с 1 — это максимальный cb_score
candidates_to_rank = candidates_to_rank.sort_values(["user_id", "cb_score"], ascending=[True, False])
candidates_to_rank["rank"] = candidates_to_rank.groupby("user_id").cumcount() + 1

max_recommendations_per_user = 100
final_recommendations = candidates_to_rank.groupby("user_id").head(max_recommendations_per_user)

In [47]:
final_recommendations

Unnamed: 0,user_id,item_id,als_score,cnt_score,cb_score,rank
755,1000003,13538873,0.627915,,0.517430,1
699,1000003,5060378,0.770063,,0.486272,2
709,1000003,6892870,0.767938,,0.486272,3
705,1000003,6493208,0.680771,,0.484585,4
754,1000003,13526165,0.449987,,0.483984,5
...,...,...,...,...,...,...
83151942,1430580,7445,0.015793,,0.246999,96
83152011,1430580,6314763,0.016404,,0.246999,97
83152050,1430580,11710373,0.016035,,0.246999,98
83152078,1430580,17802724,0.015771,,0.246999,99


In [55]:
events_train["user_id_enc"] = events_train["user_id_enc"] + 1000000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_train["user_id_enc"] = events_train["user_id_enc"] + 1000000


In [68]:
events_train["user_id"] = events_train["user_id_enc"]
events_labels["user_id"] = events_labels["user_id_enc"]
events_test_2["user_id"] = events_test_2["user_id_enc"].copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events_train["user_id"] = events_train["user_id_enc"]


In [70]:
events_inference = pd.concat([events_train, events_labels])

cb_events_recs_for_binary_metrics_5 = process_events_recs_for_binary_metrics(
    events_inference,
    events_test_2,
    final_recommendations.rename(columns={"cb_score": "score"}), 
    top_k=5)

cb_precision_5, cb_recall_5 = compute_cls_metrics(cb_events_recs_for_binary_metrics_5)

print(f"precision: {cb_precision_5:.3f}, recall: {cb_recall_5:.3f}")

Common users: 75194
precision: 0.007, recall: 0.016


  events_recs_common["gt"] = events_recs_common["gt"].fillna(False)


# === Двухстадийный подход: построение признаков

In [None]:
items["age"] = 2018-items["publication_year"]
invalid_age_idx = items["age"] < 0
items.loc[invalid_age_idx, "age"] = np.nan
items["age"] = items["age"].astype("float")

candidates_for_train = candidates_for_train.merge(# ваш код здесь #)
candidates_to_rank = candidates_to_rank.merge(# ваш код здесь #)