# Подготовка

## Initialization

In [104]:
import logging

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [105]:
%matplotlib inline
%config InlineBackend.figure_format = 'png'
%config InlineBackend.figure_format = 'retina'

## Загрузка данных

In [106]:
items = pd.read_parquet("./data/transformed/items.par")
events = pd.read_parquet("./data/transformed/events.par")

In [107]:
events.head(5)

Unnamed: 0,user_id,item_id,started_at,read_at,is_read,rating,is_reviewed,user_idx
0,8842281e1d1347389f2ab93d60773d4d,22034,2015-07-12,2015-07-17,True,5,False,1229132
1,8842281e1d1347389f2ab93d60773d4d,22318578,2015-06-07,2015-08-09,True,5,True,1229132
2,8842281e1d1347389f2ab93d60773d4d,22551730,2015-06-24,2015-07-11,True,4,True,1229132
3,8842281e1d1347389f2ab93d60773d4d,22816087,2015-09-27,2015-11-04,True,5,True,1229132
5,8842281e1d1347389f2ab93d60773d4d,17910054,2015-03-04,2015-07-28,True,3,False,1229132


## Разбиение с учётом хронологии

Рекомендательные системы на практике работают с учётом хронологии. Поэтому поток событий для тренировки и валидации полезно делить на то, что уже случилось, и что ещё случится. Это позволяет проводить валидацию на тех же пользователях, на которых тренировались, но на их событиях в будущем.

In [108]:
event_cutoff_date = pd.to_datetime("2017-08-01").date()
events_train = events.query("started_at < @event_cutoff_date").copy()
events_test = events.query("started_at >= @event_cutoff_date").copy()
users_train = set(events_train["user_id"].unique())
users_test = set(events_test["user_id"].unique())
common_users = set(users_train & users_test)
print(len(users_train), len(users_test), len(common_users)) 

428220 123223 120858


# Знакомство: "холодный" старт

In [109]:
# cold users
cold_users = set(users_test - users_train)
print(len(cold_users))

2365


In [110]:
from sklearn.preprocessing import MinMaxScaler

top_pop_start_date = pd.to_datetime("2015-01-01").date()

item_popularity = events_train \
    .query("started_at >= @top_pop_start_date") \
    .groupby(["item_id"]).agg(users=("user_id", "nunique"), avg_rating=("rating", "mean")).reset_index()

# нормализация пользователей и среднего рейтинга, требуется для их приведения к одному масштабу
scaler = MinMaxScaler()
item_popularity[["users_norm", "avg_rating_norm"]] = scaler.fit_transform(
    item_popularity[["users", "avg_rating"]]
)

# вычисляем popularity_score, как скор популярности со штрафом за низкий рейтинг
item_popularity["popularity_score"] = (
    item_popularity["users_norm"] * item_popularity["avg_rating_norm"]
)

# сортируем по убыванию popularity_score
item_popularity = item_popularity.sort_values("popularity_score", ascending=False)

# выбираем первые 100 айтемов со средней оценкой avg_rating не меньше 4
top_k_pop_items = item_popularity.query("avg_rating >= 4").head(100)
top_k_pop_items.head()

Unnamed: 0,item_id,users,avg_rating,users_norm,avg_rating_norm,popularity_score
32387,18007564,20207,4.321275,0.496596,0.830319,0.412333
32623,18143977,19462,4.290669,0.478287,0.822667,0.393471
2,3,15139,4.706057,0.372042,0.926514,0.344702
30695,16096824,16770,4.301014,0.412126,0.825253,0.340108
1916,15881,13043,4.632447,0.320529,0.908112,0.291076


In [111]:
# добавляем информацию о книгах
top_k_pop_items = top_k_pop_items.merge(
    items.set_index("item_id")[["author", "title", "genre_and_votes", "publication_year"]], on="item_id")

with pd.option_context('display.max_rows', 10):
    display(top_k_pop_items[["item_id", "author", "title", "publication_year", "users", "avg_rating", "popularity_score", "genre_and_votes"]])

Unnamed: 0,item_id,author,title,publication_year,users,avg_rating,popularity_score,genre_and_votes
0,18007564,Andy Weir,The Martian,2014,20207,4.321275,0.412333,"{'Science Fiction': 11966, 'Fiction': 8430}"
1,18143977,Anthony Doerr,All the Light We Cannot See,2014,19462,4.290669,0.393471,"{'Historical-Historical Fiction': 13679, 'Fict..."
2,3,"J.K. Rowling, Mary GrandPré",Harry Potter and the Sorcerer's Stone (Harry P...,1997,15139,4.706057,0.344702,"{'Fantasy': 59818, 'Fiction': 17918, 'Young Ad..."
3,16096824,Sarah J. Maas,A Court of Thorns and Roses (A Court of Thorns...,2015,16770,4.301014,0.340108,"{'Fantasy': 14326, 'Young Adult': 4662, 'Roman..."
4,15881,"J.K. Rowling, Mary GrandPré",Harry Potter and the Chamber of Secrets (Harry...,1999,13043,4.632447,0.291076,"{'Fantasy': 50130, 'Young Adult': 15202, 'Fict..."
...,...,...,...,...,...,...,...,...
95,8490112,Laini Taylor,Daughter of Smoke & Bone (Daughter of Smoke & ...,2011,4792,4.080968,0.090694,"{'Fantasy': 11681, 'Young Adult': 7110, 'Roman..."
96,18966819,Pierce Brown,"Golden Son (Red Rising, #2)",2015,4361,4.374914,0.090409,"{'Science Fiction': 2613, 'Fantasy': 1372, 'Sc..."
97,3636,Lois Lowry,"The Giver (The Giver, #1)",2006,4667,4.098564,0.088832,"{'Young Adult': 10993, 'Fiction': 9045, 'Class..."
98,18293427,Gabrielle Zevin,The Storied Life of A.J. Fikry,2014,4674,4.092640,0.088795,"{'Fiction': 3795, 'Contemporary': 1100, 'Writi..."


In [112]:
cold_users_events_with_recs = \
    events_test[events_test["user_id"].isin(cold_users)] \
    .merge(top_k_pop_items, on="item_id", how="left")

cold_user_items_no_avg_rating_idx = cold_users_events_with_recs["avg_rating"].isnull()
cold_user_recs = cold_users_events_with_recs[~cold_user_items_no_avg_rating_idx] \
    [["user_id", "item_id", "rating", "avg_rating"]]
print(len(cold_user_recs) /  len(cold_users_events_with_recs))

0.19768403639371382


In [113]:

from sklearn.metrics import mean_absolute_error, root_mean_squared_error


rmse = root_mean_squared_error(cold_user_recs["rating"], cold_user_recs["avg_rating"])
mae = mean_absolute_error(cold_user_recs["rating"], cold_user_recs["avg_rating"])
print(round(rmse, 2), round(mae, 2)) 

0.78 0.62


In [114]:
# посчитаем покрытие холодных пользователей рекомендациями

cold_users_hit_ratio = cold_users_events_with_recs.groupby("user_id").agg(hits=("avg_rating", lambda x: (~x.isnull()).mean()))

print(f"Доля пользователей без релевантных рекомендаций: {(cold_users_hit_ratio == 0).mean().iat[0]:.2f}")
print(f"Среднее покрытие пользователей: {cold_users_hit_ratio[cold_users_hit_ratio != 0].mean().iat[0]:.2f}")

Доля пользователей без релевантных рекомендаций: 0.59
Среднее покрытие пользователей: 0.44


# Знакомство: первые персональные рекомендации

# Базовые подходы: коллаборативная фильтрация

In [115]:
# ~3-5 minutes
from surprise import Dataset, Reader
from surprise import SVD

# используем Reader из библиотеки surprise для преобразования событий (events)
# в формат, необходимый surprise
reader = Reader(rating_scale=(1, 5))
surprise_train_set = Dataset.load_from_df(events_train[['user_id', 'item_id', 'rating']], reader)
surprise_train_set = surprise_train_set.build_full_trainset()

# инициализируем модель
svd_model = SVD(n_factors=100, random_state=0)

# обучаем модель
svd_model.fit(surprise_train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1f514ed7d30>

In [116]:
surprise_test_set = list(events_test[['user_id', 'item_id', 'rating']].itertuples(index=False))

# получаем рекомендации для тестовой выборки
svd_predictions = svd_model.test(surprise_test_set)

In [117]:
from surprise import accuracy

rmse = accuracy.rmse(svd_predictions)
mae = accuracy.mae(svd_predictions)
print(rmse, mae)

RMSE: 0.8289
MAE:  0.6474
0.8288711689059135 0.647437483750257


In [118]:
from surprise import NormalPredictor, accuracy

# инициализируем состояние генератора, это необходимо для получения
# одной и той же последовательности случайных чисел, только в учебных целях
np.random.seed(0)

random_model = NormalPredictor()

random_model.fit(surprise_train_set)
random_predictions = random_model.test(surprise_test_set)
random_rmse = accuracy.rmse(random_predictions)
random_mae = accuracy.mae(random_predictions)
print(random_rmse, random_mae)

RMSE: 1.2628
MAE:  1.0018
1.2628030301013033 1.0017726877569562


In [119]:
(random_mae - mae) / mae * 100

54.72886771309964

### Факультативное задание
Задание. Удалите из events события для редких айтемов — таких, с которыми взаимодействовало менее N пользователей. Возьмите небольшое N, например 2–3 пользователя. Получите рекомендации, посчитайте метрики, оцените, как они изменились. Подумайте, с чем могут быть связаны такие изменения.

In [120]:
def get_recommendations_svd(user_id, all_items, events, model, include_seen=True, n=5):
    """
    возвращает n рекомендаций для user_id
    """
    
    # получим список идентификаторов всех книг
    all_items = set(events['item_id'])
        
    # учитываем флаг, стоит ли уже прочитанные книги включать в рекомендации
    if include_seen:
        items_to_predict = list(all_items)
    else:
        # получим список книг, которые пользователь уже прочитал ("видел")
        seen_items = set(events[events["user_id"] == user_id]['item_id'])
        
        # книги, которые пользователь ещё не читал
        # только их и будем включать в рекомендации
        items_to_predict = list(all_items - seen_items)
    
    # получаем скоры для списка книг, т. е. рекомендации
    predictions = [model.predict(user_id, item_id) for item_id in items_to_predict]
    left = [smth for smth in predictions if smth.iid == 31450846]  # 30688013
    print(left)
    
    # сортируем рекомендации по убыванию скора и берём только n первых
    recommendations = sorted(predictions, key=lambda x: x.est, reverse=True)[:n]
    
    return pd.DataFrame([(pred.iid, pred.est) for pred in recommendations], columns=["item_id", "score"])

get_recommendations_svd(1296647, items, events_test, svd_model)

[Prediction(uid=1296647, iid=31450846, r_ui=None, est=4.752795975597794, details={'was_impossible': False})]


Unnamed: 0,item_id,score
0,11221285,4.914296
1,22037424,4.908423
2,33353628,4.872179
3,29844341,4.850003
4,17332218,4.83901


## Дополнительная проверка качества рекомендаций

In [121]:
# выберем произвольного пользователя из тренировочной выборки ("прошлого")
user_id = events_train['user_id'].sample().iat[0]

print(f"user_id: {user_id}")

print("История (последние события, recent)")
user_history = (
    events_train
    .query("user_id == @user_id")
    .merge(items.set_index("item_id")[["author", "title", "genre_and_votes"]], on="item_id")
)
user_history_to_print = user_history[["author", "title", "started_at", "read_at", "rating", "genre_and_votes"]].tail(10)
display(user_history_to_print)

print("Рекомендации")
user_recommendations = get_recommendations_svd(user_id, items, events_train, svd_model)
user_recommendations = user_recommendations.merge(items[["item_id", "author", "title", "genre_and_votes"]], on="item_id")
display(user_recommendations)

user_id: 648870e2d02a8adcfbf23f78045204dd
История (последние события, recent)


Unnamed: 0,author,title,started_at,read_at,rating,genre_and_votes
68,Veronica Roth,"Divergent (Divergent, #1)",2014-06-02,2014-06-04,4,"{'Young Adult': 20260, 'Science Fiction-Dystop..."
69,"Gillian Flynn, В. Русанов",Gone Girl,2014-05-27,2014-05-29,5,"{'Fiction': 11773, 'Mystery': 9965, 'Thriller'..."
70,Kathy Reichs,"Death du Jour (Temperance Brennan, #2)",2014-05-24,2014-05-27,4,"{'Mystery': 1206, 'Mystery-Crime': 579, 'Ficti..."
71,Chelsea Cain,"Heartsick (Archie Sheridan & Gretchen Lowell, #1)",2014-05-22,2014-05-22,5,"{'Mystery': 832, 'Thriller': 653, 'Fiction': 4..."
72,"Jussi Adler-Olsen, Lisa Hartford","The Keeper of Lost Causes (Department Q, #1)",2014-05-30,2014-06-02,3,"{'Mystery': 1225, 'Mystery-Crime': 627, 'Ficti..."
73,Gillian Flynn,Dark Places,2014-05-17,2014-05-22,4,"{'Mystery': 4534, 'Fiction': 4055, 'Thriller':..."
74,Audrey Niffenegger,Her Fearful Symmetry,2014-05-05,2014-05-08,2,"{'Fiction': 1984, 'Fantasy': 674, 'Fantasy-Par..."
75,Kathy Reichs,"Déjà Dead (Temperance Brennan, #1)",2014-05-13,2014-05-17,4,"{'Mystery': 2141, 'Fiction': 904, 'Mystery-Cri..."
76,Carolyn Parkhurst,The Dogs of Babel,2014-05-09,2014-05-10,5,"{'Fiction': 522, 'Mystery': 102, 'Animals': 77..."
77,George R.R. Martin,"A Dance with Dragons (A Song of Ice and Fire, #5)",2014-05-04,2014-05-04,5,"{'Fantasy': 22247, 'Fiction': 4512, 'Fantasy-E..."


Рекомендации
[Prediction(uid='648870e2d02a8adcfbf23f78045204dd', iid=31450846, r_ui=None, est=5, details={'was_impossible': False})]


Unnamed: 0,item_id,score,author,title,genre_and_votes
0,2199,5,Doris Kearns Goodwin,Team of Rivals: The Political Genius of Abraha...,"{'History': 4174, 'Nonfiction': 2127, 'Biograp..."
1,16255632,5,"David Gaider, Ben Gelinas, Mike Laidlaw, Dave ...",Dragon Age: The World of Thedas Volume 1,"{'Fantasy': 134, 'Games-Video Games': 28, 'Art..."
2,2363958,5,João Guimarães Rosa,Grande Sertão: Veredas,"{'Fiction': 85, 'Classics': 69, 'Cultural-Braz..."
3,22552026,5,Jason Reynolds,Long Way Down,"{'Young Adult': 1871, 'Poetry': 1737, 'Contemp..."
4,29237211,5,"Brian K. Vaughan, Fiona Staples","Saga, Vol. 7 (Saga, #7)","{'Sequential Art-Graphic Novels': 2539, 'Seque..."


## Коллаборативная фильтрация: ALS

### Fit

In [122]:
from data_encoder import DataEncoder
data_encoder = DataEncoder(items, events, events_train, events_test)
data_encoder.fit()
assert data_encoder.train["item_id_enc"].max() == 43304, "Must match theory"

matrix_size = (
            (data_encoder.train["user_id_enc"].max() + 1)
            * (data_encoder.train["item_id_enc"].max() + 1)
            / 1024**3
        )  # in GB
print(f"Expected matrix size is {matrix_size:.2f} GB")

Encoding data...
User encoding took 3.62 seconds
Item encoding took 1.86 seconds
Expected matrix size is 17.37 GB


In [123]:
from als_model import ALSPredictor
predictor = ALSPredictor(data_encoder)
predictor.fit()
als_recommendations = predictor.get_all_recommendations()
matrix_size = predictor.get_train_size()
print(f"Real matrix size {matrix_size:.2f} GB")

User-item matrix construction took 0.48 seconds
Loading ALS model from d:\Work\ya.practicum\ml\sprint-4\mle-recsys-start\data\models\als_model.pkl
Loading ALS recommendations from d:\Work\ya.practicum\ml\sprint-4\mle-recsys-start\data\als_recommendations.parquet
Real matrix size 0.26 GB


In [124]:
# TODO: Используя get_recommendations_als, напишите код, который позволит для случайного пользователя просмотреть рекомендации в удобном формате: 
# история с именами авторов и названием книг,
# рекомендации с именами авторов и названием книг, seen-признаком (взаимодействовал ли уже пользователь с рекомендованной книгой).
# Проанализируйте, релевантны ли рекомендации имеющейся истории.
user_id = events_test['user_id'].sample().iat[0]
print(f"user_id: {user_id}")
user_history = (
    events_test
    .query("user_id == @user_id")
    .merge(items.set_index("item_id")[["author", "title", "genre_and_votes"]], on="item_id")
    .sort_values("started_at", ascending=False)
)
user_history_to_print = user_history[["author", "title", "started_at", "read_at", "rating", "genre_and_votes"]].tail(10)
display(user_history_to_print)
user_recommendations = predictor.recommend(user_id)
user_recommendations = user_recommendations.merge(items[["item_id", "author", "title", "genre_and_votes"]], on="item_id")
user_recommendations["seen"] = user_recommendations["item_id"].isin(user_history["item_id"])
display(user_recommendations[["author", "title", "seen", "score", "genre_and_votes"]])

user_id: 2b75d48b744b26894bc86e70b9ebc565


Unnamed: 0,author,title,started_at,read_at,rating,genre_and_votes
0,Rupi Kaur,Milk and Honey,2017-09-30,2017-10-09,5,"{'Poetry': 14490, 'Feminism': 1105, 'Nonfictio..."
2,Sarah J. Maas,"Queen of Shadows (Throne of Glass, #4)",2017-09-05,2017-09-11,5,"{'Fantasy': 8435, 'Young Adult': 3170, 'Romanc..."
3,Sarah J. Maas,"Heir of Fire (Throne of Glass, #3)",2017-08-24,2017-09-05,5,"{'Fantasy': 9677, 'Young Adult': 3800, 'Romanc..."
1,Alex Flinn,A Kiss in Time,2017-08-16,2017-08-21,2,"{'Fantasy': 657, 'Young Adult': 637, 'Romance'..."
4,Sarah J. Maas,A Court of Wings and Ruin (A Court of Thorns a...,2017-08-02,2017-08-16,5,"{'Fantasy': 7960, 'Romance': 2365, 'Young Adul..."


Unnamed: 0,author,title,seen,score,genre_and_votes
0,Renée Ahdieh,The Wrath and the Dawn (The Wrath and the Dawn...,False,0.18305,"{'Fantasy': 5942, 'Young Adult': 3448, 'Romanc..."
1,Leigh Bardugo,"Six of Crows (Six of Crows, #1)",False,0.173211,"{'Fantasy': 15409, 'Young Adult': 6520}"
2,Renée Ahdieh,"The Rose & the Dagger (The Wrath & the Dawn, #2)",False,0.15109,"{'Fantasy': 3082, 'Young Adult': 1686, 'Romanc..."
3,Sarah J. Maas,A Court of Mist and Fury (A Court of Thorns an...,False,0.15077,"{'Fantasy': 10186, 'Romance': 3346, 'Young Adu..."
4,Sarah J. Maas,A Court of Thorns and Roses (A Court of Thorns...,False,0.145596,"{'Fantasy': 14326, 'Young Adult': 4662, 'Roman..."


### Rating

In [125]:
rating = predictor.compute_rating()
display(rating)
assert abs(rating[0] - 0.9759) < 0.0001, "Must match theory"

  .apply(lambda x: self._compute_ndcg(x["rating_test"], x["score"], k=5))


(0.9759467097921087, 39.06332421706986)

### I2I (items to items)

In [126]:
item_id = items['item_id'].sample().iat[0]
display(items[items["item_id"] == item_id][["author", "title", "genre_and_votes"]])
similar_items = predictor.recommend_by_item(item_id)
similar_items = similar_items.merge(items[["item_id", "author", "title", "genre_and_votes"]], on="item_id")
display(similar_items[["author", "title", "score", "genre_and_votes"]])

Unnamed: 0,author,title,genre_and_votes
1485095,"Aristotle, Joe Sachs",Metaphysics,"{'Philosophy': 1165, 'Classics': 157, 'Nonfict..."


Unnamed: 0,author,title,score,genre_and_votes
0,"Aristotle, Joe Sachs",Metaphysics,1.0,"{'Philosophy': 1165, 'Classics': 157, 'Nonfict..."
1,"Arthur Schopenhauer, R.J. Hollingdale",Essays and Aphorisms,0.972341,"{'Philosophy': 942, 'Nonfiction': 117, 'Writin..."
2,"Baruch Spinoza, Edwin M. Curley, Stuart Hampsh...",Ethics,0.971739,"{'Philosophy': 1833, 'Nonfiction': 301, 'Class..."
3,"Friedrich Nietzsche, Michael Tanner, Shaun Whi...",The Birth of Tragedy,0.959746,"{'Philosophy': 1393, 'Nonfiction': 273, 'Class..."
4,"Georg Wilhelm Friedrich Hegel, A.V. Miller, Jo...",Phenomenology of Spirit,0.958013,"{'Philosophy': 1494, 'Nonfiction': 158, 'Class..."
5,"Ludwig Wittgenstein, David Pears, Brian McGuin...",Tractatus Logico-Philosophicus,0.957844,"{'Philosophy': 1868, 'Nonfiction': 253, 'Philo..."
6,"Franz Kafka, Mark Harman",The Castle,0.95621,"{'Fiction': 1237, 'Classics': 972, 'Literature..."
7,"Søren Kierkegaard, Alastair Hannay",Fear and Trembling,0.954933,"{'Philosophy': 2357, 'Nonfiction': 380, 'Relig..."
8,"Joris-Karl Huysmans, Robert Baldick, Patrick M...",Against Nature,0.951848,"{'Fiction': 388, 'Classics': 275, 'Cultural-Fr..."
9,"Immanuel Kant, Paul Guyer, Allen W. Wood",Critique of Pure Reason,0.951631,"{'Philosophy': 3229, 'Nonfiction': 473, 'Class..."


# Базовые подходы: контентные рекомендации

In [127]:
items["genre_and_votes"] = items["genre_and_votes"].apply(eval) 

In [128]:
from content_recommendations import get_genres
genres = get_genres(items)
popular_5th = genres.sort_values("votes", ascending=False).head(5)
display(popular_5th)
assert popular_5th.iloc[[4]]["name"].iat[0] == "Romance", "Must match theory"

Unnamed: 0_level_0,name,votes
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1
25,Fantasy,6850060
1,Fiction,6406256
38,Classics,3414934
18,Young Adult,3296951
34,Romance,2422614


In [129]:
from content_recommendations import get_item2genre_matrix
items = items.sort_values(by="item_id_enc")
all_items_genres_csr = get_item2genre_matrix(genres, items)

In [130]:
from content_recommendations import get_genres, get_item2genre_matrix
# получим матрицу жанров для конкретного пользователя
user_id = 1000010
user_events = events_train.query("user_idx == @user_id")[["item_id", "rating"]]
user_items = items[items["item_id"].isin(user_events["item_id"])]
user_items_genres_csr = get_item2genre_matrix(genres, user_items)
display(user_items_genres_csr)

# Сколько получилось существующих элементов в user_items_genres_csr для выбранного пользователя?
assert user_items_genres_csr.nnz == 149, "Must match theory"

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 149 stored elements and shape (22, 815)>

На практике часто пользователь явно указывает предпочтения в своём профиле. У нас таких данных нет, поэтому предпочтения пользователя по жанрам вычислим автоматически на основе его истории поведения. 

In [131]:
# вычислим склонность пользователя к жанрам как среднее взвешенное значение популяции на его оценки книг.

# преобразуем пользовательские оценки из списка в вектор-столбец
user_ratings = user_events["rating"].to_numpy() / 5
user_ratings = np.expand_dims(user_ratings, axis=1)

user_items_genres_weighted = user_items_genres_csr.multiply(user_ratings)

user_genres_scores = np.asarray(user_items_genres_weighted.mean(axis=0)) 

# выведем список жанров, которые предпочитает пользователь
user_genres = genres.copy()
user_genres["score"] = np.ravel(user_genres_scores)
user_genres_filtered = user_genres[user_genres["score"] > 0].sort_values(by=["score"], ascending=False)

user_genres_filtered.head(5) 

Unnamed: 0_level_0,name,votes,score
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Fiction,6406256,0.185241
38,Classics,3414934,0.103879
25,Fantasy,6850060,0.072447
5,Nonfiction,1737406,0.050865
24,Science Fiction,1218917,0.04092


In [132]:
from sklearn.metrics.pairwise import cosine_similarity

# вычисляем сходство между вектором пользователя и векторами по книгам
similarity_scores = cosine_similarity(all_items_genres_csr, user_genres_scores)

# преобразуем в одномерный массив
similarity_scores = similarity_scores.flatten()

# получаем индексы top-k (по убыванию значений), по сути, индексы книг (encoded)
k = 5
top_k_indices = np.argsort(similarity_scores)[-k:][::-1]
selected_items = items[items["item_id_enc"].isin(top_k_indices)]

with pd.option_context("max_colwidth", 100):
   display(selected_items[["author", "title", "genre_and_votes"]]) 

Unnamed: 0,author,title,genre_and_votes
80465,G.K. Chesterton,The Napoleon of Notting Hill,"{'Fiction': 166, 'Classics': 88, 'Fantasy': 44, 'Humor': 22, 'Literature': 20}"
1168335,Ray Bradbury,"Dandelion Wine (Green Town, #1)","{'Fiction': 1438, 'Classics': 914, 'Science Fiction': 529, 'Fantasy': 456, 'Young Adult': 212}"
393210,"G.K. Chesterton, Jonathan Lethem",The Man Who Was Thursday: A Nightmare,"{'Fiction': 1257, 'Classics': 929, 'Mystery': 469, 'Fantasy': 293, 'Philosophy': 156, 'Literatur..."
2244467,Samuel Butler,"Erewhon (Erewhon , #1)","{'Fiction': 162, 'Classics': 139, 'Science Fiction': 60, 'Fantasy': 55}"
39408,"Paulo Coelho, Alan R. Clarke, James Noel Smith",The Alchemist,"{'Fiction': 14023, 'Classics': 5787, 'Fantasy': 3289, 'Philosophy': 2759}"


### TODO: Факультативное задание
- Получите по алгоритму выше рекомендации для нескольких пользователей, просмотрите их на экране. Подумайте, насколько релевантны и интересны полученные рекомендации пользователям.
- Попробуйте использовать другую меру сходства для получения рекомендаций, например, евклидово расстояние. Проанализируйте, отличаются ли рекомендации от предыдущих. Подумайте почему.
- Задайте собственные предпочтения для наиболее популярных жанров. Посмотрите рекомендации для себя. Прочитали ли бы вы рекомендованные книги?

# Базовые подходы: валидация

## Рассчет ALS для всех пользователей

Alternating Least Squares — это метод, который используется для разложения матриц в рекомендациях и коллаборативной фильтрации

In [133]:
from als_model import ALSPredictor
predictor = ALSPredictor(data_encoder)
als_recommendations = predictor.get_all_recommendations()

Loading ALS recommendations from d:\Work\ya.practicum\ml\sprint-4\mle-recsys-start\data\als_recommendations.parquet


## Рассчет метрик

In [134]:
from validation import compute_cls_metrics, process_events_recs_for_binary_metrics

_SIZE = 5

events_recs_for_binary_metrics = process_events_recs_for_binary_metrics(
  events_train,
    events_test, 
    als_recommendations, 
    top_k=_SIZE)

metrics = compute_cls_metrics(events_recs_for_binary_metrics)
assert abs(metrics[1] - 0.014) < 0.001, "Must match theory"
display(metrics)

Common users: 123223


  events_recs_common["gt"] = events_recs_common["gt"].fillna(False)


(0.007581376853347184, 0.014121568795222568)

In [135]:
from validation import compute_cls_metrics, process_events_recs_for_binary_metrics

_SIZE = 10

events_recs_for_binary_metrics = process_events_recs_for_binary_metrics(
  events_train,
    events_test, 
    als_recommendations, 
    top_k=_SIZE)

compute_cls_metrics(events_recs_for_binary_metrics)

Common users: 123223


  events_recs_common["gt"] = events_recs_common["gt"].fillna(False)


(0.008732947582837622, 0.03130238527136974)

## Специфичные метрики

### Покрытие (coverage)

In [136]:
# Compute coverage
users_w_recommendations = als_recommendations["user_id"].nunique()
users_coverage = users_w_recommendations / events["user_id"].nunique()
display(f"Users coverage: {users_coverage:.2%}")

items_in_recommendations = als_recommendations["item_id"].nunique()
items_coverage = items_in_recommendations / items["item_id"].nunique()
assert abs(items_coverage - 0.09) < 0.01, "Must match theory"
# Критично, чтобы покрытие не было чересчур малым — речь о долях процента. 
display(f"Items coverage: {items_coverage:.2%}")

'Users coverage: 100.00%'

'Items coverage: 9.36%'

### Новизна (novelty)

Это значение можно интерпретировать так: для топ-5 рекомендаций в среднем пользователю рекомендуется 61% книг, которые он ранее не читал.

In [137]:
# разметим каждую рекомендацию признаком read
events_train["read"] = True
als_recommendations_merged = als_recommendations.copy().merge(events_train, on=["user_id", "item_id"], how="left")
als_recommendations_merged["read"] = als_recommendations_merged["read"].fillna(False).astype("bool")

# проставим ранги
als_recommendations_merged = als_recommendations_merged.sort_values(by=["user_id", "score"], ascending=[True, False])
als_recommendations_merged["rank"] = als_recommendations_merged.groupby("user_id").cumcount() + 1

# посчитаем novelty по пользователям
novelty_5 = (1 - als_recommendations_merged.query("rank <= 5").groupby("user_id")["read"].mean()).mean()
display(f"Novelty@5: {novelty_5:.2}")
assert abs(novelty_5 - 0.61) < 0.01, "Must match theory"

  als_recommendations_merged["read"] = als_recommendations_merged["read"].fillna(False).astype("bool")


'Novelty@5: 0.61'

# Двухстадийный подход: метрики

## Метрики

In [138]:
split_date_for_labels = pd.to_datetime("2017-09-15").date()

split_date_for_labels_idx = events_test["started_at"] < split_date_for_labels
events_labels = events_test[split_date_for_labels_idx].copy()
events_test_2 = events_test[~split_date_for_labels_idx].copy()
events_labels_unique_users = events_labels["user_id"].nunique()
assert events_labels_unique_users == 99849, "Must match theory"
display(f"Unique users in labels: {events_labels_unique_users}")

'Unique users in labels: 99849'

In [139]:
als_recommendations = pd.read_parquet("data/candidates/training/als_recommendations.parquet")
content_recommendations = pd.read_parquet("data/candidates/training/content_recommendations.parquet")

candidates = pd.merge(
    als_recommendations[["user_id", "item_id", "score"]].rename(columns={"score": "als_score"}),
    content_recommendations[["user_id", "item_id", "score"]].rename(columns={"score": "cnt_score"}),
    on=["user_id", "item_id"],
    how="outer")
assert candidates.shape[0] == 82993094, "Must match theory"
candidates.shape

(82993094, 4)

In [140]:
# добавляем таргет к кандидатам со значением:
# — 1 для тех item_id, которые пользователь прочитал
# — 0, для всех остальных 

events_labels["target"] = 1
display(candidates.head(2))
display(events_labels.head(2))
candidates_w_target = candidates.merge(events_labels[["user_idx", "item_id", "target"]].rename(columns={"user_idx": "user_id"}), on=["user_id", "item_id"], how="left")
candidates_w_target["target"] = candidates_w_target["target"].fillna(0).astype("int")

# в кандидатах оставляем только тех пользователей, у которых есть хотя бы один положительный таргет
candidates_to_sample = candidates_w_target.groupby("user_id").filter(lambda x: x["target"].sum() > 0)

# для каждого пользователя оставляем только 4 негативных примера
negatives_per_user = 4
candidates_for_train = pd.concat([
    candidates_to_sample.query("target == 1"),
    candidates_to_sample.query("target == 0") \
        .groupby("user_id") \
        .apply(lambda x: x.sample(negatives_per_user, random_state=0))
    ]) 
assert candidates_for_train.shape[0] == 213708, "Must match theory"

Unnamed: 0,user_id,item_id,als_score,cnt_score
0,1000000,1,0.756692,0.933434
1,1000000,2,0.792929,0.925806


Unnamed: 0,user_id,item_id,started_at,read_at,is_read,rating,is_reviewed,user_idx,user_id_enc,item_id_enc,gt,target
84,7504b2aee1ecb5b2872d3da381c6c91e,18467802,2017-09-01,2017-09-22,True,1,False,1196635,196635,36588,True,1
257,704eb93a316aff687a93d5215882eb21,10799,2017-08-06,2017-10-14,True,3,False,1188739,188739,1262,True,1


  .apply(lambda x: x.sample(negatives_per_user, random_state=0))


## Модель

In [141]:
from catboost import CatBoostClassifier, Pool

# задаём имена колонок признаков и таргета
features = ['als_score', 'cnt_score']
target = 'target'

# Create the Pool object
train_data = Pool(
    data=candidates_for_train[features], 
    label=candidates_for_train[target])

# инициализируем модель CatBoostClassifier
cb_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    verbose=100,
    random_seed=0
)

# тренируем модель
cb_model.fit(train_data) 

0:	learn: 0.6490476	total: 37.6ms	remaining: 37.6s
100:	learn: 0.5023340	total: 2.04s	remaining: 18.2s
200:	learn: 0.5015518	total: 4.07s	remaining: 16.2s
300:	learn: 0.5008895	total: 6.18s	remaining: 14.3s
400:	learn: 0.5003160	total: 8.33s	remaining: 12.4s
500:	learn: 0.4997954	total: 10.5s	remaining: 10.5s
600:	learn: 0.4993583	total: 12.8s	remaining: 8.52s
700:	learn: 0.4989667	total: 15.3s	remaining: 6.53s
800:	learn: 0.4985483	total: 17.8s	remaining: 4.43s
900:	learn: 0.4982135	total: 20.3s	remaining: 2.23s
999:	learn: 0.4978769	total: 22.7s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1f514ed7ca0>

In [142]:
# загружаем рекомендации от двух базовых генераторов
als_recommendations_2 = pd.read_parquet("./data/candidates/inference/als_recommendations.parquet")
content_recommendations_2 = pd.read_parquet("./data/candidates/inference/content_recommendations.parquet")

candidates_to_rank = pd.merge(als_recommendations_2[["user_id", "item_id", "score"]].rename(columns={"score": "als_score"}),
    content_recommendations_2[["user_id", "item_id", "score"]].rename(columns={"score": "cnt_score"}),
    on=["user_id", "item_id"],
    how="outer")

# оставляем только тех пользователей, что есть в тестовой выборке, для экономии ресурсов
candidates_to_rank = candidates_to_rank[candidates_to_rank["user_id"].isin(events_test_2["user_idx"].drop_duplicates())]
size = len(candidates_to_rank)
assert size == 14517152, "Must match theory"
print(size)

14517152


In [143]:
inference_data = Pool(data=candidates_to_rank[features])
predictions = cb_model.predict_proba(inference_data)

candidates_to_rank["cb_score"] = predictions[:, 1]

# для каждого пользователя проставляем rank, начиная с 1 — это максимальный cb_score
candidates_to_rank = candidates_to_rank.sort_values(["user_id", "cb_score"], ascending=[True, False])
candidates_to_rank["rank"] = candidates_to_rank.groupby("user_id").cumcount() + 1

max_recommendations_per_user = 100
final_recommendations = candidates_to_rank.query("rank <= @max_recommendations_per_user")
assert final_recommendations.shape[0] == 7519400, "Must match theory"
final_recommendations.shape

(7519400, 6)

## Валидация

In [144]:
from validation import compute_cls_metrics, process_events_recs_for_binary_metrics

events_inference = pd.concat([events_train, events_labels])

cb_events_recs_for_binary_metrics_5 = process_events_recs_for_binary_metrics(
    events_inference,
    events_test_2.drop(columns=["user_id"]).rename(columns={"user_idx": "user_id"}),
    final_recommendations.rename(columns={"cb_score": "score"}), 
    top_k=5)

cb_precision_5, cb_recall_5 = compute_cls_metrics(cb_events_recs_for_binary_metrics_5)

assert abs(cb_recall_5 - 0.016) < 0.001, "Must match theory"

print(f"precision: {cb_precision_5:.3f}, recall: {cb_recall_5:.3f}") 

Common users: 75194
precision: 0.007, recall: 0.016


  events_recs_common["gt"] = events_recs_common["gt"].fillna(False)


# Двухстадийный подход: признаки

## Признаки объектов

In [145]:
items["age"] = 2018 - items["publication_year"]
invalid_age_idx = items["age"] < 0
items.loc[invalid_age_idx, "age"] = np.nan
items["age"] = items["age"].astype("float")

candidates_for_train = candidates_for_train.merge(items[["item_id", "age", "average_rating"]], on="item_id", how="left")
candidates_to_rank = candidates_to_rank.merge(items[["item_id", "age", "average_rating"]], on="item_id", how="left")
median_age = candidates_to_rank["age"].median()
assert abs(median_age - 7.0) < 0.01, "Must match theory"
display("Median age of items in candidates_to_rank:", median_age)


'Median age of items in candidates_to_rank:'

7.0

## Признаки пользователей

In [146]:
def get_user_features(events):
    """ считает пользовательские признаки """
    
    user_features = events.groupby("user_id").agg(
        reading_years=("started_at", lambda x: (x.max()-x.min()).days/365.25),
        books_read=("rating", "count"),
        rating_avg=("rating", "mean"),
        rating_std=("rating", "std"))
    
    user_features["books_per_year"] = user_features["books_read"] / user_features["reading_years"]
    
    return user_features

user_features_for_train = get_user_features(events_train[["user_idx", "rating", "started_at"]].rename(columns={"user_idx": "user_id"}))
candidates_for_train = candidates_for_train.merge(user_features_for_train, on="user_id", how="left")
  
# оставим только тех пользователей, что есть в тесте, для экономии ресурсов
events_inference = pd.concat([events_train, events_labels])
events_inference = events_inference[events_inference["user_id"].isin(events_test["user_id"].drop_duplicates())]

user_features_for_ranking = get_user_features(events_inference[["user_idx", "rating", "started_at"]].rename(columns={"user_idx": "user_id"}))
candidates_to_rank = candidates_to_rank.merge(user_features_for_ranking, on="user_id", how="left")
books_read_median = candidates_for_train["books_read"].median()
assert abs(books_read_median - 32.0) < 0.01, "Must match theory"
display("Median books read by users in candidates_for_train:", books_read_median)

'Median books read by users in candidates_for_train:'

32.0

## Парные признаки

In [147]:
# определяем индексы топ-10 жанров и всех остальных
genres_top_k = 10
genres_top_idx = genres.sort_values("votes", ascending=False).head(genres_top_k).index
genres_others_idx = list(set(genres.index) - set(genres_top_idx))

genres_top_columns = [f"genre_{id}" for id in genres_top_idx]
genres_others_column = "genre_others"
genre_columns = [*genres_top_columns, genres_others_column]

# составляем таблицу принадлежности книг к жанрам
item_genres = (
    pd.concat([
        # топ жанров
        pd.DataFrame(all_items_genres_csr[:, genres_top_idx].toarray(), columns=genres_top_columns),
        # все остальные жанры
        pd.DataFrame(all_items_genres_csr[:, genres_others_idx].sum(axis=1), columns=[genres_others_column])
        ],
        axis=1)
    .reset_index()
    .rename(columns={"index": "item_id_enc"})
)

# объединяем информацию принадлежности книг к жанрам с основной информацией о книгах
items = items.merge(item_genres, on="item_id_enc", how="left")

def get_user_genres(events, items, item_genre_columns):
    user_genres = (
        events
        .merge(items[["item_id"] + item_genre_columns], on="item_id", how="left")
        .groupby("user_id")[item_genre_columns].mean()
    )
    return user_genres
    
user_genres_for_train = get_user_genres(events_train.drop(columns=["user_id"]).rename(columns={"user_idx": "user_id"}), items, genre_columns)
candidates_for_train = candidates_for_train.merge(user_genres_for_train, on="user_id", how="left")

user_genres_for_ranking = get_user_genres(events_inference.drop(columns=["user_id"]).rename(columns={"user_idx": "user_id"}), items, genre_columns)
candidates_to_rank = candidates_to_rank.merge(user_genres_for_ranking, on="user_id", how="left")


In [148]:
romance_ix = genres.query("name == 'Romance'").index[0]
romance_column = f"genre_{romance_ix}"
genre_median = candidates_for_train[romance_column].median()
display(f"Median Romance {romance_column} in candidates_for_train: {genre_median:.2f}")
eps = genre_median / 100 * 5 # 5% of value
assert abs(genre_median - 0.04) < eps, "Must match theory"

'Median Romance genre_34 in candidates_for_train: 0.04'

## Обучение и получение рекомендаций

In [149]:
from catboost import CatBoostClassifier, Pool

# задаём имена колонок признаков и таргета
features = ['als_score', 'cnt_score', 'age', 'average_rating', 'reading_years', 'books_read', 'rating_avg', 'rating_std', 'books_per_year'] + genre_columns
target = 'target'

# создаём Pool
train_data = Pool(
    data=candidates_for_train[features], 
    label=candidates_for_train[target])

# инициализируем модель CatBoostClassifier
cb_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    verbose=100,
    random_seed=0,
)

# тренируем модель
cb_model.fit(train_data) 

0:	learn: 0.6415986	total: 41.5ms	remaining: 41.4s
100:	learn: 0.4528693	total: 3.1s	remaining: 27.6s
200:	learn: 0.4435405	total: 5.77s	remaining: 22.9s
300:	learn: 0.4369448	total: 8.55s	remaining: 19.9s
400:	learn: 0.4316360	total: 11.3s	remaining: 16.9s
500:	learn: 0.4270528	total: 14.1s	remaining: 14s
600:	learn: 0.4230633	total: 17.4s	remaining: 11.5s
700:	learn: 0.4192671	total: 20.5s	remaining: 8.75s
800:	learn: 0.4157973	total: 23.7s	remaining: 5.89s
900:	learn: 0.4125411	total: 26.8s	remaining: 2.95s
999:	learn: 0.4093389	total: 30s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1f632cba380>

In [150]:
inference_data = Pool(data=candidates_to_rank[features])
predictions = cb_model.predict_proba(inference_data)

candidates_to_rank["cb_score"] = predictions[:, 1]

# для каждого пользователя проставим rank, начиная с 1 — это максимальный cb_score
candidates_to_rank = candidates_to_rank.sort_values(["user_id", "cb_score"], ascending=[True, False])
candidates_to_rank["rank"] = candidates_to_rank.groupby("user_id").cumcount() + 1

max_recommendations_per_user = 100
final_recommendations = candidates_to_rank.query("rank <= @max_recommendations_per_user")

In [152]:
import os
if not os.path.exists("data/final_recommendations_feat.parquet"):
    final_recommendations.to_parquet("data/final_recommendations_feat.parquet")

total_users = final_recommendations["user_id"].nunique()
display(f"Total users in final recommendations: {total_users}")
assert total_users == 75194, "Must match theory"

'Total users in final recommendations: 75194'

## Валидация

In [None]:
from validation import compute_cls_metrics, process_events_recs_for_binary_metrics


def main():
    _test = events_test_2.drop(columns=["user_id"]).rename(columns={"user_idx": "user_id"})

    display(events_train.head(2))

    # для экономии ресурсов оставим события только тех пользователей, 
    # для которых следует оценить рекомендации
    events_inference = pd.concat([
        events_train.drop(columns=["user_id"]).rename(columns={"user_idx": "user_id"}),
        events_labels.drop(columns=["user_id"]).rename(columns={"user_idx": "user_id"})
        ])
    events_inference = events_inference[events_inference["user_id"].isin(_test["user_id"].drop_duplicates())]

    display(events_inference.head(2))

    _SIZE = 5

    cb_events_recs_for_binary_metrics_5 = process_events_recs_for_binary_metrics(
        events_inference,
        _test,
        final_recommendations.rename(columns={"cb_score": "score"}), 
        top_k=_SIZE)

    cb_precision_5, cb_recall_5 = compute_cls_metrics(cb_events_recs_for_binary_metrics_5)

    print(f"precision: {cb_precision_5:.3f}, recall: {cb_recall_5:.3f}")
    assert abs(cb_recall_5 - 0.035) < 0.001, "Must match theory" # Actually it must be 0.030

main()

Unnamed: 0,user_id,item_id,started_at,read_at,is_read,rating,is_reviewed,user_idx,user_id_enc,item_id_enc,read
0,8842281e1d1347389f2ab93d60773d4d,22034,2015-07-12,2015-07-17,True,5,False,1229132,229132,2460,True
1,8842281e1d1347389f2ab93d60773d4d,22318578,2015-06-07,2015-08-09,True,5,True,1229132,229132,38691,True


Unnamed: 0,item_id,started_at,read_at,is_read,rating,is_reviewed,user_id,user_id_enc,item_id_enc,read,gt,target
278,3227063,2015-12-27,2016-01-10,True,4,True,1001879,1879,21498,True,,
279,6483360,2016-02-03,2016-08-18,True,4,False,1001879,1879,24085,True,,


Common users: 75194


  events_recs_common["gt"] = events_recs_common["gt"].fillna(False)


precision: 0.013, recall: 0.035


In [159]:
feature_importance = pd.DataFrame(cb_model.get_feature_importance(),  index=features, columns=["fi"])
feature_importance = feature_importance.sort_values("fi", ascending=False)

print(feature_importance)
assert feature_importance.iloc[0].name == "als_score", "Must match theory"

                       fi
als_score       27.860713
age             22.671785
average_rating  16.662051
books_read       3.403337
cnt_score        3.238405
reading_years    2.473497
genre_1          2.421712
genre_18         2.409989
genre_others     2.252007
genre_25         2.219547
genre_34         1.951275
genre_38         1.876471
books_per_year   1.834685
genre_33         1.417192
rating_avg       1.355082
genre_20         1.354999
genre_24         1.346100
genre_16         1.190132
genre_5          1.110691
rating_std       0.950330
