In [57]:
import pandas as pd
import implicit


import numpy as np

import scipy.sparse as sp
import random
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

## Datasets

MovieLens dataset: https://www.kaggle.com/shubhammehta21/movie-lens-small-latest-dataset

Наверное, наиболее часто используемый датасет для рекомендательных систем.


In [58]:
# Folder holding the MovieLens CSV files, and the minimum rating that is kept as an interaction.
path = "movielens_small"
rating_threshold = 4.0

# Ratings are read first and immediately reduced to the rows at or above the threshold;
# the movie catalogue is what later cells use to turn movie ids into titles.
df = pd.read_csv(f"{path}/ratings.csv")
df = df.loc[df["rating"] >= rating_threshold]
movie_ds = pd.read_csv(f"{path}/movies.csv")

users, movies = df["userId"], df["movieId"]

# Every kept rating contributes a 1 at position (userId, movieId); the raw ids are used
# directly as row/column indices. Both orientations are materialised in CSR form.
user_item = sp.coo_matrix((np.ones_like(users), (users, movies)))
user_item_csr = user_item.tocsr()
user_item_t_csr = user_item.T.tocsr()

## Код для анализа результатов

In [59]:
def _title_of(movie_id):
    """Return the text rendering of the ``title`` entries of ``movie_ds`` matching ``movie_id``.

    The result comes from ``Series.to_string()``, so it keeps the row-index prefix
    (for example ``"0    Toy Story (1995)"``).
    """
    return movie_ds[movie_ds["movieId"] == movie_id]["title"].to_string()


def get_similars(item_id, model):
    """Return title strings for the items ``model.similar_items(item_id)`` yields.

    Each yielded entry is indexed with ``[0]`` to obtain the item id
    (presumably ``(item_id, score)`` pairs -- confirm against the installed ``implicit`` version).
    """
    return [_title_of(entry[0]) for entry in model.similar_items(item_id)]


def id2name(ids):
    """Map an iterable of movie ids to their title strings (see ``_title_of`` for the format)."""
    return [_title_of(movie_id) for movie_id in ids]


def get_recommendations(user_ids, model, n=10):
    """Return, for every user id, the list of item ids recommended by ``model``.

    Args:
        user_ids: Iterable of user ids to query.
        model: Object exposing ``recommend(user_id, user_items, N, filter_already_liked_items=...)``
            whose result iterates as two-element entries; the first element is kept.
        n: Number of recommendations requested per user (defaults to the previous fixed 10).

    Items the user already interacted with are deliberately NOT filtered out.
    """
    return [
        [item for item, _ in model.recommend(user_id, user_item_csr, n, filter_already_liked_items=False)]
        for user_id in user_ids
    ]


def get_user_relatives(user_ids):
    """Return, for every user id, the set of movie ids that user has in ``df``."""
    return [set(df[df["userId"] == user_id]["movieId"].to_list()) for user_id in user_ids]


In [60]:
def compute_metrics(recs, rels, k=None):
    """Compute mAP@k and NDCG@k averaged over a batch of recommendation lists.

    Args:
        recs: Sequence of ranked recommendation lists (best item first), one per user.
        rels: Sequence of collections of relevant item ids, aligned with ``recs``.
        k: Cut-off rank. When omitted, the length of the first recommendation list is used.

    Returns:
        A dict with the keys ``"mAP@{k}"`` and ``"NDCG@{k}"``. Both values are averaged
        over ``len(recs)``. A user with no relevant items (or a cut-off of 0) contributes
        0 to both metrics, and a list shorter than ``k`` is scored on the items it has.
    """
    n_users = len(recs)
    map_at_k = 0.
    ndcg_at_k = 0.
    for rec, rel in zip(recs, rels):
        if k is None:
            k = len(rec)

        # Size of the ideal ranking; it is also the AP normaliser.
        n_ideal = min(k, len(rel))
        if n_ideal == 0:
            # Nothing can be hit: both metrics are 0 for this user. Skipping also avoids
            # dividing by a zero IDCG / zero AP normaliser.
            continue

        hits = 0
        ap_at_k = 0.
        dcg_at_k = 0.
        # Slicing (instead of indexing up to k) tolerates lists shorter than the cut-off.
        for j, item in enumerate(rec[:k]):
            if item in rel:
                hits += 1
                ap_at_k += hits / (j + 1)
                # Natural log is fine here: the base cancels in the DCG / IDCG ratio.
                dcg_at_k += 1 / np.log(j + 2)
        idcg_at_k = sum(1 / np.log(j + 2) for j in range(n_ideal))

        ap_at_k /= n_ideal
        ndcg_at_k += dcg_at_k / n_users / idcg_at_k
        map_at_k += ap_at_k / n_users
    return {f"mAP@{k}": map_at_k, f"NDCG@{k}": ndcg_at_k}

In [64]:
def summarize_results(metrics, target_item, similars, rec):
    """Print a plain-text report: metric values, items similar to the target, recommendations.

    Args:
        metrics: Mapping from metric name to value; one line is printed per key.
        target_item: Label of the item the ``similars`` list was computed for.
        similars: Iterable of labels printed as a numbered list starting at 1.
        rec: Iterable of recommended-item labels, printed the same way.
    """
    def _print_numbered(entries):
        # Ranks are shown 1-based.
        for position, entry in enumerate(entries, start=1):
            print(f"{position}. {entry}")

    print("METRICS")
    for name in metrics:
        value = metrics[name]
        print(f"{name} :: {value}")
    # One call emitting two line breaks: leaves two empty lines after the metrics.
    print("\n")

    print("SIMILARS")
    print(f"TARGET: {target_item}")
    _print_numbered(similars)

    print()
    print("RECOMENDATIONS")
    _print_numbered(rec)

## Models

In [65]:

# Evaluation setup: the item and user whose results are printed, how many additional users
# are sampled for the metrics, and the seed that makes that sample repeatable.
target_item_id = 1
target_user_id = 4
n_eval_users = 256
sampling_seed = 42

model = implicit.bpr.BayesianPersonalizedRanking(factors=64, iterations=1500, learning_rate=1e-3)
# The model is fitted on the transposed (item x user) matrix.
model.fit(user_item_t_csr)

similars = get_similars(target_item_id, model)

# A dedicated seeded generator keeps the evaluated users identical across re-runs without
# touching the global ``random`` state. Sampling is with replacement from the per-rating
# user column, so users with more kept ratings are more likely to be drawn.
sample_rng = random.Random(sampling_seed)
users_to_rec = [target_user_id] + sample_rng.choices(users.to_list(), k=n_eval_users)

recommendations = get_recommendations(users_to_rec, model)
# The first entry belongs to the target user; only that one is printed by name.
rec_names = id2name(recommendations[0])
tgt_name = id2name([target_item_id])[0]
relatives = get_user_relatives(users_to_rec)
# NOTE: relevance comes from the same interactions the model was fitted on, and
# already-liked items are not filtered from the recommendations, so these numbers
# describe fit to the training data rather than generalisation.
metrics = compute_metrics(recommendations, relatives)

summarize_results(metrics, tgt_name, similars, rec_names)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1500.0), HTML(value='')))


METRICS
mAP@10 :: 0.8109536676754556
NDCG@10 :: 0.8727366462163672


SIMILARS
TARGET: 0    Toy Story (1995)
1. 0    Toy Story (1995)
2. 2355    Toy Story 2 (1999)
3. 506    Aladdin (1992)
4. 32    Babe (1995)
5. 513    Pinocchio (1940)
6. 512    Beauty and the Beast (1991)
7. 1757    Bug's Life, A (1998)
8. 511    Snow White and the Seven Dwarfs (1937)
9. 812    Aladdin and the King of Thieves (1996)
10. 322    Lion King, The (1994)

RECOMENDATIONS
1. 987    This Is Spinal Tap (1984)
2. 930    Annie Hall (1977)
3. 1702    Player, The (1992)
4. 1529    Roger & Me (1989)
5. 692    Some Like It Hot (1959)
6. 2145    American Beauty (1999)
7. 984    Heathers (1989)
8. 1960    Election (1999)
9. 1435    Terms of Endearment (1983)
10. 1667    Broadcast News (1987)
