In [1]:
%%capture
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.preprocessing import LabelEncoder
from recsys_pipeliner.recommendations.transformer import (
    SimilarityTransformer,
    UserItemMatrixTransformer,
)
from recsys_pipeliner.recommenders import (
    ItemBasedCFRecommender,
)

In [3]:
# Load the toy ratings fixture; ids are kept as strings and ratings are parsed as float64.
data_types = dict(user_id=str, item_id=str, rating=np.float64)
user_item_ratings = pd.read_csv(
    "../../tests/test_data/user_item_ratings_toy.csv",
    dtype=data_types,
)
user_item_ratings.head(3)

Unnamed: 0,user_id,item_id,rating
0,U00001,I00024,0.8
1,U00001,I00013,0.6
2,U00001,I00005,1.0


In [4]:
# Encode both the user ids and the item ids as integer codes (one fitted
# LabelEncoder per id column). The encoders' `classes_` keep the original
# string ids, so the codes can be mapped back later.
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

encoded_items = item_encoder.fit_transform(user_item_ratings["item_id"])
encoded_users = user_encoder.fit_transform(user_item_ratings["user_id"])

# NOTE: this overwrites the string id columns in place, so the cell should be
# run exactly once per data load.
user_item_ratings["item_id"] = encoded_items
user_item_ratings["user_id"] = encoded_users

unique_users, unique_items = (
    pd.Series(encoder.classes_) for encoder in (user_encoder, item_encoder)
)

print(unique_users.shape, unique_items.shape)

user_item_ratings.head(3)


(12,) (24,)


Unnamed: 0,user_id,item_id,rating
0,0,23,0.8
1,0,12,0.6
2,0,4,1.0


In [5]:
# Build the user-item matrix from the (user_id, item_id, rating) rows.
user_item_matrix_transformer = UserItemMatrixTransformer()
user_item_matrix = user_item_matrix_transformer.transform(user_item_ratings.to_numpy())

# Sanity check: every rating in the frame must be found at [user, item] in the
# matrix. Ratings are cast to float32 before the exact-equality comparison
# (presumably the dtype the matrix stores its values in -- confirm against the
# transformer).
users, items = (
    user_item_ratings[column].to_numpy().astype(int)
    for column in ("user_id", "item_id")
)
ratings = user_item_ratings["rating"].to_numpy().astype(np.float32)
for position, expected_rating in enumerate(ratings):
    assert user_item_matrix[users[position], items[position]] == expected_rating

user_item_matrix.shape

(12, 24)

In [6]:
# The transformer returns a square matrix with one row/column per row of its
# input (see the shapes displayed below), so the users x items matrix yields
# user-user similarities and its transpose yields item-item similarities.
user_similarity_matrix_transformer = SimilarityTransformer()
item_similarity_matrix_transformer = SimilarityTransformer()

user_similarity_matrix = user_similarity_matrix_transformer.transform(user_item_matrix)
item_similarity_matrix = item_similarity_matrix_transformer.transform(user_item_matrix.T)

item_similarity_matrix.shape, user_similarity_matrix.shape

((24, 24), (12, 12))

### User-based collaborative filtering (with optional item_id)

In [None]:
# with item_id

# 1. get all users (that have rated item i)
# 2. rank by similarity to user u
# 3. filter users (by k or similarity threshold)
# 4a. for rating prediction - calculate mean rating of item i (weighted by user similarity)
# 4b. for item recommendation - get all items that the filtered users have rated
# 5. exclude items that user u has already rated
# 6. rank by mean rating (weighted by user similarity)
# 7. return top n items

In [38]:
# Worked example: target user, target item and the neighbourhood size.
user_id, item_id, k = 5, 9, 10

# Sense check: first everything the target user has rated, then every rating
# the target item has received.
rows_for_user = user_item_ratings["user_id"].eq(user_id)
rows_for_item = user_item_ratings["item_id"].eq(item_id)

display(user_item_ratings.loc[rows_for_user], user_item_ratings.loc[rows_for_item])

Unnamed: 0,user_id,item_id,rating
40,5,12,0.85
41,5,9,0.25
42,5,7,0.65
43,5,13,0.25
44,5,18,0.85
45,5,19,0.85
46,5,22,0.65
47,5,11,0.65


Unnamed: 0,user_id,item_id,rating
38,4,9,0.44
41,5,9,0.25
71,8,9,0.28
73,9,9,0.89


In [44]:

# 1. get all users (that have rated item i)

# Slice the item column with a list index so the result is always 2-D
# (n_users x 1); the ROW coordinates returned by `find` are then the user
# indices. With a plain integer index the slice may come back 1-D or
# (n_users x 1) depending on the sparse container type / SciPy version, which
# changes which coordinate carries the user index.
users, _, ratings = sp.sparse.find(user_item_matrix[:, [item_id]])

# exclude the target user from the candidate neighbours
user_mask = users != user_id

users = users[user_mask]
ratings = ratings[user_mask]

users, ratings

(array([4, 8, 9]), array([0.44, 0.28, 0.89], dtype=float32))

In [45]:

# 2. rank the candidate users by similarity to user u
# 3. keep the k most similar ones

# `.ravel()` guarantees a 1-D vector: depending on the sparse container type /
# SciPy version the [row, columns] slice densifies to shape (n,) or (1, n), and
# in the 2-D case the `[:k]` below would slice rows instead of neighbours.
user_similarity_to_u = user_similarity_matrix[user_id, users].toarray().ravel()

# argsort is ascending, so sort on (1 - similarity) to get the most similar
# users first; the stable sort keeps the incoming user order for ties.
user_sorter = np.argsort(1 - user_similarity_to_u, kind="stable")[:k]

similar_users = users[user_sorter]
user_similarities = user_similarity_to_u[user_sorter]
similar_users_ratings = ratings[user_sorter]

print(f"similar_users: {similar_users}")
print(f"user_similarities: {user_similarities}")
print(f"similar_users_ratings: {similar_users_ratings}")

similar_users: [9 4 8]
user_similarities: [0.322963 0.158928 0.155166]
similar_users_ratings: [0.89 0.44 0.28]


In [50]:
# 4a. for prediction: mean rating of item i, weighted by user similarity

if user_similarities.size > 0 and user_similarities.sum() != 0:
    predicted_rating = (
        np.average(similar_users_ratings, weights=user_similarities)
        .astype(np.float32)
        .round(6)
    )
else:
    # No neighbour rated the item, or all similarities are zero: np.average
    # would raise ZeroDivisionError, so report "no prediction" as NaN instead.
    predicted_rating = np.float32(np.nan)

# Format explicitly: a float32 interpolated with a plain f-string is widened to
# a Python float and prints float32 noise (e.g. 0.6291620135307312).
print(f"predicted_rating: {predicted_rating:.6f}")

predicted_rating: 0.6291620135307312


In [51]:
# for recommendation

# Items that user u has already rated, together with the ratings given.
rated_by_u = sp.sparse.find(user_item_matrix[user_id, :])
items_rated_by_u, items_ratings_by_u = rated_by_u[1:]

print(f"items_rated_by_u: {items_rated_by_u}")
print(f"items_ratings_by_u: {items_ratings_by_u}")

# Every (item, rating) entry from the selected similar users. Only the item
# coordinate and the value are kept; the row coordinate (which neighbour gave
# the rating) is discarded here, so items rated by several neighbours repeat.
neighbour_entries = sp.sparse.find(user_item_matrix[similar_users, :])
items, item_ratings = neighbour_entries[1], neighbour_entries[2]
print(f"items: {items}")
print(f"item_ratings: {item_ratings}")

items_rated_by_u: [ 7  9 11 12 13 18 19 22]
items_ratings_by_u: [0.65 0.25 0.65 0.85 0.25 0.85 0.85 0.65]
items: [ 2  5  9 10 11 16 17 22  1  9 13 14 15 19 20 21  0  1  4  9 12 15 22 23]
item_ratings: [0.69 0.29 0.89 0.49 0.89 0.29 0.29 0.29 0.64 0.44 1.   1.   0.64 0.24
 0.24 0.64 0.88 1.   0.88 0.28 0.28 0.88 0.48 0.88]


In [9]:
# NOTE: disabled scratch draft of item-based rating prediction, kept for reference only.
# user_ids = user_encoder.transform(["U00003", "U00004", "U00005"])
# item_ids = item_encoder.transform(["I00010", "I00011", "I00012"])

# X = np.stack([user_ids, item_ids], axis=1)

# print("X", X)

# n=5
# k=10

# for u, i in X:
#     print("u", u)
#     print("i", i)

#     idx, users_rated_items, users_ratings = sp.sparse.find(user_item_matrix[u, :])

#     print("users_rated_items", users_rated_items)
#     print("users_ratings", users_ratings)

#     # get the similarities to item_id
#     item_similarities = item_similarity_matrix[:, users_rated_items][i].toarray().astype(np.float32).round(6)
#     print("item_similarities", item_similarities)

#     # sort by similarity (desc) and get top k
#     top_k_mask = np.argsort(1 - item_similarities)[1:k+1]
#     print("top_k_mask", top_k_mask)

#     top_k_users_rated_items = users_rated_items[top_k_mask]
#     top_k_user_ratings = users_ratings[top_k_mask]
#     top_k_rated_item_similarities = item_similarities[top_k_mask]

#     print("top_k_users_rated_items", top_k_users_rated_items)
#     print("top_k_user_ratings", top_k_user_ratings)
#     print("top_k_rated_item_similarities", top_k_rated_item_similarities)

#     # weighted average rating
#     predicted_rating = np.average(top_k_user_ratings, axis=0, weights=top_k_rated_item_similarities).astype(np.float32).round(6)
#     print("predicted_rating", predicted_rating)