In [1]:
%%capture
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.preprocessing import LabelEncoder
from recsys_pipeliner.recommendations.transformer import (
    SimilarityTransformer,
    UserItemMatrixTransformer,
)
from recsys_pipeliner.recommenders import (
    ItemBasedCFRecommender,
)

In [3]:
data_types = {"user_id": str, "item_id": str, "rating": np.float64}
user_item_ratings = pd.read_csv("../../tests/test_data/user_item_ratings_toy.csv", dtype=data_types)
user_item_ratings.head(3)

Unnamed: 0,user_id,item_id,rating
0,U00001,I00024,0.8
1,U00001,I00013,0.6
2,U00001,I00005,1.0


In [4]:
# encode the item ids
item_encoder = LabelEncoder()
user_encoder = LabelEncoder()

user_item_ratings["item_id"] = item_encoder.fit_transform(
    user_item_ratings["item_id"]
)
user_item_ratings["user_id"] = user_encoder.fit_transform(
    user_item_ratings["user_id"]
)

unique_users = pd.Series(user_encoder.classes_)
unique_items = pd.Series(item_encoder.classes_)

print(unique_users.shape, unique_items.shape)

user_item_ratings.head(3)


(12,) (24,)


Unnamed: 0,user_id,item_id,rating
0,0,23,0.8
1,0,12,0.6
2,0,4,1.0


In [5]:
user_item_matrix_transformer = UserItemMatrixTransformer()

user_item_matrix = user_item_matrix_transformer.transform(
    user_item_ratings.to_numpy(),
)

# check ratings from matrix are correct
users = user_item_ratings["user_id"].to_numpy().astype(int)
items = user_item_ratings["item_id"].to_numpy().astype(int)
ratings = user_item_ratings["rating"].to_numpy().astype(np.float32)
for user, item, rating in zip(users, items, ratings):
    assert user_item_matrix[user, item] == rating

user_item_matrix.shape

(12, 24)

In [6]:
item_similarity_matrix_transformer = SimilarityTransformer()
item_similarity_matrix = item_similarity_matrix_transformer.transform(
    user_item_matrix.T
)

user_similarity_matrix_transformer = SimilarityTransformer()
user_similarity_matrix = user_similarity_matrix_transformer.transform(
    user_item_matrix
)

item_similarity_matrix.shape, user_similarity_matrix.shape

((24, 24), (12, 12))

### Predict ratings with ItemBasedCFRecommender

In [17]:
user_ids = user_encoder.transform(["U00003", "U00004", "U00005"])
item_ids = item_encoder.transform(["I00010", "I00011", "I00012"])

X = np.stack([user_ids, item_ids], axis=1)

print("X", X)

n=5
k=10

for u, i in X:
    print("u", u)
    print("i", i)

    idx, users_rated_items, users_ratings = sp.sparse.find(user_item_matrix[u, :])

    print("users_rated_items", users_rated_items)
    print("users_ratings", users_ratings)

    # get the similarities to item_id
    item_similarities = item_similarity_matrix[:, users_rated_items][i].toarray().astype(np.float32).round(6)
    print("item_similarities", item_similarities)

    # sort by similarity (desc) and get top k
    top_k_mask = np.argsort(1 - item_similarities)[1:k+1]
    print("top_k_mask", top_k_mask)

    top_k_users_rated_items = users_rated_items[top_k_mask]
    top_k_user_ratings = users_ratings[top_k_mask]
    top_k_rated_item_similarities = item_similarities[top_k_mask]

    print("top_k_users_rated_items", top_k_users_rated_items)
    print("top_k_user_ratings", top_k_user_ratings)
    print("top_k_rated_item_similarities", top_k_rated_item_similarities)

    # weighted average rating
    predicted_score = np.average(top_k_user_ratings, axis=0, weights=top_k_rated_item_similarities).astype(np.float32).round(6)
    print("predicted_score", predicted_score)

X [[ 2  9]
 [ 3 10]
 [ 4 11]]
u 2
i 9
users_rated_items [ 2  3  5 10 13 15 18 23]
users_ratings [0.82 1.   0.82 0.82 0.82 1.   0.62 0.62]
item_similarities [0.345477 0.       0.279575 0.285752 0.277952 0.260534 0.110629 0.129859]
top_k_mask [3 2 4 5 7 6 1]
top_k_users_rated_items [10  5 13 15 23 18  3]
top_k_user_ratings [0.82 0.82 0.82 1.   0.62 0.62 1.  ]
top_k_rated_item_similarities [0.285752 0.279575 0.277952 0.260534 0.129859 0.110629 0.      ]
predicted_score 0.819106
u 3
i 10
users_rated_items [ 0  3  7 10 12 14 15 23]
users_ratings [0.63 0.43 0.63 0.43 0.63 0.43 1.   0.63]
item_similarities [0.174086 0.642051 0.116487 1.       0.284687 0.103483 0.61923  0.427682]
top_k_mask [1 6 7 4 0 2 5]
top_k_users_rated_items [ 3 15 23 12  0  7 14]
top_k_user_ratings [0.43 1.   0.63 0.63 0.63 0.63 0.43]
top_k_rated_item_similarities [0.642051 0.61923  0.427682 0.284687 0.174086 0.116487 0.103483]
predicted_score 0.663791
u 4
i 11
users_rated_items [ 1  9 13 14 15 19 20 21]
users_ratings [0

In [None]:
# item_based_recommender = ItemBasedCFRecommender(k=5, n=10)
# item_based_recommender.fit(user_item_matrix)

# X = np.vstack([np.repeat(user_ids[0], users_unrated_items.shape[0]).T, users_unrated_items.T])

# predictions = item_based_recommender.predict(X)

# predictions