In [12]:
%%capture
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.preprocessing import LabelEncoder
from recsys_pipeliner.recommendations.transformer import (
    SimilarityTransformer,
    UserItemMatrixTransformer,
)
from recsys_pipeliner.recommenders import (
    ItemBasedCFRecommender,
)

In [14]:
data_types = {"user_id": str, "item_id": str, "rating": np.float64}
user_item_ratings = pd.read_csv("../../tests/test_data/user_item_ratings_toy.csv", dtype=data_types)
user_item_ratings.head(3)

Unnamed: 0,user_id,item_id,rating
0,U00001,I00024,0.8
1,U00001,I00013,0.6
2,U00001,I00005,1.0


In [15]:
# encode the item ids
item_encoder = LabelEncoder()
user_encoder = LabelEncoder()

user_item_ratings["item_id"] = item_encoder.fit_transform(
    user_item_ratings["item_id"]
)
user_item_ratings["user_id"] = user_encoder.fit_transform(
    user_item_ratings["user_id"]
)

unique_users = pd.Series(user_encoder.classes_)
unique_items = pd.Series(item_encoder.classes_)

print(unique_users.shape, unique_items.shape)

user_item_ratings.head(3)


(12,) (24,)


Unnamed: 0,user_id,item_id,rating
0,0,23,0.8
1,0,12,0.6
2,0,4,1.0


In [16]:
user_item_matrix_transformer = UserItemMatrixTransformer()

user_item_matrix = user_item_matrix_transformer.transform(
    user_item_ratings.to_numpy(),
)

# check ratings from matrix are correct
users = user_item_ratings["user_id"].to_numpy().astype(int)
items = user_item_ratings["item_id"].to_numpy().astype(int)
ratings = user_item_ratings["rating"].to_numpy().astype(np.float32)
for user, item, rating in zip(users, items, ratings):
    assert user_item_matrix[user, item] == rating

user_item_matrix.shape

(12, 24)

In [17]:
item_similarity_matrix_transformer = SimilarityTransformer()
item_similarity_matrix = item_similarity_matrix_transformer.transform(
    user_item_matrix.T
)

user_similarity_matrix_transformer = SimilarityTransformer()
user_similarity_matrix = user_similarity_matrix_transformer.transform(
    user_item_matrix
)

item_similarity_matrix.shape, user_similarity_matrix.shape

((24, 24), (12, 12))

### Predict ratings with ItemBasedCFRecommender

In [None]:
user_id = "U00003"
user_ids = user_encoder.transform([user_id])
item_id = "I00010"
item_ids = item_encoder.transform([item_id])
n=5
k=10

_, users_rated_items, users_ratings = sp.sparse.find(user_item_matrix[user_ids, :])

# get the top k rated items
top_k_users_rated_items = np.argsort(1 - users_ratings)[:k]
# and all items not rated by the user
users_unrated_items = np.setdiff1d(np.arange(item_similarity_matrix.shape[0]), top_k_users_rated_items)

print("rated", top_k_users_rated_items)
print("unrated", users_unrated_items)


rated [1 5 2 0 3 4 6 7]
unrated [ 8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
mean_similarity [0.24191025 0.18034162 0.32327137 0.1931685  0.22459275 0.25758125
 0.282926   0.433498   0.12282975 0.247516   0.31360625 0.0620775
 0.15356425 0.15041012 0.3026765  0.40872875]


array(['I00016', 'I00024', 'I00011', 'I00019', 'I00023'], dtype=object)

In [None]:
item_based_recommender = ItemBasedCFRecommender(k=5, n=10)
item_based_recommender.fit(user_item_matrix)

X = np.vstack([np.repeat(user_ids[0], users_unrated_items.shape[0]).T, users_unrated_items.T])

predictions = item_based_recommender.predict(X)

predictions

I00009 0.820002
I00010 0.836961
I00011 0.891748
I00012 0.721764
I00013 0.849971
I00014 0.817382
I00015 0.761754
I00016 0.812887
I00017 0.732683
I00018 0.742017
