In [10]:
%%capture
%load_ext autoreload
%autoreload 1

In [11]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.preprocessing import LabelEncoder
from pipeliner.recommendations.transformer import (
    SimilarityTransformer,
    UserItemMatrixTransformer,
)
from pipeliner.recommendations.recommender import SimilarityRecommender

In [12]:
data_types = {"user_id": str, "item_id": str, "rating": np.float64}
user_item_ratings = pd.read_csv("../../tests/test_data/user_item_ratings_toy.csv", dtype=data_types)
user_item_ratings.head(3)

Unnamed: 0,user_id,item_id,rating
0,U00001,I00001,0.8
1,U00001,I00002,0.4
2,U00001,I00003,0.6


In [13]:
# encode the item ids
item_encoder = LabelEncoder()
user_encoder = LabelEncoder()

user_item_ratings["item_id"] = item_encoder.fit_transform(
    user_item_ratings["item_id"]
)
user_item_ratings["user_id"] = user_encoder.fit_transform(
    user_item_ratings["user_id"]
)

unique_users = pd.Series(user_encoder.classes_)
unique_items = pd.Series(item_encoder.classes_)

print(unique_users.shape, unique_items.shape)

user_item_ratings.head(3)


(12,) (24,)


Unnamed: 0,user_id,item_id,rating
0,0,0,0.8
1,0,1,0.4
2,0,2,0.6


In [14]:
user_item_matrix_transformer = UserItemMatrixTransformer()

user_item_matrix = user_item_matrix_transformer.transform(
    user_item_ratings.to_numpy(),
)

# check ratings from matrix are correct
users = user_item_ratings["user_id"].to_numpy().astype(int)
items = user_item_ratings["item_id"].to_numpy().astype(int)
ratings = user_item_ratings["rating"].to_numpy().astype(np.float32)
for user, item, rating in zip(users, items, ratings):
    assert user_item_matrix[user, item] == rating

user_item_matrix.shape

(12, 24)

In [15]:
item_similarity_matrix_transformer = SimilarityTransformer()
item_similarity_matrix = item_similarity_matrix_transformer.transform(
    user_item_matrix.T
)

Get generic recommendations for a user:
1. Get their top k rated items
2. Find similar items to those they rated highly
3. Generate recommendations based on item similarity and rating

In [16]:
user_id = "U00003"
user_idx = user_encoder.transform([user_id])[0]
n=5
k=10

_, users_rated_items, users_ratings = sp.sparse.find(user_item_matrix[user_idx, :])

# get the top k rated items
top_k_users_rated_items = np.argsort(1 - users_ratings)[:k]
# and all items not rated by the user
users_unrated_items = np.setdiff1d(np.arange(item_similarity_matrix.shape[0]), top_k_users_rated_items)

print("rated", top_k_users_rated_items)
print("unrated", users_unrated_items)

# filter the similarity matrix (rows=rated, cols=unrated)
single_user_item_similarity_matrix = item_similarity_matrix[top_k_users_rated_items[:, None], users_unrated_items]

# calculate the mena similarity of all the user's unrated items
mean_similarity = single_user_item_similarity_matrix.mean(axis=0)
print("mean_similarity", mean_similarity)

# sort get the top n most similar items
similarity_selector = np.argsort(1 - mean_similarity)[:n]
sorted_similarity = mean_similarity[similarity_selector]
items = users_unrated_items[similarity_selector]
recommendations = item_encoder.inverse_transform(items)

recommendations

rated [4 0 5 2 1 3]
unrated [ 6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
mean_similarity [0.26188383 0.27269283 0.1179595  0.14631233 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.15422617 0.12304533 0.35654083 0.26228583]


array(['I00023', 'I00008', 'I00024', 'I00007', 'I00021'], dtype=object)

## Item-based prediction

Given a user_id and an item_id:

1. Get all items the user has rated
2. Sort by similarity to item_id
3. Get top k
4. Calculate weighted average rating
5. Return estimated score

In [17]:
item_id = "I00010"
item_idx = item_encoder.transform([item_id])[0]
k=5

print("user_idx", user_idx)
print("item_idx", item_idx)

_, users_rated_items, users_ratings = sp.sparse.find(user_item_matrix[user_idx, :])

print("users_rated_items", users_rated_items)
print("users_ratings", users_ratings)

# get the similarities to item_id
item_similarities = item_similarity_matrix[:, users_rated_items][item_idx].toarray().astype(np.float32).round(6)
print("item_similarities", item_similarities)

# sort by similarity (desc) and get top k
top_k_mask = np.argsort(1 - item_similarities)[:k]
print("top_k_mask", top_k_mask)

top_k_users_rated_items = users_rated_items[top_k_mask]
top_k_user_ratings = users_ratings[top_k_mask]
top_k_rated_item_similarities = item_similarities[top_k_mask]

print("top_k_users_rated_items", top_k_users_rated_items)
print("top_k_user_ratings", top_k_user_ratings)
print("top_k_rated_item_similarities", top_k_rated_item_similarities)

# weighted average rating
predicted_score = np.average(top_k_user_ratings, axis=0, weights=top_k_rated_item_similarities).astype(np.float32).round(6)
print("predicted_score", predicted_score)

user_idx 2
item_idx 9
users_rated_items [4 5 6 7 8 9]
users_ratings [0.82 0.42 0.62 0.22 1.   0.82]
item_similarities [0.490569 0.387305 0.506639 0.30895  0.9601   1.      ]
top_k_mask [5 4 2 0 1]
top_k_users_rated_items [9 8 6 4 5]
top_k_user_ratings [0.82 1.   0.62 0.82 0.42]
top_k_rated_item_similarities [1.       0.9601   0.506639 0.490569 0.387305]
predicted_score 0.795055


## User-based prediction

1. Get users who have rated item_id

In [18]:
user_id = "U00003"
user_idx = user_encoder.transform([user_id])[0]
n=5
k=10

print("user_idx", user_idx)
print("item_idx", item_idx)

_, users, users_ratings = sp.sparse.find(user_item_matrix[:, item_idx])

print("users", users)
print("users_ratings", users_ratings)

user_similarity_matrix_transformer = SimilarityTransformer()
user_similarity_matrix = user_similarity_matrix_transformer.transform(
    user_item_matrix
)

print("user_similarity_matrix", user_similarity_matrix.shape)


# this isn't right
_, similar_users, user_similarities = sp.sparse.find(user_similarity_matrix[user_idx, users])

print("similar_users", similar_users)
print("user_similarities", user_similarities)

# # sort by similarity (desc) and get top k
# # excluding 0 index as this is the current users
# top_k_mask = np.argsort(1 - user_similarities)[1:k+1]
# print("top_k_mask", top_k_mask)

# top_k_users = users[top_k_mask]
# top_k_user_similarities = user_similarities[top_k_mask]

# top_k_users, top_k_user_similarities


user_idx 2
item_idx 9
users [2 3 4]
users_ratings [0.82 0.23 0.44]
user_similarity_matrix (12, 12)
similar_users [0 1 2]
user_similarities [1.       0.478629 0.398616]
