In [1]:
%%capture
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.preprocessing import LabelEncoder
from pipeliner.recommendations.transformer import (
    SimilarityTransformer,
    UserItemMatrixTransformer,
)
from pipeliner.recommendations.recommender import (
    SimilarityRecommender,
    ItemBasedRecommender,
    UserBasedRecommender,
)

In [3]:
data_types = {"user_id": str, "item_id": str, "rating": np.float64}
user_item_ratings = pd.read_csv("../../tests/test_data/user_item_ratings_toy.csv", dtype=data_types)
user_item_ratings.head(3)

Unnamed: 0,user_id,item_id,rating
0,U00001,I00001,0.8
1,U00001,I00002,0.4
2,U00001,I00003,0.6


In [4]:
# encode the item ids
item_encoder = LabelEncoder()
user_encoder = LabelEncoder()

user_item_ratings["item_id"] = item_encoder.fit_transform(
    user_item_ratings["item_id"]
)
user_item_ratings["user_id"] = user_encoder.fit_transform(
    user_item_ratings["user_id"]
)

unique_users = pd.Series(user_encoder.classes_)
unique_items = pd.Series(item_encoder.classes_)

print(unique_users.shape, unique_items.shape)

user_item_ratings.head(3)


(12,) (24,)


Unnamed: 0,user_id,item_id,rating
0,0,0,0.8
1,0,1,0.4
2,0,2,0.6


In [5]:
user_item_matrix_transformer = UserItemMatrixTransformer()

user_item_matrix = user_item_matrix_transformer.transform(
    user_item_ratings.to_numpy(),
)

# check ratings from matrix are correct
users = user_item_ratings["user_id"].to_numpy().astype(int)
items = user_item_ratings["item_id"].to_numpy().astype(int)
ratings = user_item_ratings["rating"].to_numpy().astype(np.float32)
for user, item, rating in zip(users, items, ratings):
    assert user_item_matrix[user, item] == rating

user_item_matrix.shape

(12, 24)

In [6]:
item_similarity_matrix_transformer = SimilarityTransformer()
item_similarity_matrix = item_similarity_matrix_transformer.transform(
    user_item_matrix.T
)

user_similarity_matrix_transformer = SimilarityTransformer()
user_similarity_matrix = user_similarity_matrix_transformer.transform(
    user_item_matrix
)

item_similarity_matrix.shape, user_similarity_matrix.shape

((24, 24), (12, 12))

Get generic recommendations for a user:
1. Get their top k rated items
2. Find similar items to those they rated highly
3. Generate recommendations based on item similarity and rating

In [7]:
user_id = "U00003"
user_idx = user_encoder.transform([user_id])[0]
n=5
k=10

_, users_rated_items, users_ratings = sp.sparse.find(user_item_matrix[user_idx, :])

# get the top k rated items
top_k_users_rated_items = np.argsort(1 - users_ratings)[:k]
# and all items not rated by the user
users_unrated_items = np.setdiff1d(np.arange(item_similarity_matrix.shape[0]), top_k_users_rated_items)

print("rated", top_k_users_rated_items)
print("unrated", users_unrated_items)

# filter the similarity matrix (rows=rated, cols=unrated)
single_user_item_similarity_matrix = item_similarity_matrix[top_k_users_rated_items[:, None], users_unrated_items]

# calculate the mena similarity of all the user's unrated items
mean_similarity = single_user_item_similarity_matrix.mean(axis=0)
print("mean_similarity", mean_similarity)

# sort get the top n most similar items
similarity_selector = np.argsort(1 - mean_similarity)[:n]
sorted_similarity = mean_similarity[similarity_selector]
items = users_unrated_items[similarity_selector]
recommendations = item_encoder.inverse_transform(items)

recommendations

rated [4 0 5 2 1 3]
unrated [ 6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
mean_similarity [0.26188383 0.27269283 0.1179595  0.14631233 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.15422617 0.12304533 0.35654083 0.26228583]


array(['I00023', 'I00008', 'I00024', 'I00007', 'I00021'], dtype=object)

## Item-based prediction

Given a user_id and an item_id:

1. Get all items the user has rated
2. Sort by similarity to item_id
3. Get top k
4. Calculate weighted average rating
5. Return estimated score

In [8]:
item_id = "I00010"
item_idx = item_encoder.transform([item_id])[0]
k=5

print("user_idx", user_idx)
print("item_idx", item_idx)

_, users_rated_items, users_ratings = sp.sparse.find(user_item_matrix[user_idx, :])

print("users_rated_items", users_rated_items)
print("users_ratings", users_ratings)

# get the similarities to item_id
item_similarities = item_similarity_matrix[:, users_rated_items][item_idx].toarray().astype(np.float32).round(6)
print("item_similarities", item_similarities)

# sort by similarity (desc) and get top k
top_k_mask = np.argsort(1 - item_similarities)[1:k+1]
print("top_k_mask", top_k_mask)

top_k_users_rated_items = users_rated_items[top_k_mask]
top_k_user_ratings = users_ratings[top_k_mask]
top_k_rated_item_similarities = item_similarities[top_k_mask]

print("top_k_users_rated_items", top_k_users_rated_items)
print("top_k_user_ratings", top_k_user_ratings)
print("top_k_rated_item_similarities", top_k_rated_item_similarities)

# weighted average rating
predicted_score = np.average(top_k_user_ratings, axis=0, weights=top_k_rated_item_similarities).astype(np.float32).round(6)
print("predicted_score", predicted_score)

user_idx 2
item_idx 9
users_rated_items [4 5 6 7 8 9]
users_ratings [0.82 0.42 0.62 0.22 1.   0.82]
item_similarities [0.490569 0.387305 0.506639 0.30895  0.9601   1.      ]
top_k_mask [4 2 0 1 3]
top_k_users_rated_items [8 6 4 5 7]
top_k_user_ratings [1.   0.62 0.82 0.42 0.22]
top_k_rated_item_similarities [0.9601   0.506639 0.490569 0.387305 0.30895 ]
predicted_score 0.718702


In [9]:
item_based_recommender = ItemBasedRecommender(k=5)
item_based_recommender.fit(user_item_matrix)

for id in users_unrated_items[:10]:
    prediction = item_based_recommender.predict(user_idx, id)
    item_id = item_encoder.inverse_transform([id])[0]
    print(item_id, prediction)

I00007 0.607503
I00008 0.718121
I00009 0.64528
I00010 0.718702
I00011 0.5717
I00012 0.543242
I00013 0.819996
I00014 0.819997
I00015 0.616
I00016 0.616


## User-based prediction

1. Get users who have rated item_id
2. Sort by similarity to user_id
3. Get top k
4. Get those users' rating of item_id
5. Calculate weighted average rating
6. Return estimated score

In [10]:
user_id = "U00003"
user_idx = user_encoder.transform([user_id])[0]
n=5
k=10

print("user_idx", user_idx)
print("item_idx", item_idx)

_, users, users_ratings = sp.sparse.find(user_item_matrix[:, item_idx])

print("users", users)
print("users_ratings", users_ratings)

# get the similarities to user_id
_, similar_users, user_similarities = sp.sparse.find(user_similarity_matrix[user_idx, users])

print("similar_users", similar_users)
print("user_similarities", user_similarities)

# sort by similarity (desc) and get top k
top_k_mask = np.argsort(1 - user_similarities)[1:k+1]
print("top_k_mask", top_k_mask)

top_k_users = users[top_k_mask]
top_k_users_ratings = users_ratings[top_k_mask]
top_k_users_similarities = user_similarities[top_k_mask]

print("top_k_users", top_k_users)
print("top_k_users_ratings", top_k_users_ratings)
print("top_k_users_similarities", top_k_users_similarities)

# weighted average rating
predicted_score = np.average(top_k_users_ratings, axis=0, weights=top_k_users_similarities).astype(np.float32).round(6)
print("predicted_score", predicted_score)

user_idx 2
item_idx 9
users [2 3 4]
users_ratings [0.82 0.23 0.44]
similar_users [0 1 2]
user_similarities [1.       0.478629 0.398616]
top_k_mask [1 2]
top_k_users [3 4]
top_k_users_ratings [0.23 0.44]
top_k_users_similarities [0.478629 0.398616]
predicted_score 0.325423


In [11]:
user_based_recommender = UserBasedRecommender(k=5)
user_based_recommender.fit(user_item_matrix)

for id in users_unrated_items[:10]:
    prediction = user_based_recommender.predict(user_idx, id)
    item_id = item_encoder.inverse_transform([id])[0]
    print(item_id, prediction)

2 6
users [1 2 3]
users_ratings [1.   0.62 0.83]
similar_users [0 1 2]
user_similarities [0.474034 1.       0.478629]
top_k_mask [2 0]
top_k_users [3 1]
top_k_users_ratings [0.83 1.  ]
top_k_users_similarities [0.478629 0.474034]
predicted_score 0.91459
I00007 0.91459
2 7
users [1 2 3]
users_ratings [0.81 0.22 0.43]
similar_users [0 1 2]
user_similarities [0.474034 1.       0.478629]
top_k_mask [2 0]
top_k_users [3 1]
top_k_users_ratings [0.43 0.81]
top_k_users_similarities [0.478629 0.474034]
predicted_score 0.619084
I00008 0.619084
2 8
users [2 3 4]
users_ratings [1.   0.63 0.84]
similar_users [0 1 2]
user_similarities [1.       0.478629 0.398616]
top_k_mask [1 2]
top_k_users [3 4]
top_k_users_ratings [0.63 0.84]
top_k_users_similarities [0.478629 0.398616]
predicted_score 0.725423
I00009 0.725423
2 9
users [2 3 4]
users_ratings [0.82 0.23 0.44]
similar_users [0 1 2]
user_similarities [1.       0.478629 0.398616]
top_k_mask [1 2]
top_k_users [3 4]
top_k_users_ratings [0.23 0.44]
top_

ZeroDivisionError: Weights sum to zero, can't be normalized