In [44]:
%%capture
%load_ext autoreload
%autoreload 1

In [45]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from recsys_pipeliner.recommendations.transformer import (
    UserItemMatrixTransformer,
    SimilarityTransformer,
)
from recsys_pipeliner.recommendations.recommender import UserBasedRecommender

In [46]:
data_types = {"user_id": str, "item_id": str, "rating": np.float64}
user_item_ratings = pd.read_csv(f"../../tests/test_data/user_item_ratings_toy.csv", dtype=data_types)
user_item_ratings.head(5)

Unnamed: 0,user_id,item_id,rating
0,U00001,I00001,0.8
1,U00001,I00002,0.4
2,U00001,I00003,0.6
3,U00001,I00004,0.2
4,U00001,I00005,1.0


In [47]:
user_item_matrix_pd = (
    user_item_ratings.groupby(["user_id", "item_id"])["rating"]
    .agg("sum")
    .unstack()
    .fillna(0.0)
)
user_item_matrix_pd.head(5)

item_id,I00001,I00002,I00003,I00004,I00005,I00006,I00007,I00008,I00009,I00010,I00011,I00012
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
U00001,0.8,0.4,0.6,0.2,1.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0
U00002,0.0,0.81,0.41,0.61,0.21,1.0,0.81,0.0,0.0,0.0,0.0,0.0
U00003,0.0,0.0,0.82,0.42,0.62,0.22,1.0,0.82,0.0,0.0,0.0,0.0
U00004,0.0,0.0,0.0,0.83,0.43,0.63,0.23,1.0,0.83,0.0,0.0,0.0
U00005,0.0,0.0,0.0,0.0,0.84,0.44,0.64,0.24,1.0,0.84,0.0,0.0


In [48]:
unique_users = user_item_matrix_pd.index.to_numpy()
unique_items = user_item_matrix_pd.columns.to_numpy()

unique_users, unique_items

(array(['U00001', 'U00002', 'U00003', 'U00004', 'U00005', 'U00006',
        'U00007', 'U00008', 'U00009', 'U00010', 'U00011', 'U00012'],
       dtype=object),
 array(['I00001', 'I00002', 'I00003', 'I00004', 'I00005', 'I00006',
        'I00007', 'I00008', 'I00009', 'I00010', 'I00011', 'I00012'],
       dtype=object))

In [49]:
df = pd.DataFrame(
    cosine_similarity(user_item_matrix_pd),
    index=user_item_matrix_pd.index,
    columns=user_item_matrix_pd.index,
)
user_similarity_matrix_pd = (df - df.min()) / (df.max() - df.min()).round(6).astype(
    np.float32
)
user_similarity_matrix_pd.head(5)

user_id,U00001,U00002,U00003,U00004,U00005,U00006,U00007,U00008,U00009,U00010,U00011,U00012
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
U00001,1.0,0.593422,0.473685,0.376071,0.403558,0.227984,0.0,0.228862,0.375167,0.37609,0.466427,0.629074
U00002,0.593422,1.0,0.599157,0.478312,0.380428,0.403195,0.229023,0.0,0.229859,0.375247,0.38024,0.471035
U00003,0.473685,0.599157,1.0,0.604672,0.482801,0.384623,0.402779,0.230013,0.0,0.230808,0.375268,0.384238
U00004,0.376071,0.478312,0.604672,1.0,0.609972,0.487156,0.388662,0.402315,0.230955,0.0,0.231711,0.375235
U00005,0.403558,0.380428,0.482801,0.609972,1.0,0.615067,0.491379,0.392551,0.401804,0.231852,0.0,0.232571


Calcuate the recommendations manually in Pandas.

In [50]:
# get the 10 most similar users
user_id = "U00001"

similar_users_pd = (user_similarity_matrix_pd[user_id]
                 .drop(user_id, errors="ignore")
                 .sort_values(ascending=False))
similar_users_pd = similar_users_pd[similar_users_pd > 0].head(10)

similar_users_pd

user_id
U00012    0.629074
U00002    0.593422
U00003    0.473685
U00011    0.466427
U00005    0.403558
U00010    0.376090
U00004    0.376071
U00009    0.375167
U00008    0.228862
U00006    0.227984
Name: U00001, dtype: float64

In [51]:
# get all the items that the user has already rated
single_user_matrix_pd = user_item_matrix_pd.loc[user_id]
user_rated_items_pd = single_user_matrix_pd[single_user_matrix_pd > 0].sort_values(
    ascending=False, kind="stable"
)
user_rated_items_pd.index.to_list()

['I00005', 'I00001', 'I00006', 'I00003', 'I00002', 'I00004']

In [52]:
# get all ratings by similar users
# exclude items that the users has alrady rated
# sort by rating

matrix = user_item_matrix_pd.T[similar_users_pd.head(10).index]

filtered_user_item_matrix_pd = matrix[
    ~matrix.index.isin(user_rated_items_pd.index) & (matrix > 0).any(axis="columns")
]

user_recommendations = filtered_user_item_matrix_pd.mean(axis=1).sort_values(
    ascending=False, kind="stable"
)

# double check the user hasn't rated the items
# this should be an empty series
assert (
    set(user_rated_items_pd.index.to_list()).intersection(
        set(user_recommendations.index.to_list())
    )
    == set()
)

# get the top 10
user_recs_pd1 = user_recommendations.head(10).index.to_list()
user_recs_pd1

['I00010', 'I00008', 'I00009', 'I00012', 'I00011', 'I00007']

Numpy/Scipy implementation

In [53]:
# encode the user and item ids
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

user_item_ratings_encoded = user_item_ratings.copy() 

user_item_ratings_encoded["user_id"] = user_encoder.fit_transform(user_item_ratings_encoded["user_id"])
user_item_ratings_encoded["item_id"] = item_encoder.fit_transform(user_item_ratings_encoded["item_id"])

user_item_ratings_np = user_item_ratings_encoded.to_numpy().astype(np.float32)

unique_users = pd.Series(user_encoder.classes_)
unique_items = pd.Series(item_encoder.classes_)

print(unique_users.shape[0], unique_items.shape[0])
user_item_ratings_encoded.head(3)

12 12


Unnamed: 0,user_id,item_id,rating
0,0,0,0.8
1,0,1,0.4
2,0,2,0.6


In [54]:
user_item_matrix_transformer_np = UserItemMatrixTransformer()
user_item_matrix_np = user_item_matrix_transformer_np.transform(user_item_ratings_np)

assert isinstance(user_item_matrix_np, sp.sparse.sparray)

In [55]:
# sense check
display(pd.DataFrame(user_item_matrix_np.toarray(), columns=unique_items, index=unique_users))
display(user_item_matrix_pd)

np.testing.assert_array_almost_equal(user_item_matrix_np.toarray(), user_item_matrix_pd.to_numpy())

Unnamed: 0,I00001,I00002,I00003,I00004,I00005,I00006,I00007,I00008,I00009,I00010,I00011,I00012
U00001,0.8,0.4,0.6,0.2,1.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0
U00002,0.0,0.81,0.41,0.61,0.21,1.0,0.81,0.0,0.0,0.0,0.0,0.0
U00003,0.0,0.0,0.82,0.42,0.62,0.22,1.0,0.82,0.0,0.0,0.0,0.0
U00004,0.0,0.0,0.0,0.83,0.43,0.63,0.23,1.0,0.83,0.0,0.0,0.0
U00005,0.0,0.0,0.0,0.0,0.84,0.44,0.64,0.24,1.0,0.84,0.0,0.0
U00006,0.0,0.0,0.0,0.0,0.0,0.85,0.45,0.65,0.25,1.0,0.85,0.0
U00007,0.0,0.0,0.0,0.0,0.0,0.0,0.86,0.46,0.66,0.26,1.0,0.86
U00008,0.87,0.0,0.0,0.0,0.0,0.0,0.0,0.87,0.47,0.67,0.27,1.0
U00009,1.0,0.88,0.0,0.0,0.0,0.0,0.0,0.0,0.88,0.48,0.68,0.28
U00010,0.29,1.0,0.89,0.0,0.0,0.0,0.0,0.0,0.0,0.89,0.49,0.69


item_id,I00001,I00002,I00003,I00004,I00005,I00006,I00007,I00008,I00009,I00010,I00011,I00012
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
U00001,0.8,0.4,0.6,0.2,1.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0
U00002,0.0,0.81,0.41,0.61,0.21,1.0,0.81,0.0,0.0,0.0,0.0,0.0
U00003,0.0,0.0,0.82,0.42,0.62,0.22,1.0,0.82,0.0,0.0,0.0,0.0
U00004,0.0,0.0,0.0,0.83,0.43,0.63,0.23,1.0,0.83,0.0,0.0,0.0
U00005,0.0,0.0,0.0,0.0,0.84,0.44,0.64,0.24,1.0,0.84,0.0,0.0
U00006,0.0,0.0,0.0,0.0,0.0,0.85,0.45,0.65,0.25,1.0,0.85,0.0
U00007,0.0,0.0,0.0,0.0,0.0,0.0,0.86,0.46,0.66,0.26,1.0,0.86
U00008,0.87,0.0,0.0,0.0,0.0,0.0,0.0,0.87,0.47,0.67,0.27,1.0
U00009,1.0,0.88,0.0,0.0,0.0,0.0,0.0,0.0,0.88,0.48,0.68,0.28
U00010,0.29,1.0,0.89,0.0,0.0,0.0,0.0,0.0,0.0,0.89,0.49,0.69


In [56]:
user_similarity_transformer_np = SimilarityTransformer()
user_similarity_matrix_np = user_similarity_transformer_np.transform(user_item_matrix_np).astype(np.float32)

assert isinstance(user_similarity_matrix_np, sp.sparse.sparray)

In [57]:
# sense check
display(
    pd.DataFrame(
        user_similarity_matrix_np.toarray(), columns=unique_users, index=unique_users
    )
)
display(user_similarity_matrix_pd)

np.testing.assert_array_almost_equal(
    user_similarity_matrix_np.toarray().astype(np.float32).round(6),
    user_similarity_matrix_pd.to_numpy().astype(np.float32).round(6),
)


Unnamed: 0,U00001,U00002,U00003,U00004,U00005,U00006,U00007,U00008,U00009,U00010,U00011,U00012
U00001,1.0,0.593422,0.473685,0.376071,0.403558,0.227984,0.0,0.228862,0.375167,0.37609,0.466428,0.629074
U00002,0.593422,1.0,0.599157,0.478312,0.380428,0.403195,0.229023,0.0,0.229859,0.375247,0.38024,0.471035
U00003,0.473685,0.599157,1.0,0.604672,0.482801,0.384623,0.402779,0.230013,0.0,0.230808,0.375268,0.384238
U00004,0.376071,0.478312,0.604672,1.0,0.609972,0.487156,0.388662,0.402315,0.230955,0.0,0.231711,0.375235
U00005,0.403558,0.380428,0.482801,0.609972,1.0,0.615067,0.49138,0.392551,0.401804,0.231852,0.0,0.232571
U00006,0.227984,0.403195,0.384623,0.487156,0.615067,1.0,0.619961,0.495476,0.396293,0.401251,0.232705,0.0
U00007,0.0,0.229023,0.402779,0.388662,0.49138,0.619961,1.0,0.624663,0.499449,0.399894,0.400659,0.233516
U00008,0.228862,0.0,0.230013,0.402315,0.392551,0.495476,0.624663,1.0,0.629179,0.503301,0.403359,0.400029
U00009,0.375167,0.229859,0.0,0.230955,0.401804,0.396293,0.499449,0.629179,1.0,0.633514,0.507036,0.406692
U00010,0.37609,0.375247,0.230808,0.0,0.231852,0.401251,0.399894,0.503301,0.633514,1.0,0.637677,0.510657


user_id,U00001,U00002,U00003,U00004,U00005,U00006,U00007,U00008,U00009,U00010,U00011,U00012
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
U00001,1.0,0.593422,0.473685,0.376071,0.403558,0.227984,0.0,0.228862,0.375167,0.37609,0.466427,0.629074
U00002,0.593422,1.0,0.599157,0.478312,0.380428,0.403195,0.229023,0.0,0.229859,0.375247,0.38024,0.471035
U00003,0.473685,0.599157,1.0,0.604672,0.482801,0.384623,0.402779,0.230013,0.0,0.230808,0.375268,0.384238
U00004,0.376071,0.478312,0.604672,1.0,0.609972,0.487156,0.388662,0.402315,0.230955,0.0,0.231711,0.375235
U00005,0.403558,0.380428,0.482801,0.609972,1.0,0.615067,0.491379,0.392551,0.401804,0.231852,0.0,0.232571
U00006,0.227984,0.403195,0.384623,0.487156,0.615067,1.0,0.619961,0.495476,0.396293,0.401251,0.232705,0.0
U00007,0.0,0.229023,0.402779,0.388662,0.491379,0.619961,1.0,0.624663,0.499449,0.399894,0.400659,0.233516
U00008,0.228862,0.0,0.230013,0.402315,0.392551,0.495476,0.624663,1.0,0.629179,0.503301,0.403359,0.400029
U00009,0.375167,0.229859,0.0,0.230955,0.401804,0.396293,0.499449,0.629179,1.0,0.633514,0.507036,0.406692
U00010,0.37609,0.375247,0.230808,0.0,0.231852,0.401251,0.399894,0.503301,0.633514,1.0,0.637677,0.510657


In [58]:
# get the 10 most similar users
user_id_encoded = user_encoder.transform([user_id])[0]

single_user_similarity_matrix_np = user_similarity_matrix_np[[user_id_encoded]]

excluded = single_user_similarity_matrix_np > 0
excluded[[0], [user_id_encoded]] = False

user_mask = excluded.toarray()[0]
user_sorter = np.argsort(1 - single_user_similarity_matrix_np.toarray()[0], kind="stable")
sorted_mask = user_mask[user_sorter]
similar_users_np = user_sorter[sorted_mask][:10]

np.testing.assert_array_equal(
    similar_users_np,
    user_encoder.transform(similar_users_pd.index.to_numpy()),
)

In [59]:
user_indices = np.arange(user_item_matrix_np.shape[0])
item_indices = np.arange(user_item_matrix_np.shape[1])

# get all the items that the user has already rated
single_user_ratings_np = user_item_matrix_np[
    [user_id_encoded]
]
users_rated_items_np = (single_user_ratings_np > 0).nonzero()[1]
users_rated_items_decoded = item_encoder.inverse_transform(users_rated_items_np)

np.testing.assert_array_equal(
    users_rated_items_decoded, 
    np.sort(user_rated_items_pd.index.to_numpy(), kind="stable"), 
    verbose=True
)

similar_users_user_item_matrix_np = user_item_matrix_np[similar_users_np]

any_ratings = np.nonzero(similar_users_user_item_matrix_np.sum(axis=0))[0]
items_to_use = np.setdiff1d(any_ratings, users_rated_items_np)

filtered_user_item_matrix_np = similar_users_user_item_matrix_np[:, items_to_use]

display(filtered_user_item_matrix_pd)

tmp_pd = pd.DataFrame(
    filtered_user_item_matrix_np.toarray().T,
    columns=user_encoder.inverse_transform(similar_users_np),
    index=item_encoder.inverse_transform(items_to_use),
)
display(tmp_pd)
print(filtered_user_item_matrix_pd.mean(axis=1).sort_values(ascending=False, kind="stable"))
print(tmp_pd.mean(axis=1).sort_values(ascending=False, kind="stable"))

item_mean_ratings = filtered_user_item_matrix_np.toarray().T.mean(axis=1)
item_sorter = np.argsort(1 - item_mean_ratings, kind="stable")

user_recs_np1 = item_encoder.inverse_transform(items_to_use[item_sorter][:10])

np.testing.assert_array_equal(user_recs_pd1, user_recs_np1)

user_id,U00012,U00002,U00003,U00011,U00005,U00010,U00004,U00009,U00008,U00006
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
I00007,0.0,0.81,1.0,0.0,0.64,0.0,0.23,0.0,0.0,0.45
I00008,0.0,0.0,0.82,0.0,0.24,0.0,1.0,0.0,0.87,0.65
I00009,0.0,0.0,0.0,0.0,1.0,0.0,0.83,0.88,0.47,0.25
I00010,0.0,0.0,0.0,0.0,0.84,0.89,0.0,0.48,0.67,1.0
I00011,0.0,0.0,0.0,0.9,0.0,0.49,0.0,0.68,0.27,0.85
I00012,0.91,0.0,0.0,0.5,0.0,0.69,0.0,0.28,1.0,0.0


Unnamed: 0,U00012,U00002,U00003,U00011,U00005,U00010,U00004,U00009,U00008,U00006
I00007,0.0,0.81,1.0,0.0,0.64,0.0,0.23,0.0,0.0,0.45
I00008,0.0,0.0,0.82,0.0,0.24,0.0,1.0,0.0,0.87,0.65
I00009,0.0,0.0,0.0,0.0,1.0,0.0,0.83,0.88,0.47,0.25
I00010,0.0,0.0,0.0,0.0,0.84,0.89,0.0,0.48,0.67,1.0
I00011,0.0,0.0,0.0,0.9,0.0,0.49,0.0,0.68,0.27,0.85
I00012,0.91,0.0,0.0,0.5,0.0,0.69,0.0,0.28,1.0,0.0


item_id
I00010    0.388
I00008    0.358
I00009    0.343
I00012    0.338
I00011    0.319
I00007    0.313
dtype: float64
I00010    0.388
I00008    0.358
I00009    0.343
I00012    0.338
I00011    0.319
I00007    0.313
dtype: float32


In [60]:
# produce the same results uisng the library (np implementation) - they should be the same

rec_np = UserBasedRecommender(10, 10)

rec_np.fit(user_item_matrix_np)

recs_sp = rec_np.recommend([user_id_encoded])[0]

np.testing.assert_array_equal(item_encoder.transform(user_recs_pd1), recs_sp)
np.testing.assert_array_equal(item_encoder.transform(user_recs_np1), recs_sp)