In [1]:
%%capture
%load_ext autoreload
%autoreload 1

In [2]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from pipeliner.recommendations.transformer import (
    UserItemMatrixTransformer,
    SimilarityTransformer,
)
from pipeliner.recommendations.recommender import UserBasedRecommender

In [3]:
data_types = {"user_id": str, "item_id": str, "rating": np.float64}
user_item_ratings = pd.read_csv(f"../../tests/test_data/user_item_ratings_toy.csv", dtype=data_types)
user_item_ratings.head(5)

Unnamed: 0,user_id,item_id,rating
0,U00001,I00001,1.0
1,U00001,I00002,0.5
2,U00001,I00003,0.5
3,U00001,I00004,0.25
4,U00002,I00002,1.0


In [4]:
user_item_matrix_pd = (
    user_item_ratings.groupby(["user_id", "item_id"])["rating"]
    .agg("sum")
    .unstack()
    .fillna(0.0)
)
user_item_matrix_pd.head(5)

item_id,I00001,I00002,I00003,I00004,I00005,I00006,I00007,I00008,I00009,I00010,I00011,I00012
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
U00001,1.0,0.5,0.5,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U00002,0.0,1.0,0.5,0.5,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U00003,0.0,0.0,1.0,0.5,0.5,0.25,0.0,0.0,0.0,0.0,0.0,0.0
U00004,0.0,0.0,0.0,1.0,0.5,0.5,0.25,0.0,0.0,0.0,0.0,0.0
U00005,0.0,0.0,0.0,0.0,1.0,0.5,0.5,0.25,0.0,0.0,0.0,0.0


In [5]:
unique_users = user_item_matrix_pd.index.to_numpy()
unique_items = user_item_matrix_pd.columns.to_numpy()

unique_users, unique_items

(array(['U00001', 'U00002', 'U00003', 'U00004', 'U00005', 'U00006',
        'U00007', 'U00008', 'U00009', 'U00010', 'U00011', 'U00012'],
       dtype=object),
 array(['I00001', 'I00002', 'I00003', 'I00004', 'I00005', 'I00006',
        'I00007', 'I00008', 'I00009', 'I00010', 'I00011', 'I00012'],
       dtype=object))

In [6]:
df = pd.DataFrame(
    cosine_similarity(user_item_matrix_pd),
    index=user_item_matrix_pd.index,
    columns=user_item_matrix_pd.index,
)
user_similarity_matrix_pd = (df - df.min()) / (df.max() - df.min()).round(6).astype(
    np.float32
)
user_similarity_matrix_pd.head(5)

user_id,U00001,U00002,U00003,U00004,U00005,U00006,U00007,U00008,U00009,U00010,U00011,U00012
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
U00001,1.0,0.56,0.4,0.16,0.0,0.0,0.0,0.0,0.0,0.16,0.4,0.56
U00002,0.56,1.0,0.56,0.4,0.16,0.0,0.0,0.0,0.0,0.0,0.16,0.4
U00003,0.4,0.56,1.0,0.56,0.4,0.16,0.0,0.0,0.0,0.0,0.0,0.16
U00004,0.16,0.4,0.56,1.0,0.56,0.4,0.16,0.0,0.0,0.0,0.0,0.0
U00005,0.0,0.16,0.4,0.56,1.0,0.56,0.4,0.16,0.0,0.0,0.0,0.0


Calcuate the recommendations manually in Pandas.

In [7]:
# get the 10 most similar users
user_id = "U00003"

similar_users_pd = (user_similarity_matrix_pd[user_id]
                 .drop(user_id, errors="ignore")
                 .sort_values(ascending=False))
similar_users_pd = similar_users_pd[similar_users_pd > 0].head(10)

similar_users_pd

user_id
U00002    0.56
U00004    0.56
U00001    0.40
U00005    0.40
U00006    0.16
U00012    0.16
Name: U00003, dtype: float64

In [8]:
# get all the items that the user has already rated
single_user_matrix_pd = user_item_matrix_pd.loc[user_id]
user_rated_items_pd = single_user_matrix_pd[single_user_matrix_pd > 0].sort_values(
    ascending=False
)
user_rated_items_pd.index.to_list()

['I00003', 'I00004', 'I00005', 'I00006']

In [9]:
# get all ratings by similar users
# exclude items that the users has alrady rated
# sort by rating

matrix = user_item_matrix_pd.T[similar_users_pd.head(10).index]

filtered_user_item_matrix_pd = matrix[
    ~matrix.index.isin(user_rated_items_pd.index) & (matrix > 0).any(axis="columns")
]

user_recommendations = filtered_user_item_matrix_pd.mean(axis=1).sort_values(
    ascending=False
)

# double check the user hasn't rated the items
# this should be an empty series
assert (
    set(user_rated_items_pd.index.to_list()).intersection(
        set(user_recommendations.index.to_list())
    )
    == set()
)

# get the top 10
user_recs_pd1 = user_recommendations.head(10).index.to_list()
user_recs_pd1

['I00002', 'I00001', 'I00007', 'I00012', 'I00008', 'I00009']

Numpy/Scipy implementation

In [10]:
# encode the user and item ids
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

user_item_ratings_encoded = user_item_ratings.copy() 

user_item_ratings_encoded["user_id"] = user_encoder.fit_transform(user_item_ratings_encoded["user_id"])
user_item_ratings_encoded["item_id"] = item_encoder.fit_transform(user_item_ratings_encoded["item_id"])

user_item_ratings_np = user_item_ratings_encoded.to_numpy().astype(np.float32)

unique_users = pd.Series(user_encoder.classes_)
unique_items = pd.Series(item_encoder.classes_)

print(unique_users.shape[0], unique_items.shape[0])
user_item_ratings_encoded.head(3)

12 12


Unnamed: 0,user_id,item_id,rating
0,0,0,1.0
1,0,1,0.5
2,0,2,0.5


In [11]:
user_item_matrix_transformer_np = UserItemMatrixTransformer()
user_item_matrix_np = user_item_matrix_transformer_np.transform(user_item_ratings_np)

assert isinstance(user_item_matrix_np, sp.sparse.sparray)

In [12]:
# sense check
display(pd.DataFrame(user_item_matrix_np.toarray(), columns=unique_items, index=unique_users))
display(user_item_matrix_pd)

np.testing.assert_array_equal(user_item_matrix_np.toarray(), user_item_matrix_pd.to_numpy())

Unnamed: 0,I00001,I00002,I00003,I00004,I00005,I00006,I00007,I00008,I00009,I00010,I00011,I00012
U00001,1.0,0.5,0.5,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U00002,0.0,1.0,0.5,0.5,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U00003,0.0,0.0,1.0,0.5,0.5,0.25,0.0,0.0,0.0,0.0,0.0,0.0
U00004,0.0,0.0,0.0,1.0,0.5,0.5,0.25,0.0,0.0,0.0,0.0,0.0
U00005,0.0,0.0,0.0,0.0,1.0,0.5,0.5,0.25,0.0,0.0,0.0,0.0
U00006,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.5,0.25,0.0,0.0,0.0
U00007,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.5,0.25,0.0,0.0
U00008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.5,0.25,0.0
U00009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.5,0.25
U00010,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.5


item_id,I00001,I00002,I00003,I00004,I00005,I00006,I00007,I00008,I00009,I00010,I00011,I00012
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
U00001,1.0,0.5,0.5,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U00002,0.0,1.0,0.5,0.5,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U00003,0.0,0.0,1.0,0.5,0.5,0.25,0.0,0.0,0.0,0.0,0.0,0.0
U00004,0.0,0.0,0.0,1.0,0.5,0.5,0.25,0.0,0.0,0.0,0.0,0.0
U00005,0.0,0.0,0.0,0.0,1.0,0.5,0.5,0.25,0.0,0.0,0.0,0.0
U00006,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.5,0.25,0.0,0.0,0.0
U00007,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.5,0.25,0.0,0.0
U00008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.5,0.25,0.0
U00009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.5,0.25
U00010,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.5


In [13]:
user_similarity_transformer_np = SimilarityTransformer()
user_similarity_matrix_np = user_similarity_transformer_np.transform(user_item_matrix_np).astype(np.float32)

assert isinstance(user_similarity_matrix_np, sp.sparse.sparray)

In [14]:
# sense check
display(
    pd.DataFrame(
        user_similarity_matrix_np.toarray(), columns=unique_users, index=unique_users
    )
)
display(user_similarity_matrix_pd)

np.testing.assert_array_equal(
    user_similarity_matrix_np.toarray().astype(np.float32).round(6),
    user_similarity_matrix_pd.to_numpy().astype(np.float32).round(6),
)


Unnamed: 0,U00001,U00002,U00003,U00004,U00005,U00006,U00007,U00008,U00009,U00010,U00011,U00012
U00001,1.0,0.56,0.4,0.16,0.0,0.0,0.0,0.0,0.0,0.16,0.4,0.56
U00002,0.56,1.0,0.56,0.4,0.16,0.0,0.0,0.0,0.0,0.0,0.16,0.4
U00003,0.4,0.56,1.0,0.56,0.4,0.16,0.0,0.0,0.0,0.0,0.0,0.16
U00004,0.16,0.4,0.56,1.0,0.56,0.4,0.16,0.0,0.0,0.0,0.0,0.0
U00005,0.0,0.16,0.4,0.56,1.0,0.56,0.4,0.16,0.0,0.0,0.0,0.0
U00006,0.0,0.0,0.16,0.4,0.56,1.0,0.56,0.4,0.16,0.0,0.0,0.0
U00007,0.0,0.0,0.0,0.16,0.4,0.56,1.0,0.56,0.4,0.16,0.0,0.0
U00008,0.0,0.0,0.0,0.0,0.16,0.4,0.56,1.0,0.56,0.4,0.16,0.0
U00009,0.0,0.0,0.0,0.0,0.0,0.16,0.4,0.56,1.0,0.56,0.4,0.16
U00010,0.16,0.0,0.0,0.0,0.0,0.0,0.16,0.4,0.56,1.0,0.56,0.4


user_id,U00001,U00002,U00003,U00004,U00005,U00006,U00007,U00008,U00009,U00010,U00011,U00012
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
U00001,1.0,0.56,0.4,0.16,0.0,0.0,0.0,0.0,0.0,0.16,0.4,0.56
U00002,0.56,1.0,0.56,0.4,0.16,0.0,0.0,0.0,0.0,0.0,0.16,0.4
U00003,0.4,0.56,1.0,0.56,0.4,0.16,0.0,0.0,0.0,0.0,0.0,0.16
U00004,0.16,0.4,0.56,1.0,0.56,0.4,0.16,0.0,0.0,0.0,0.0,0.0
U00005,0.0,0.16,0.4,0.56,1.0,0.56,0.4,0.16,0.0,0.0,0.0,0.0
U00006,0.0,0.0,0.16,0.4,0.56,1.0,0.56,0.4,0.16,0.0,0.0,0.0
U00007,0.0,0.0,0.0,0.16,0.4,0.56,1.0,0.56,0.4,0.16,0.0,0.0
U00008,0.0,0.0,0.0,0.0,0.16,0.4,0.56,1.0,0.56,0.4,0.16,0.0
U00009,0.0,0.0,0.0,0.0,0.0,0.16,0.4,0.56,1.0,0.56,0.4,0.16
U00010,0.16,0.0,0.0,0.0,0.0,0.0,0.16,0.4,0.56,1.0,0.56,0.4


In [15]:
# get the 10 most similar users
user_id_encoded = user_encoder.transform([user_id])[0]

single_user_similarity_matrix_np = user_similarity_matrix_np[[user_id_encoded]]

excluded = single_user_similarity_matrix_np > 0
excluded[[0], [user_id_encoded]] = False

user_mask = excluded.toarray()[0]
user_sorter = np.argsort(1 - single_user_similarity_matrix_np.toarray()[0], kind="stable")
sorted_mask = user_mask[user_sorter]
similar_users_np = user_sorter[sorted_mask][:10]

np.testing.assert_array_equal(
    similar_users_np,
    user_encoder.transform(similar_users_pd.index.to_numpy()),
)

In [16]:
user_indices = np.arange(user_item_matrix_np.shape[0])
item_indices = np.arange(user_item_matrix_np.shape[1])

# get all the items that the user has already rated
single_user_ratings_np = user_item_matrix_np[
    [user_id_encoded]
]
users_rated_items_np = (single_user_ratings_np > 0).nonzero()[1]
users_rated_items_decoded = item_encoder.inverse_transform(users_rated_items_np)

np.testing.assert_array_equal(
    users_rated_items_decoded, user_rated_items_pd.index.to_numpy(), verbose=True
)

similar_users_user_item_matrix_np = user_item_matrix_np[similar_users_np]

any_ratings = np.nonzero(similar_users_user_item_matrix_np.sum(axis=0))[0]
items_to_use = np.setdiff1d(any_ratings, users_rated_items_np)

filtered_user_item_matrix_np = similar_users_user_item_matrix_np[:, items_to_use]

display(filtered_user_item_matrix_pd)

tmp_pd = pd.DataFrame(
    filtered_user_item_matrix_np.toarray().T,
    columns=user_encoder.inverse_transform(similar_users_np),
    index=item_encoder.inverse_transform(items_to_use),
)
display(tmp_pd)
print(filtered_user_item_matrix_pd.mean(axis=1).sort_values(ascending=False))
print(tmp_pd.mean(axis=1).sort_values(ascending=False))

item_mean_ratings = filtered_user_item_matrix_np.toarray().T.mean(axis=1)
item_sorter = np.argsort(1 - item_mean_ratings, kind="stable")

user_recs_np1 = item_encoder.inverse_transform(items_to_use[item_sorter][:10])

np.testing.assert_array_equal(user_recs_pd1, user_recs_np1)

user_id,U00002,U00004,U00001,U00005,U00006,U00012
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
I00001,0.0,0.0,1.0,0.0,0.0,0.5
I00002,1.0,0.0,0.5,0.0,0.0,0.5
I00007,0.0,0.25,0.0,0.5,0.5,0.0
I00008,0.0,0.0,0.0,0.25,0.5,0.0
I00009,0.0,0.0,0.0,0.0,0.25,0.0
I00012,0.0,0.0,0.0,0.0,0.0,1.0


Unnamed: 0,U00002,U00004,U00001,U00005,U00006,U00012
I00001,0.0,0.0,1.0,0.0,0.0,0.5
I00002,1.0,0.0,0.5,0.0,0.0,0.5
I00007,0.0,0.25,0.0,0.5,0.5,0.0
I00008,0.0,0.0,0.0,0.25,0.5,0.0
I00009,0.0,0.0,0.0,0.0,0.25,0.0
I00012,0.0,0.0,0.0,0.0,0.0,1.0


item_id
I00002    0.333333
I00001    0.250000
I00007    0.208333
I00012    0.166667
I00008    0.125000
I00009    0.041667
dtype: float64
I00002    0.333333
I00001    0.250000
I00007    0.208333
I00012    0.166667
I00008    0.125000
I00009    0.041667
dtype: float32


In [17]:
# produce the same results uisng the library (np implementation) - they should be the same

rec_np = UserBasedRecommender(10, 10)

rec_np.fit(user_item_matrix_np)

# Temporary
similar_users_sp = rec_np._get_similar_users(user_id_encoded)

np.testing.assert_array_equal(
    similar_users_sp,
    user_encoder.transform(similar_users_pd.index.to_numpy()),
)