In [286]:
%%capture
%load_ext autoreload
%autoreload 1

In [287]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.preprocessing import LabelEncoder
from pipeliner.recommendations.transformer import (
    UserItemMatrixTransformer,
    UserItemMatrixTransformerPandas,
    SimilarityTransformerPandas,
    SimilarityTransformer,
)
from pipeliner.recommendations.recommender import UserBasedRecommenderPandas, UserBasedRecommender

In [288]:
# Read ids as strings and ratings as float64 so pandas does not infer other dtypes.
data_types = {"user_id": str, "item_id": str, "rating": np.float64}
# The path has no placeholders, so it is a plain string literal rather than an f-string.
user_item_ratings = pd.read_csv("../tests/test_data/user_item_ratings_toy.csv", dtype=data_types)
user_item_ratings.head(5)

Unnamed: 0,user_id,item_id,rating
0,U00001,I00001,1.0
1,U00001,I00002,0.5
2,U00001,I00003,0.5
3,U00002,I00002,1.0
4,U00002,I00003,0.5


In [289]:
# Turn the long-format ratings table into a user-item matrix using the pandas
# implementation of the transformer, then preview the first five rows.
user_item_matrix_transformer_pd = UserItemMatrixTransformerPandas()
user_item_matrix_pd = user_item_matrix_transformer_pd.transform(user_item_ratings)
user_item_matrix_pd.head(5)

item_id,I00001,I00002,I00003,I00004,I00005,I00006
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
U00001,1.0,0.5,0.5,0.0,0.0,0.0
U00002,0.0,1.0,0.5,0.5,0.0,0.0
U00003,0.0,0.0,1.0,0.5,0.5,0.0
U00004,0.0,0.0,0.0,1.0,0.5,0.5
U00005,0.5,0.0,0.0,0.0,1.0,0.5


In [290]:
# Derive the user-user similarity matrix from the user-item matrix with the
# pandas transformer, configured for user similarity with the cosine metric.
# NOTE(review): the exact effect of normalise=True is defined in the library - confirm there.
user_similarity_transformer_pd = SimilarityTransformerPandas(kind="user", metric="cosine", normalise=True)
user_similarity_matrix_pd = user_similarity_transformer_pd.transform(user_item_matrix_pd)
user_similarity_matrix_pd.head(5)

user_id,U00001,U00002,U00003,U00004,U00005,U00006
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
U00001,1.0,0.5,0.333333,0.0,0.333333,0.5
U00002,0.5,1.0,0.5,0.333333,0.0,0.333333
U00003,0.333333,0.5,1.0,0.5,0.333333,0.0
U00004,0.0,0.333333,0.5,1.0,0.5,0.333333
U00005,0.333333,0.0,0.333333,0.5,1.0,0.5


Calculate the recommendations manually in pandas.

In [291]:
# get the 5 most similar users
user_id = "U00003"

# Take this user's column of the similarity matrix, drop the self-similarity
# entry, rank the remaining users from most to least similar and keep only
# positive similarities.
# kind="stable" keeps tied users in their original index order, so the ranking
# is deterministic instead of depending on the sort algorithm's tie handling.
similar_users_pd = (
    user_similarity_matrix_pd[user_id]
    .drop(user_id, errors="ignore")
    .sort_values(ascending=False, kind="stable")
    .loc[lambda similarity: similarity > 0]
    .head(5)
)
similar_users_pd

user_id
U00002    0.500000
U00004    0.500000
U00001    0.333333
U00005    0.333333
Name: U00003, dtype: float32

In [292]:
# Items the target user has already rated, highest rating first.
single_user_matrix_pd = user_item_matrix_pd.loc[user_id]
has_rating = single_user_matrix_pd.gt(0)
user_rated_items_pd = single_user_matrix_pd.loc[has_rating].sort_values(ascending=False)
user_rated_items_pd.index.to_list()

['I00003', 'I00004', 'I00005']

In [293]:
# Collect the ratings given by the similar users, leave out items the target
# user has already rated, and rank what remains by rating.

# items as rows, one column per similar user
matrix = user_item_matrix_pd.T[similar_users_pd.head(5).index]

not_yet_rated = ~matrix.index.isin(user_rated_items_pd.index)
rated_by_a_similar_user = matrix.gt(0).any(axis="columns")

# score each candidate item by the highest rating any similar user gave it
user_recommendations = (
    matrix.loc[not_yet_rated & rated_by_a_similar_user]
    .max(axis=1)
    .sort_values(ascending=False)
)

# Sanity check: none of the recommended items may already be rated by the user.
already_rated = set(user_rated_items_pd.index.to_list())
recommended = set(user_recommendations.index.to_list())
assert already_rated.isdisjoint(recommended)

# keep the five best candidates
user_recs_1 = user_recommendations.head(5).index.to_list()
user_recs_1

['I00001', 'I00002', 'I00006']

In [294]:
# produce the same results using the library (Pandas implementation) - they should be the same

# NOTE(review): the meaning of the two positional 5s is defined by the
# constructor signature, which is not visible here - confirm against the library.
rec_pandas = UserBasedRecommenderPandas(5, 5)

# The pandas recommender is fitted on a (similarity matrix, user-item matrix) pair.
rec_pandas.fit((user_similarity_matrix_pd, user_item_matrix_pd))
user_recs_pd = rec_pandas.predict([user_id]).tolist()[0]

# Compare as sets: only membership is checked, not ranking order.
assert set(user_recs_1) == set(user_recs_pd)

# Display the library's output; previously this showed the manual list
# (user_recs_1) again, so the library result was never actually displayed.
user_recs_pd

['I00001', 'I00002', 'I00006']

NumPy/SciPy implementation

In [295]:
# Map the string user/item ids onto consecutive integer codes so the ratings
# can be handed to the NumPy/SciPy implementation as one numeric array.
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

# assign() returns a new frame, leaving user_item_ratings untouched
user_item_ratings_encoded = user_item_ratings.assign(
    user_id=user_encoder.fit_transform(user_item_ratings["user_id"]),
    item_id=item_encoder.fit_transform(user_item_ratings["item_id"]),
)

# All columns, including the integer id codes, are cast to float32 here.
user_item_ratings_np = user_item_ratings_encoded.to_numpy().astype(np.float32)

# Position i holds the original id for code i; used to label and decode results.
unique_users = pd.Series(user_encoder.classes_)
unique_items = pd.Series(item_encoder.classes_)

print(len(unique_users), len(unique_items))
user_item_ratings_encoded.head(3)

6 6


Unnamed: 0,user_id,item_id,rating
0,0,0,1.0
1,0,1,0.5
2,0,2,0.5


In [296]:
# Build the user-item matrix from the encoded ratings array using the
# NumPy/SciPy implementation of the transformer.
user_item_matrix_transformer_np = UserItemMatrixTransformer()
user_item_matrix_np = user_item_matrix_transformer_np.transform(user_item_ratings_np)

# The transformer is expected to return a SciPy sparse matrix, not a dense array.
assert isinstance(user_item_matrix_np, sp.sparse.spmatrix)

In [297]:
# Sense check: the densified sparse matrix, labelled with the decoded ids,
# should be identical to the matrix produced by the pandas transformer.
user_item_matrix_np_dense = user_item_matrix_np.toarray()
display(
    pd.DataFrame(user_item_matrix_np_dense, index=unique_users, columns=unique_items),
    user_item_matrix_pd,
)

np.testing.assert_array_equal(user_item_matrix_np_dense, user_item_matrix_pd.to_numpy())

Unnamed: 0,I00001,I00002,I00003,I00004,I00005,I00006
U00001,1.0,0.5,0.5,0.0,0.0,0.0
U00002,0.0,1.0,0.5,0.5,0.0,0.0
U00003,0.0,0.0,1.0,0.5,0.5,0.0
U00004,0.0,0.0,0.0,1.0,0.5,0.5
U00005,0.5,0.0,0.0,0.0,1.0,0.5
U00006,0.5,0.5,0.0,0.0,0.0,1.0


item_id,I00001,I00002,I00003,I00004,I00005,I00006
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
U00001,1.0,0.5,0.5,0.0,0.0,0.0
U00002,0.0,1.0,0.5,0.5,0.0,0.0
U00003,0.0,0.0,1.0,0.5,0.5,0.0
U00004,0.0,0.0,0.0,1.0,0.5,0.5
U00005,0.5,0.0,0.0,0.0,1.0,0.5
U00006,0.5,0.5,0.0,0.0,0.0,1.0


In [298]:
# Compute the user-user similarity matrix with the NumPy/SciPy transformer and
# cast the result to float32.
# NOTE(review): this transformer is built with default arguments, whereas the
# pandas one is given kind="user", metric="cosine", normalise=True - presumably
# the defaults are equivalent; confirm against the library.
user_similarity_transformer_np = SimilarityTransformer()
user_similarity_matrix_np = user_similarity_transformer_np.transform(user_item_matrix_np).astype(np.float32)

# The similarity matrix is expected to be a SciPy sparse matrix as well.
assert isinstance(user_similarity_matrix_np, sp.sparse.spmatrix)

In [299]:
# Sense check: both implementations should produce the same similarity matrix.
display(
    pd.DataFrame(
        user_similarity_matrix_np.toarray(), columns=unique_users, index=unique_users
    )
)
display(user_similarity_matrix_pd)

# Compare with an absolute tolerance rather than rounding and testing exact
# equality: two values that differ only by floating-point noise can still
# round to different 6th decimals when they sit next to a rounding boundary.
np.testing.assert_allclose(
    user_similarity_matrix_np.toarray(),
    user_similarity_matrix_pd.to_numpy(),
    rtol=0,
    atol=1e-6,
)


Unnamed: 0,U00001,U00002,U00003,U00004,U00005,U00006
U00001,1.0,0.5,0.333333,0.0,0.333333,0.5
U00002,0.5,1.0,0.5,0.333333,0.0,0.333333
U00003,0.333333,0.5,1.0,0.5,0.333333,0.0
U00004,0.0,0.333333,0.5,1.0,0.5,0.333333
U00005,0.333333,0.0,0.333333,0.5,1.0,0.5
U00006,0.5,0.333333,0.0,0.333333,0.5,1.0


user_id,U00001,U00002,U00003,U00004,U00005,U00006
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
U00001,1.0,0.5,0.333333,0.0,0.333333,0.5
U00002,0.5,1.0,0.5,0.333333,0.0,0.333333
U00003,0.333333,0.5,1.0,0.5,0.333333,0.0
U00004,0.0,0.333333,0.5,1.0,0.5,0.333333
U00005,0.333333,0.0,0.333333,0.5,1.0,0.5
U00006,0.5,0.333333,0.0,0.333333,0.5,1.0


In [302]:
# Manual NumPy/SciPy walk-through of the recommendation steps, cross-checked
# against the pandas results computed earlier in the notebook.

# --- 1. the (up to) 5 most similar users ---
user_id_encoded = user_encoder.transform([user_id])[0]
print(user_id_encoded, user_id)

# One row of the sparse similarity matrix, densified to shape (1, n_users).
single_user_similarity_matrix_np = user_similarity_matrix_np[user_id_encoded].toarray()
# Keep users with a positive similarity, excluding the target user themself.
user_mask = (single_user_similarity_matrix_np > 0) & (
    np.arange(single_user_similarity_matrix_np.size) != user_id_encoded
)
# Sort descending by negating the values; a stable sort keeps tied users in
# index order. (Negation is used instead of `1 - x`, which can merge values
# that are distinct but very close to each other.)
user_sorter = np.argsort(-single_user_similarity_matrix_np, kind="stable")
sorted_user_mask = user_mask[0, user_sorter]
similar_users_np = user_sorter[sorted_user_mask][:5]

print("similar users", similar_users_np, user_encoder.inverse_transform(similar_users_np))

np.testing.assert_array_equal(
    user_encoder.inverse_transform(similar_users_np),
    similar_users_pd.index.to_numpy(),
)

# --- 2. the items the user has already rated ---
single_user_matrix_np = user_item_matrix_np[user_id_encoded]  # 1 x n_items sparse row

# nonzero() returns (row_indices, col_indices); the columns are the item codes
user_rated_items_np = np.nonzero(single_user_matrix_np > 0)[1]
user_rated_items_decoded = item_encoder.inverse_transform(user_rated_items_np)

# The pandas result is ordered by rating while this one is ordered by item
# code, so compare the sorted ids rather than relying on the orders coinciding.
np.testing.assert_array_equal(
    np.sort(user_rated_items_decoded),
    np.sort(user_rated_items_pd.index.to_numpy()),
    verbose=True,
)

print("user's rated items", user_rated_items_np, user_rated_items_decoded)

# --- 3. ratings by the similar users for items the user has not rated ---

# pandas view of the same slice, for visual comparison
display(user_item_matrix_pd.iloc[similar_users_np, ~user_item_matrix_pd.columns.isin(user_rated_items_pd.index)])

similar_users_user_item_matrix_np = user_item_matrix_np[similar_users_np]
user_indices = np.arange(user_item_matrix_np.shape[0])
item_indices = np.arange(user_item_matrix_np.shape[1])

unrated_item_mask = ~np.isin(item_indices, user_rated_items_np)

print("user_indices", user_indices, user_mask[0], np.sort(similar_users_np))
print("item_indices", item_indices, unrated_item_mask, user_rated_items_np)

# Average over the selected top-5 similar users only. Selecting rows with
# user_mask would include every user with a positive similarity, i.e. it would
# ignore the top-5 cut-off as soon as more than five users qualify.
item_mean_ratings = (
    similar_users_user_item_matrix_np[:, unrated_item_mask].toarray().mean(axis=0)
)
item_sorter = np.argsort(-item_mean_ratings, kind="stable")

print("item_mean_ratings", item_mean_ratings)
print("item_sorter", item_sorter)
print("sorted items", item_mean_ratings[item_sorter])

# item_sorter indexes into the unrated subset, so map it back to item codes,
# and drop items that none of the similar users rated.
ranked_scores = item_mean_ratings[item_sorter]
ranked_items_np = item_indices[unrated_item_mask][item_sorter][ranked_scores > 0]

# double check the user hasn't rated the items
assert not np.isin(ranked_items_np, user_rated_items_np).any()

# get the top 5
user_recs_np_manual = item_encoder.inverse_transform(ranked_items_np[:5]).tolist()

# Items are scored by mean rating here but by max rating in the pandas
# walk-through, so only set membership is compared, not the ranking.
assert set(user_recs_np_manual) == set(user_recs_1)

user_recs_np_manual


2 U00003
similar users [1 3 0 4] ['U00002' 'U00004' 'U00001' 'U00005']
user's rated items [2 3 4] ['I00003' 'I00004' 'I00005']


item_id,I00001,I00002,I00006
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
U00002,0.0,1.0,0.0
U00004,0.0,0.0,0.5
U00001,1.0,0.5,0.0
U00005,0.5,0.0,0.5


user_indices [0 1 2 3 4 5] [ True  True False  True  True False] [0 1 3 4]
item_indices [0 1 2 3 4 5] [ True  True False False False  True] [2 3 4]
item_mean_ratings [0.375 0.375 0.25 ]
item_sorter [0 1 2]
sorted items [0.375 0.375 0.25 ]


In [301]:
# produce the same results using the library (np implementation) - they should be the same

rec_np = UserBasedRecommender(5, 5)

# NOTE(review): only the user-item matrix is passed here, whereas the pandas
# recommender is fitted on a (similarity matrix, user-item matrix) pair - confirm
# this is the intended input for the NumPy implementation.
rec_np.fit(user_item_matrix_np)
# TODO: the prediction and the cross-check against the manual result are
# currently disabled; re-enable them once predict() is usable here.
# user_recs_np = rec_np.predict([user_id]).tolist()[0]

# assert set(user_recs_1) == set(user_recs_np)

# user_recs_np