In [1]:
%%capture
%load_ext autoreload
%autoreload 1

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from recsys_pipeliner.recommendations.transformer import (
    SimilarityTransformer,
    UserItemMatrixTransformer,
)
from recsys_pipeliner.recommendations.recommender import SimilarityRecommender, ItemBasedRecommender

In [3]:
# Read ids as strings and ratings as 64-bit floats so pandas does not infer dtypes.
data_types = dict(user_id=str, item_id=str, rating=np.float64)
user_item_ratings = pd.read_csv(
    "../../tests/test_data/user_item_ratings_toy.csv",
    dtype=data_types,
)
user_item_ratings.head(5)

Unnamed: 0,user_id,item_id,rating
0,U00001,I00001,0.8
1,U00001,I00002,0.4
2,U00001,I00003,0.6
3,U00001,I00004,0.2
4,U00001,I00005,1.0


In [4]:
# Pivot the long ratings table into a user x item table.
# Ratings sharing the same (user_id, item_id) pair are summed, and pairs that
# never occur are filled with 0.0.
rating_totals = user_item_ratings.groupby(["user_id", "item_id"])["rating"].sum()
user_item_matrix = rating_totals.unstack().fillna(0.0)
user_item_matrix.head(5)

item_id,I00001,I00002,I00003,I00004,I00005,I00006,I00007,I00008,I00009,I00010,...,I00015,I00016,I00017,I00018,I00019,I00020,I00021,I00022,I00023,I00024
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U00001,0.8,0.4,0.6,0.2,1.0,0.8,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U00002,0.0,0.0,0.81,0.41,0.61,0.21,1.0,0.81,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U00003,0.0,0.0,0.0,0.0,0.82,0.42,0.62,0.22,1.0,0.82,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U00004,0.0,0.0,0.0,0.0,0.0,0.0,0.83,0.43,0.63,0.23,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U00005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.84,0.44,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Item-item cosine similarity: transpose so that each row is one item's vector
# of user ratings, then label both axes of the result with the item ids.
matrix = user_item_matrix.T
df = pd.DataFrame(
    cosine_similarity(matrix),
    index=matrix.index,
    columns=matrix.index,
)

# Min-max scale every column, then round to 6 decimals and store as float32.
# The outer parentheses are required: a method call binds tighter than "/", so
# without them .round(6).astype(np.float32) would be applied to the denominator
# (df.max() - df.min()) only and the quotient would stay unrounded float64.
# NOTE(review): a column whose values are all identical has a zero range and
# would produce NaN here; not expected with this toy data - confirm for real data.
column_min = df.min()
column_range = df.max() - column_min
similarity_matrix = ((df - column_min) / column_range).round(6).astype(np.float32)

similarity_matrix.head(5)

item_id,I00001,I00002,I00003,I00004,I00005,I00006,I00007,I00008,I00009,I00010,...,I00015,I00016,I00017,I00018,I00019,I00020,I00021,I00022,I00023,I00024
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
I00001,1.0,0.952514,0.572359,0.54081,0.382097,0.471175,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.406511,0.324324,0.603794,0.423087
I00002,0.952514,1.0,0.375152,0.344515,0.270936,0.334099,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.518846,0.413948,0.5802,0.387949
I00003,0.572359,0.375152,1.0,0.942391,0.538899,0.49357,0.396185,0.489965,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.420939,0.33608
I00004,0.54081,0.344515,0.942391,1.0,0.309239,0.260624,0.279725,0.345938,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.534312,0.426598
I00005,0.382097,0.270936,0.538899,0.309239,1.0,0.959376,0.543216,0.500196,0.395505,0.490569,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Generate recommendations based on item similarity

In [6]:
item_id = "I00003"

# Take the similarity column of the query item and remove the item's own entry
# (errors="ignore" keeps this a no-op if the label is absent).
item_recommendations = similarity_matrix[item_id].drop(item_id, errors="ignore")
# Most similar first; a stable sort keeps tied scores in their original order.
item_recommendations = item_recommendations.sort_values(
    ascending=False, kind="stable"
)

# Sanity check: the query item must not be recommended for itself.
assert item_id not in item_recommendations.index

# Labels of the five highest-scoring items.
item_recs_1 = item_recommendations.index[:5].to_list()
item_recs_1

['I00004', 'I00001', 'I00005', 'I00006', 'I00008']

The results should be the same as those produced by the library's `SimilarityRecommender`.

In [7]:
# Work on an independent (deep) copy so the original string-id frame stays untouched.
user_item_ratings_np = user_item_ratings.copy(deep=True)

In [8]:
# Replace the string item ids and user ids with integer labels.
# A separate encoder is kept for each column so labels can be mapped back later.
item_encoder = LabelEncoder()
user_encoder = LabelEncoder()

user_item_ratings_np = user_item_ratings_np.assign(
    item_id=item_encoder.fit_transform(user_item_ratings_np["item_id"]),
    user_id=user_encoder.fit_transform(user_item_ratings_np["user_id"]),
)

user_item_ratings_np.head(3)


Unnamed: 0,user_id,item_id,rating
0,0,0,0.8
1,0,1,0.4
2,0,2,0.6


In [9]:
# Build the user-item matrix with the library transformer from the encoded
# ratings frame converted to a plain array.
user_item_matrix_transformer = UserItemMatrixTransformer()
user_item_matrix_np = user_item_matrix_transformer.transform(
    user_item_ratings_np.to_numpy()
)

# Read every input rating back from the matrix at [user, item] and require an
# exact match. Ratings are compared as float32.
# NOTE(review): presumably the transformer stores float32 values - confirm.
users = user_item_ratings_np["user_id"].to_numpy(dtype=int)
items = user_item_ratings_np["item_id"].to_numpy(dtype=int)
ratings = user_item_ratings_np["rating"].to_numpy(dtype=np.float32)
assert all(
    user_item_matrix_np[row, col] == expected
    for row, col, expected in zip(users, items, ratings)
)

user_item_matrix_np.shape

(12, 24)

In [10]:
# Item-item similarity via the library transformer.
# The user-item matrix is indexed [user, item], so its transpose has one row per item.
similarity_matrix_transformer = SimilarityTransformer()
item_user_matrix_np = user_item_matrix_np.T
item_similarity_matrix_np = similarity_matrix_transformer.transform(item_user_matrix_np)

item_similarity_matrix_np.shape


(24, 24)

In [11]:
# Encoded label of the query item, as a one-element array.
item_idx = item_encoder.transform([item_id])

# Fit the library recommender on the item-item similarity matrix.
# NOTE(review): the argument 5 is presumably the number of recommendations - confirm.
similarity_recommender = SimilarityRecommender(5)
similarity_recommender.fit(item_similarity_matrix_np)
recommendations = similarity_recommender.recommend(item_idx)

# Map each returned row of encoded labels back to item ids; only one item was
# queried, so the first row is the one of interest.
decoded_recs = [
    item_encoder.inverse_transform(row).tolist() for row in recommendations
]
item_recs_2 = decoded_recs[0]

# The query item must be excluded, and the list must match the pandas result.
assert item_id not in item_recs_2
np.testing.assert_array_equal(item_recs_1, item_recs_2)

item_recs_2

['I00004', 'I00001', 'I00005', 'I00006', 'I00008']

The results should be the same as those produced by the library's `ItemBasedRecommender`.

In [12]:
# Fit the item-based recommender directly on the user-item matrix.
# NOTE(review): the argument 5 is presumably the number of recommendations - confirm.
item_based_recommender = ItemBasedRecommender(5)
item_based_recommender.fit(user_item_matrix_np)
recommendations = item_based_recommender.recommend(item_idx)

# Map each returned row of encoded labels back to item ids; only one item was
# queried, so the first row is the one of interest.
decoded_item_based_recs = [
    item_encoder.inverse_transform(row).tolist() for row in recommendations
]
item_recs_3 = decoded_item_based_recs[0]

# The query item must be excluded, and the list must match the pandas result.
assert item_id not in item_recs_3
np.testing.assert_array_equal(item_recs_1, item_recs_3)

item_recs_3


['I00004', 'I00001', 'I00005', 'I00006', 'I00008']