In [61]:
%%capture
%load_ext autoreload
%autoreload 1

In [62]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from pipeliner.recommendations.recommender import ItemBasedRecommenderPandas

In [63]:
# Pin the column dtypes up front: both id columns are read as strings and the
# rating column as a 64-bit float.
data_types = dict(user_id=str, item_id=str, rating=np.float64)
user_item_ratings = pd.read_csv(
    "../../tests/test_data/user_item_ratings_toy.csv",
    dtype=data_types,
)
# Preview the first five rows.
user_item_ratings.head()

Unnamed: 0,user_id,item_id,rating
0,U00001,I00001,1.0
1,U00001,I00002,0.5
2,U00001,I00003,0.5
3,U00001,I00004,0.25
4,U00002,I00002,1.0


In [64]:
# Total rating per (user_id, item_id) pair; repeated pairs are added together.
summed_ratings = user_item_ratings.groupby(["user_id", "item_id"])["rating"].sum()

# Move item_id into the columns (one row per user) and fill the pairs that
# have no rating with 0.0.
user_item_matrix = summed_ratings.unstack().fillna(0.0)
user_item_matrix.head()

item_id,I00001,I00002,I00003,I00004,I00005,I00006,I00007,I00008,I00009,I00010,I00011,I00012
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
U00001,1.0,0.5,0.5,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U00002,0.0,1.0,0.5,0.5,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U00003,0.0,0.0,1.0,0.5,0.5,0.25,0.0,0.0,0.0,0.0,0.0,0.0
U00004,0.0,0.0,0.0,1.0,0.5,0.5,0.25,0.0,0.0,0.0,0.0,0.0
U00005,0.0,0.0,0.0,0.0,1.0,0.5,0.5,0.25,0.0,0.0,0.0,0.0


In [65]:
# Item-to-item cosine similarity: transpose so that each row holds one item's
# ratings across users, then compare every pair of rows.
matrix = user_item_matrix.T
df = pd.DataFrame(
    cosine_similarity(matrix),
    index=matrix.index,
    columns=matrix.index,
)

# Min-max scale each column, then round to 6 decimals and store as float32.
# The parentheses around the quotient matter: without them
# `.round(6).astype(np.float32)` binds only to the denominator
# `(df.max() - df.min())` and the scaled matrix is left unrounded float64.
col_min = df.min()
col_range = df.max() - col_min
similarity_matrix = ((df - col_min) / col_range).round(6).astype(np.float32)

similarity_matrix.head(5)

item_id,I00001,I00002,I00003,I00004,I00005,I00006,I00007,I00008,I00009,I00010,I00011,I00012
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
I00001,1.0,0.56,0.4,0.16,0.0,0.0,0.0,0.0,0.0,0.16,0.4,0.56
I00002,0.56,1.0,0.56,0.4,0.16,0.0,0.0,0.0,0.0,0.0,0.16,0.4
I00003,0.4,0.56,1.0,0.56,0.4,0.16,0.0,0.0,0.0,0.0,0.0,0.16
I00004,0.16,0.4,0.56,1.0,0.56,0.4,0.16,0.0,0.0,0.0,0.0,0.0
I00005,0.0,0.16,0.4,0.56,1.0,0.56,0.4,0.16,0.0,0.0,0.0,0.0


Generate recommendations based on item similarity

In [66]:
item_id = "I00003"

# Similarity of every item to the query item, highest score first.
# The query item's own entry is dropped so it cannot be recommended back
# (errors="ignore" makes the drop a no-op if the label is absent), and the
# stable sort keeps tied scores in their existing index order.
scores_for_item = similarity_matrix[item_id]
scores_without_self = scores_for_item.drop(item_id, errors="ignore")
item_recommendations = scores_without_self.sort_values(ascending=False, kind="stable")

# Sanity check: the query item must not appear among its own recommendations.
assert item_id not in item_recommendations.index

# Keep the ids of the five most similar items.
item_recs_1 = item_recommendations.index[:5].to_list()
item_recs_1

['I00002', 'I00004', 'I00001', 'I00005', 'I00006']

The results should be the same as those produced by the library.

In [67]:
# Build the library recommender. The argument 5 is presumably the number of
# recommendations returned per item (it matches the manual top-5) — TODO confirm.
rec = ItemBasedRecommenderPandas(5)

# Fit on the precomputed item-item similarity matrix.
rec.fit(similarity_matrix)
# predict is called with a one-element list of item ids; [0] presumably selects
# the recommendations for that single id — verify against the library.
item_recs_2 = rec.predict([item_id])[0]
print(item_recs_2)

# Compared as sets: both approaches must return the same items, but their
# ordering is not checked here.
assert set(item_recs_1) == set(item_recs_2)

['I00002' 'I00004' 'I00001' 'I00005' 'I00006']
