In [24]:
%%capture
%load_ext autoreload
%autoreload 1

In [None]:
import pandas as pd
import numpy as np
from pipeliner.recommendations.transformer import (
    SimilarityTransformerPandas,
)
from pipeliner.recommendations.recommender import ItemBasedRecommenderPandas

In [None]:
# Column dtypes for the ratings file: both id columns are read as strings,
# the rating column as 64-bit floats.
data_types = dict(user_id=str, item_id=str, rating=np.float64)
user_item_ratings = pd.read_csv(
    "../../tests/test_data/user_item_ratings_toy.csv",
    dtype=data_types,
)
user_item_ratings.head(5)

Unnamed: 0,user_id,item_id,rating
0,U00001,I00001,1.0
1,U00001,I00002,0.5
2,U00001,I00003,0.5
3,U00002,I00002,1.0
4,U00002,I00003,0.5


In [None]:
# Pivot the long-format ratings into a wide matrix: one row per user, one
# column per item. Ratings sharing a (user, item) pair are summed, and pairs
# with no rating are filled with 0.0.
user_item_matrix = (
    user_item_ratings.groupby(["user_id", "item_id"])["rating"]
    .sum()
    .unstack(level="item_id")
    .fillna(0.0)
)
user_item_matrix.head(5)

item_id,I00001,I00002,I00003,I00004,I00005,I00006
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
U00001,1.0,0.5,0.5,0.0,0.0,0.0
U00002,0.0,1.0,0.5,0.5,0.0,0.0
U00003,0.0,0.0,1.0,0.5,0.5,0.0
U00004,0.0,0.0,0.0,1.0,0.5,0.5
U00005,0.5,0.0,0.0,0.0,1.0,0.5


In [28]:
# Configure the transformer for item-level cosine similarity and apply it to
# the user-item matrix built above.
similarity_matrix_transformer = SimilarityTransformerPandas(
    kind="item",
    metric="cosine",
    normalise=True,
)
similarity_matrix = similarity_matrix_transformer.transform(user_item_matrix)
similarity_matrix.head(5)

item_id,I00001,I00002,I00003,I00004,I00005,I00006
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
I00001,1.0,0.5,0.333333,0.0,0.333333,0.5
I00002,0.5,1.0,0.5,0.333333,0.0,0.333333
I00003,0.333333,0.5,1.0,0.5,0.333333,0.0
I00004,0.0,0.333333,0.5,1.0,0.5,0.333333
I00005,0.333333,0.0,0.333333,0.5,1.0,0.5


Generate recommendations based on item similarity

In [29]:
item_id = "I00003"

# Rank every other item by its similarity to `item_id`, most similar first.
# The default sort algorithm is not guaranteed to be stable, so items with
# tied similarity scores could come back in a different order between
# environments; mergesort keeps ties in their original matrix order.
item_recommendations = (
    similarity_matrix[item_id]
    .drop(item_id, errors="ignore")
    .sort_values(ascending=False, kind="mergesort")
)
# Sanity check: the query item must not appear among its own recommendations.
assert item_id not in item_recommendations.index, (
    f"{item_id} should have been removed from its own recommendations"
)

# Keep the five highest-ranked item ids.
item_recs_1 = item_recommendations.head(5).index.to_list()
item_recs_1

['I00002', 'I00004', 'I00001', 'I00005', 'I00006']

The library recommender should return the same results as the manual computation above.

In [30]:
# Fit the library recommender on the same similarity matrix and query it for
# the same item. (The argument 5 is presumably the number of recommendations
# to return - confirm against the recommender's signature.)
rec = ItemBasedRecommenderPandas(5)
rec.fit(similarity_matrix)

# predict() takes a batch of queries; take the result for the single query.
item_recs_2 = rec.predict([item_id]).tolist()[0]
print(item_recs_2)

# Compare as sets: only membership is checked, not ranking order.
assert set(item_recs_2) == set(item_recs_1)

['I00002', 'I00004', 'I00001', 'I00005', 'I00006']


Generate recommendations based on item similarity, excluding items the user has already rated

In [31]:
user_id = "U00003"

# Items this user has rated: the strictly positive entries of their row in
# the user-item matrix, highest rating first. A stable sort keeps tied
# ratings in column order so the printed list is reproducible.
single_user_matrix = user_item_matrix.loc[user_id]
user_rated_items = single_user_matrix[single_user_matrix > 0].sort_values(
    ascending=False, kind="mergesort"
)
print("user_rated_items", user_rated_items.index.to_list())

# Rank items by similarity to `item_id`, leaving out the query item itself
# and everything the user has already rated. errors="ignore" tolerates the
# query item appearing twice in the drop list when the user has rated it.
user_item_recommendations = (
    similarity_matrix[item_id]
    .drop([item_id] + user_rated_items.index.to_list(), errors="ignore")
    .sort_values(ascending=False, kind="mergesort")
)

# Sanity check: none of the user's rated items may be recommended.
assert set(user_rated_items.index).isdisjoint(
    user_item_recommendations.index
), "recommendations must not contain items the user has already rated"

# Keep at most the five highest-ranked item ids.
user_item_recs_1 = user_item_recommendations.head(5).index.to_list()
user_item_recs_1

user_rated_items ['I00003', 'I00004', 'I00005']


['I00002', 'I00001', 'I00006']

In [32]:
# Fit a second recommender on both matrices, then query it with an
# (item, user) pair instead of a bare item id.
rec2 = ItemBasedRecommenderPandas(5)
rec2.fit((similarity_matrix, user_item_matrix))

# predict() takes a batch of queries; take the result for the single query.
user_item_recs_2 = rec2.predict([(item_id, user_id)]).tolist()[0]

# Compare as sets: only membership is checked, not ranking order.
assert set(user_item_recs_2) == set(user_item_recs_1)

user_item_recs_2

['I00002', 'I00001', 'I00006']