In [29]:
%%capture
%load_ext autoreload
%autoreload 1

In [30]:
import pandas as pd
import numpy as np
from pipeliner.recommendations.transformer import (
    UserItemMatrixTransformer,
    SimilarityTransformer,
)
from pipeliner.recommendations.recommender import UserBasedRecommender

In [31]:
# Pin the column dtypes up front: ids are read as strings and ratings as
# float64 instead of letting pandas infer them.
data_types = {"user_id": str, "item_id": str, "rating": np.float64}

# The path is relative, so it resolves against the kernel's working directory.
user_item_ratings = pd.read_csv(
    "../tests/test_data/user_item_ratings.csv",
    dtype=data_types,
)
user_item_ratings.head(5)

Unnamed: 0,user_id,item_id,rating
0,U1000,I1063,0.36
1,U1001,I1062,0.79
2,U1001,I1099,0.43
3,U1002,I1043,0.64
4,U1002,I1013,0.43


In [32]:
# Turn the long-format ratings table into a user-by-item matrix. The preview
# of the result shows user_id as the row index, item_id as the columns, and
# 0.0 in most cells — presumably the fill value for unrated user/item pairs;
# confirm in UserItemMatrixTransformer.
user_item_matrix_transformer = UserItemMatrixTransformer()
user_item_matrix = user_item_matrix_transformer.transform(user_item_ratings)
user_item_matrix.head(5)

item_id,I1003,I1006,I1010,I1013,I1016,I1017,I1019,I1020,I1021,I1025,...,I1074,I1075,I1076,I1077,I1082,I1091,I1093,I1097,I1098,I1099
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U1000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U1001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.43
U1002,0.0,0.0,0.0,0.43,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U1003,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.43,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U1004,0.79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.71,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# Derive a user-by-user similarity matrix from the user-item matrix, with the
# transformer configured for the cosine metric and normalisation enabled.
similarity_transformer = SimilarityTransformer(
    kind="user",
    metric="cosine",
    normalise=True,
)
similarity_matrix = similarity_transformer.transform(user_item_matrix)
similarity_matrix.head(5)

user_id,U1000,U1001,U1002,U1003,U1004,U1005,U1006,U1007,U1008,U1009
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
U1000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U1001,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U1002,0.0,0.0,1.0,0.0,0.0,0.14403,0.0,0.0,0.0,0.204233
U1003,0.0,0.0,0.0,1.0,0.0,0.100523,0.0,0.0,0.0,0.0
U1004,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.292944,0.0


In [34]:
user_id = "U1002"
user_similarity_threshold = 0.1

# Start from the target user's similarity column, keep only scores strictly
# above the threshold, remove the user's own entry if it survived the filter,
# and rank the remaining users from most to least similar.
similar_users = (
    similarity_matrix[user_id]
    .loc[lambda scores: scores > user_similarity_threshold]
    .drop(user_id, errors="ignore")
    .sort_values(ascending=False)
)
similar_users.head(5)

user_id
U1009    0.204233
U1005    0.144030
Name: U1002, dtype: float64

In [35]:
# Pull out the target user's row of the user-item matrix (one value per item).
single_user_matrix = user_item_matrix.loc[user_id]

# Keep only the items with a strictly positive rating, highest rating first.
user_rated_items = (
    single_user_matrix
    .loc[single_user_matrix.gt(0)]
    .sort_values(ascending=False)
)
user_rated_items.index.tolist()

['I1043', 'I1051', 'I1013']

In [36]:
# Transpose so items are rows, then keep only the columns belonging to the
# five most similar users (similar_users is already sorted descending).
matrix = user_item_matrix.T[similar_users.head(5).index]

# Candidate items: rated (> 0) by at least one similar user and not already
# rated by the target user. Each candidate is scored by the highest rating
# any similar user gave it, and candidates are ranked by that score.
user_recommendations = (
    matrix.loc[
        (matrix > 0).any(axis=1) & ~matrix.index.isin(user_rated_items.index)
    ]
    .max(axis="columns")
    .sort_values(ascending=False)
)

# Sanity check: the recommended items and the items the user has already
# rated must not overlap.
assert set(user_rated_items.index.to_list()).isdisjoint(
    user_recommendations.index.to_list()
)

# Top five recommended item ids, best first.
user_recs_1 = user_recommendations.index[:5].to_list()
user_recs_1

['I1072', 'I1029', 'I1017', 'I1075', 'I1057']

In [37]:
# Cross-check: the packaged recommender is expected to reproduce the manually
# computed recommendations (user_recs_1).
# NOTE(review): the positional arguments are opaque from here — presumably the
# two 5s mirror the head(5) cut-offs used in the manual steps and 0.1 mirrors
# user_similarity_threshold; confirm against the UserBasedRecommender
# signature and consider passing them by keyword.
rec = UserBasedRecommender(5, 5, 0.1)

# fit() receives the similarity matrix and the user-item matrix as one tuple.
rec.fit((similarity_matrix, user_item_matrix))
# predict() is called with a one-element list of user ids; element 0 of the
# converted result is taken as this user's recommendations.
user_recs_2 = rec.predict([user_id]).tolist()[0]

# Compared as sets, so the ordering of the two lists is not checked.
assert set(user_recs_1) == set(user_recs_2)

# NOTE(review): this displays the manual list (user_recs_1), not the
# recommender's own output (user_recs_2) — confirm that is intended.
user_recs_1

['I1072', 'I1029', 'I1017', 'I1075', 'I1057']