In [1]:
%%capture
%load_ext autoreload
%autoreload 1

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read both id columns as strings and the rating column as a 64-bit float.
data_types = {"user_id": str, "item_id": str, "rating": np.float64}

# Plain string literal: the path contains no placeholders, so the f-prefix
# the original used was unnecessary.
user_item_ratings = pd.read_csv(
    "../tests/test_data/test_user_item_ratings.csv",
    dtype=data_types,
)
user_item_ratings.head(5)

Unnamed: 0,user_id,item_id,rating
0,U1000,I1063,0.36
1,U1001,I1062,0.79
2,U1001,I1099,0.43
3,U1002,I1043,0.64
4,U1002,I1013,0.43


In [4]:
from sklearn.pipeline import Pipeline

from pipeliner.recommendations.transformer import SimilarityTransformer, UserItemMatrixTransformer

# Two-step pipeline: the ratings frame goes through UserItemMatrixTransformer
# and its result is handed to SimilarityTransformer, configured here with
# kind="user", metric="cosine" and normalise=True.
user_item_step = ("user_item", UserItemMatrixTransformer())
similarity_step = (
    "similarity",
    SimilarityTransformer(kind="user", metric="cosine", normalise=True),
)
transformer = Pipeline(steps=[user_item_step, similarity_step])

# NOTE(review): transform() is called on a Pipeline that was never fitted.
# Recent scikit-learn releases appear to warn about this; confirm whether the
# project transformers implement fit() and, if so, consider fit_transform().
similarity_matrix = transformer.transform(user_item_ratings)
similarity_matrix.head(5)

user_id,U1000,U1001,U1002,U1003,U1004,U1005,U1006,U1007,U1008,U1009
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
U1000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U1001,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U1002,0.0,0.0,1.0,0.0,0.0,0.14403,0.0,0.0,0.0,0.204233
U1003,0.0,0.0,0.0,1.0,0.0,0.100523,0.0,0.0,0.0,0.0
U1004,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.292944,0.0


In [5]:
# Target user and the minimum similarity score for another user to count as similar.
user_id = "U1002"
user_similarity_threshold = 0.1

# Take the target user's column of the similarity matrix, keep scores strictly
# above the threshold, remove the user's own entry if present, and order the
# remaining users from most to least similar.
similar_users = (
    similarity_matrix[user_id]
    .loc[lambda scores: scores > user_similarity_threshold]
    .drop(user_id, errors="ignore")
    .sort_values(ascending=False)
)
similar_users.head(5)

user_id
U1009    0.204233
U1005    0.144030
Name: U1002, dtype: float64

In [6]:
# Distinct item ids that the target user has a rating row for.
user_items = user_item_ratings.loc[
    user_item_ratings["user_id"] == user_id, "item_id"
].drop_duplicates()
user_items.head(5)

3    I1043
4    I1013
5    I1051
Name: item_id, dtype: object

In [7]:
# Rows rated by one of the similar users ...
rated_by_similar_user = user_item_ratings["user_id"].isin(similar_users.index)
# ... for items the target user already has a rating row for.
already_rated_by_user = user_item_ratings["item_id"].isin(user_items)

# Keep similar users' ratings of items new to the target user, order them by
# rating (highest first) and keep only the top-rated row per item.
similar_users_items = (
    user_item_ratings[rated_by_similar_user & ~already_rated_by_user]
    .sort_values(by="rating", ascending=False)
    .drop_duplicates(subset="item_id", keep="first")
    [["item_id", "rating"]]
)
similar_users_items

Unnamed: 0,item_id,rating
45,I1072,1.0
16,I1029,0.93
53,I1075,0.79
54,I1017,0.79
19,I1057,0.64
46,I1071,0.64
52,I1076,0.64
17,I1077,0.57
50,I1037,0.5
15,I1069,0.43


In [8]:
# Sanity check: none of the candidate items should already be rated by the
# target user, so this selection is expected to be an empty DataFrame.
similar_users_items.loc[similar_users_items["item_id"].isin(user_items)]

Unnamed: 0,item_id,rating


In [9]:
# Final recommendations: the item ids of the first five candidate rows
# (candidates are already ordered by rating, highest first).
# to_numpy() is used instead of .values because it always returns a NumPy
# array, whereas .values may return an extension array for some dtypes.
user_cf_recommendations = similar_users_items["item_id"].head(5).to_numpy()
user_cf_recommendations

array(['I1072', 'I1029', 'I1075', 'I1017', 'I1057'], dtype=object)