In [168]:
%%capture
%load_ext autoreload
%autoreload 1

In [169]:
import pandas as pd
import numpy as np

In [170]:
# Pin column dtypes up front: ids as strings, ratings as float64.
data_types = {"user_id": str, "item_id": str, "rating": np.float64}

# Plain string literal: the path has nothing to interpolate, so no f-prefix.
user_item_ratings = pd.read_csv(
    "../tests/test_data/test_user_item_ratings.csv",
    dtype=data_types,
)
user_item_ratings.head(5)

Unnamed: 0,user_id,item_id,rating
0,U1000,I1063,0.36
1,U1001,I1062,0.79
2,U1001,I1099,0.43
3,U1002,I1043,0.64
4,U1002,I1013,0.43


In [171]:
from sklearn.pipeline import Pipeline

from pipeliner.recommendations.transformer import (
    UserItemMatrixTransformer,
    SimilarityTransformer,
)

# Two named stages: build the user-item matrix, then derive item-to-item
# similarities from it (cosine metric, normalise=True).
# NOTE(review): transform() is called without a prior fit(); presumably both
# steps are stateless — confirm against the transformer implementations.
pipeline_steps = [
    ("user_item", UserItemMatrixTransformer()),
    (
        "similarity",
        SimilarityTransformer(kind="item", metric="cosine", normalise=True),
    ),
]
transformer = Pipeline(steps=pipeline_steps)

similarity_matrix = transformer.transform(user_item_ratings)
similarity_matrix.head()

item_id,I1003,I1006,I1010,I1013,I1016,I1017,I1019,I1020,I1021,I1025,...,I1074,I1075,I1076,I1077,I1082,I1091,I1093,I1097,I1098,I1099
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
I1003,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.762141,0.647411,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.762141,0.0,0.0
I1006,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
I1010,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I1013,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.758185,0.0,0.0,0.0,0.0,0.0,0.0
I1016,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [172]:
user_id = "U1002"

# Select this user's ratings; the user_id column is dropped because it is
# constant once the rows have been filtered.
is_target_user = user_item_ratings["user_id"].eq(user_id)
user_ratings = user_item_ratings.loc[is_target_user].drop(columns="user_id")

# Highest-rated first, keeping at most five rows.
user_favourite_items = user_ratings.sort_values(by="rating", ascending=False).head(5)
user_favourite_items

Unnamed: 0,item_id,rating
3,I1043,0.64
5,I1051,0.64
4,I1013,0.43


In [173]:
# Minimum similarity to at least one favourite for an item to be a candidate.
# Without this cut-off, items with 0.0 similarity to every favourite would
# still be ranked and could end up in the recommendations.
threshold = 0.1

favourites = user_favourite_items["item_id"].values

# Rows: every item that is not itself a favourite.
# Columns: the user's favourites.
# One .loc call selects both axes instead of chaining [columns][row_mask].
is_candidate = ~similarity_matrix.index.isin(favourites)
best_match = similarity_matrix.loc[is_candidate, favourites].max(axis=1)

# Sort first, then filter: boolean filtering preserves order, so the ranking
# of the surviving items is exactly what the unfiltered sort produced.
ranked = best_match.sort_values(ascending=False)
similar_items = ranked[ranked >= threshold]

similar_items.head(10)

item_id
I1077    0.758185
I1029    0.758185
I1057    0.720395
I1074    0.707107
I1066    0.707107
I1017    0.707107
I1071    0.707107
I1076    0.707107
I1075    0.707107
I1037    0.700277
dtype: float64

In [174]:
# Sanity check: none of the candidate items should already carry a rating
# from this user, so the frame displayed below is expected to be empty.
rated_by_user = user_item_ratings["user_id"] == user_id
is_candidate_item = user_item_ratings["item_id"].isin(similar_items.index)
user_item_ratings[rated_by_user & is_candidate_item]

Unnamed: 0,user_id,item_id,rating


In [177]:
# Take the item ids of the first ten entries of similar_items as this
# user's recommendations.
user_cf_recommendations = similar_items.index[:10]
user_cf_recommendations

Index(['I1077', 'I1029', 'I1057', 'I1074', 'I1066', 'I1017', 'I1071', 'I1076',
       'I1075', 'I1037'],
      dtype='object', name='item_id')