In [1]:
%%capture
%load_ext autoreload
%autoreload 1

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from pipeliner.recommendations.transformer import (
    SimilarityTransformer,
    UserItemMatrixTransformer,
)
from pipeliner.recommendations.recommender import SimilarityRecommender

In [3]:
# Read the toy ratings file with explicit dtypes: ids stay strings and
# ratings are parsed as 64-bit floats.
data_types = dict(user_id=str, item_id=str, rating=np.float64)
user_item_ratings = pd.read_csv(
    "../../tests/test_data/user_item_ratings_toy.csv",
    dtype=data_types,
)
user_item_ratings.head(5)

Unnamed: 0,user_id,item_id,rating
0,U00001,I00001,0.8
1,U00001,I00002,0.4
2,U00001,I00003,0.6
3,U00001,I00004,0.2
4,U00001,I00005,1.0


In [4]:
# Total rating per (user, item) pair ...
summed_ratings = user_item_ratings.groupby(["user_id", "item_id"])["rating"].sum()

# ... pivoted so users are rows and items are columns; pairs that never
# occur in the data become 0.0.
user_item_matrix = summed_ratings.unstack().fillna(0.0)
user_item_matrix.head(5)

item_id,I00001,I00002,I00003,I00004,I00005,I00006,I00007,I00008,I00009,I00010,I00011,I00012
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
U00001,0.8,0.4,0.6,0.2,1.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0
U00002,0.0,0.81,0.41,0.61,0.21,1.0,0.81,0.0,0.0,0.0,0.0,0.0
U00003,0.0,0.0,0.82,0.42,0.62,0.22,1.0,0.82,0.0,0.0,0.0,0.0
U00004,0.0,0.0,0.0,0.83,0.43,0.63,0.23,1.0,0.83,0.0,0.0,0.0
U00005,0.0,0.0,0.0,0.0,0.84,0.44,0.64,0.24,1.0,0.84,0.0,0.0


In [5]:
# Item-item cosine similarity: transpose so each row is one item's vector of
# user ratings, then compare every pair of rows.
matrix = user_item_matrix.T
df = pd.DataFrame(
    cosine_similarity(matrix),
    index=matrix.index,
    columns=matrix.index,
)

# Min-max scale each column, then round and downcast the scaled result.
# The quotient is wrapped in its own parentheses because method calls bind
# tighter than `/`; without them only the denominator was rounded and cast.
similarity_matrix = (
    ((df - df.min()) / (df.max() - df.min())).round(6).astype(np.float32)
)

similarity_matrix.head(5)

item_id,I00001,I00002,I00003,I00004,I00005,I00006,I00007,I00008,I00009,I00010,I00011,I00012
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
I00001,1.0,0.642846,0.505271,0.411597,0.396416,0.205515,0.0,0.238351,0.401951,0.407988,0.516017,0.65564
I00002,0.642846,1.0,0.632022,0.495627,0.384169,0.365501,0.210151,0.0,0.243258,0.40828,0.418528,0.528675
I00003,0.505271,0.632022,1.0,0.623209,0.473609,0.351557,0.374707,0.216567,0.0,0.250214,0.417999,0.432617
I00004,0.411597,0.495627,0.623209,1.0,0.595231,0.455078,0.359464,0.378332,0.219781,0.0,0.253462,0.421489
I00005,0.396416,0.384169,0.473609,0.595231,1.0,0.581411,0.45966,0.36375,0.378209,0.220812,0.0,0.254197


Generate recommendations based on item similarity

In [6]:
item_id = "I00003"

# Similarity of every item to the query item, with the query item itself
# removed, ordered from most to least similar using a stable sort.
similarities_to_item = similarity_matrix[item_id]
item_recommendations = similarities_to_item.drop(
    item_id, errors="ignore"
).sort_values(ascending=False, kind="stable")

# the query item must never appear among its own recommendations
assert item_id not in item_recommendations.index

# keep the ids of the five most similar items
item_recs_1 = item_recommendations.index[:5].tolist()
item_recs_1

['I00002', 'I00004', 'I00001', 'I00005', 'I00012']

The `pipeliner` library should produce the same recommendations.

In [7]:
# Independent (deep) copy, so changes made to it leave `user_item_ratings` intact.
user_item_ratings_np = user_item_ratings.copy(deep=True)

In [8]:
# Encode the string user and item ids as integer labels.
item_encoder = LabelEncoder()
user_encoder = LabelEncoder()

# Always fit on the original string ids in `user_item_ratings`, not on
# `user_item_ratings_np` itself, so this cell is idempotent. Overwriting the
# columns in place meant a second run refitted the encoders on the
# already-encoded integers, after which `item_encoder` could no longer
# translate string ids such as "I00003".
user_item_ratings_np = user_item_ratings.assign(
    item_id=item_encoder.fit_transform(user_item_ratings["item_id"]),
    user_id=user_encoder.fit_transform(user_item_ratings["user_id"]),
)

user_item_ratings_np.head(3)


Unnamed: 0,user_id,item_id,rating
0,0,0,0.8
1,0,1,0.4
2,0,2,0.6


In [9]:
# Build the user-item matrix with the library transformer from the encoded
# ratings, passed as a plain array.
user_item_matrix_transformer = UserItemMatrixTransformer()
encoded_ratings_array = user_item_ratings_np.to_numpy()
user_item_matrix_np = user_item_matrix_transformer.transform(encoded_ratings_array)


# Sanity check: each (user, item) cell must hold the rating from the input.
users = user_item_ratings_np["user_id"].to_numpy(dtype=int)
items = user_item_ratings_np["item_id"].to_numpy(dtype=int)
ratings = user_item_ratings_np["rating"].to_numpy(dtype=np.float32)
assert all(
    user_item_matrix_np[user, item] == rating
    for user, item, rating in zip(users, items, ratings)
)

user_item_matrix_np.shape

(12, 12)

In [10]:
# Transpose first so the transformer receives one row per item.
similarity_matrix_transformer = SimilarityTransformer()
item_user_matrix_np = user_item_matrix_np.T
item_similarity_matrix_np = similarity_matrix_transformer.transform(item_user_matrix_np)

item_similarity_matrix_np.shape


(12, 12)

In [11]:
# Translate the query item id into its integer label for the library.
item_idx = item_encoder.transform([item_id])

similarity_recommender = SimilarityRecommender(5)
similarity_recommender.fit(item_similarity_matrix_np)
recommendations = similarity_recommender.recommend(item_idx)

# Decode every row of recommended labels back to item ids; only one item was
# queried, so the first row is the one of interest.
decoded_recommendations = [
    item_encoder.inverse_transform(row).tolist() for row in recommendations
]
item_recs_2 = decoded_recommendations[0]

# The library must not recommend the query item and must agree with the
# pandas-based recommendations computed earlier.
assert item_id not in item_recs_2
np.testing.assert_array_equal(item_recs_1, item_recs_2)

item_recs_2


['I00002', 'I00004', 'I00001', 'I00005', 'I00012']