In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from sklearn.preprocessing import LabelEncoder


from pipeliner.recommendations.transformer import (
    UserItemMatrixTransformer,
    SimilarityTransformer,
    UserItemMatrixTransformerNP,
    SimilarityTransformerNP,
)
from pipeliner.recommendations.recommender import SimilarityRecommender#, SimilarityRecommenderNP


In [3]:
# Load the toy ratings in long format: ids are read as strings, ratings as float32.
user_item_ratings_df = pd.read_csv(
    "../../tests/test_data/user_item_ratings_toy.csv",
    header=0,
    dtype={"user_id": str, "item_id": str, "rating": np.float32},
)
print(user_item_ratings_df.shape)
user_item_ratings_df.head(3)

(18, 3)


Unnamed: 0,user_id,item_id,rating
0,U00001,I00001,1.0
1,U00001,I00002,0.5
2,U00001,I00003,0.5


In [4]:
# Encode the string user/item ids as integer codes, one LabelEncoder per id
# column. The fitted encoders are kept so codes can be mapped back to ids later.
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

# assign() works on a copy, so user_item_ratings_df itself is left untouched.
user_item_ratings_encoded = user_item_ratings_df.assign(
    user_id=user_encoder.fit_transform(user_item_ratings_df["user_id"]),
    item_id=item_encoder.fit_transform(user_item_ratings_df["item_id"]),
)

# All-numeric float32 array of the encoded rows, the input for the NumPy transformers.
user_item_ratings_np = user_item_ratings_encoded.to_numpy().astype(np.float32)

# The original ids as seen by each encoder (its classes_ attribute).
unique_users = pd.Series(user_encoder.classes_)
unique_items = pd.Series(item_encoder.classes_)

print(len(unique_users), len(unique_items))
user_item_ratings_encoded.head(3)

6 6


Unnamed: 0,user_id,item_id,rating
0,0,0,1.0
1,0,1,0.5
2,0,2,0.5


In [5]:
# Expected user-item matrix fixture: indexed by user_id, values cast to float32.
user_item_matrix_toy = pd.read_csv(
    "../../tests/test_data/user_item_matrix_toy.csv",
    index_col=['user_id'],
    header=0,
).astype(np.float32)
user_item_matrix_toy

Unnamed: 0_level_0,I00001,I00002,I00003,I00004,I00005,I00006
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
U00001,1.0,0.5,0.5,0.0,0.0,0.0
U00002,0.0,1.0,0.5,0.5,0.0,0.0
U00003,0.0,0.0,1.0,0.5,0.5,0.0
U00004,0.0,0.0,0.0,1.0,0.5,0.5
U00005,0.5,0.0,0.0,0.0,1.0,0.5
U00006,0.5,0.5,0.0,0.0,0.0,1.0


In [6]:
# Expected item-item similarity fixture: indexed by item_id, values cast to float32.
item_similarity_matrix_toy = pd.read_csv(
    "../../tests/test_data/item_similarity_matrix_toy.csv",
    index_col=['item_id'],
    header=0,
).astype(np.float32)
item_similarity_matrix_toy

Unnamed: 0_level_0,I00001,I00002,I00003,I00004,I00005,I00006
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
I00001,1.0,0.5,0.333333,0.0,0.333333,0.5
I00002,0.5,1.0,0.5,0.333333,0.0,0.333333
I00003,0.333333,0.5,1.0,0.5,0.333333,0.0
I00004,0.0,0.333333,0.5,1.0,0.5,0.333333
I00005,0.333333,0.0,0.333333,0.5,1.0,0.5
I00006,0.5,0.333333,0.0,0.333333,0.5,1.0


In [7]:
# Expected user-user similarity fixture: indexed by user_id, values cast to float32.
user_similarity_matrix_toy = pd.read_csv(
    "../../tests/test_data/user_similarity_matrix_toy.csv",
    index_col=['user_id'],
    header=0,
).astype(np.float32)
user_similarity_matrix_toy

Unnamed: 0_level_0,U00001,U00002,U00003,U00004,U00005,U00006
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
U00001,1.0,0.5,0.333333,0.0,0.333333,0.5
U00002,0.5,1.0,0.5,0.333333,0.0,0.333333
U00003,0.333333,0.5,1.0,0.5,0.333333,0.0
U00004,0.0,0.333333,0.5,1.0,0.5,0.333333
U00005,0.333333,0.0,0.333333,0.5,1.0,0.5
U00006,0.5,0.333333,0.0,0.333333,0.5,1.0


In [8]:
# pandas implementation: build the user-item matrix from the long ratings frame
# and check it against the fixture.
user_item_matrix_transformer = UserItemMatrixTransformer()
user_item_matrix = user_item_matrix_transformer.transform(user_item_ratings_df)

# Same shape and identical row/column labels as the fixture ...
assert user_item_matrix.shape == user_item_matrix_toy.shape
assert (
    user_item_matrix.index.equals(user_item_matrix_toy.index)
    and user_item_matrix.columns.equals(user_item_matrix_toy.columns)
)
# ... and element-wise equal values.
np.testing.assert_array_equal(user_item_matrix.to_numpy(), user_item_matrix_toy.to_numpy())

In [9]:
# numpy implementation: same check, fed with the encoded float32 ratings array.
user_item_matrix_transformer_np = UserItemMatrixTransformerNP()
user_item_matrix_np = user_item_matrix_transformer_np.transform(user_item_ratings_np)

assert user_item_matrix_np.shape == user_item_matrix_toy.shape
# The result exposes .toarray(); densify it before comparing with the fixture.
np.testing.assert_array_equal(user_item_matrix_np.toarray(), user_item_matrix_toy.to_numpy())

In [10]:
# pandas implementation: item-item cosine similarity computed from the
# user-item matrix and checked against the fixture.
item_similarity_matrix_transformer = SimilarityTransformer(kind="item", metric="cosine", normalise=False)
item_similarity_matrix = item_similarity_matrix_transformer.transform(user_item_matrix)

assert item_similarity_matrix.shape == item_similarity_matrix_toy.shape
assert item_similarity_matrix.index.equals(item_similarity_matrix_toy.index)
assert item_similarity_matrix.columns.equals(item_similarity_matrix_toy.columns)
# Similarities are non-trivial floats (e.g. 1/3), so compare with an absolute
# tolerance rather than bit-for-bit equality; 1e-6 matches the six-decimal
# precision used by the other similarity checks in this notebook.
np.testing.assert_allclose(
    item_similarity_matrix.to_numpy(),
    item_similarity_matrix_toy.to_numpy(),
    rtol=0,
    atol=1e-6,
)

In [11]:
# numpy implementation: item-item similarity. The user-item matrix is
# transposed before the call (the user-user cell passes it untransposed),
# presumably because the transformer compares rows - confirm against
# SimilarityTransformerNP.
item_similarity_matrix_transformer_np = SimilarityTransformerNP()
item_similarity_matrix_np = item_similarity_matrix_transformer_np.transform(user_item_matrix_np.T)

assert item_similarity_matrix_np.shape == item_similarity_matrix.shape
# Check agreement with the pandas result to six decimals using an absolute
# tolerance. Rounding both sides and demanding exact equality can fail
# spuriously when two nearly identical values straddle a rounding boundary.
np.testing.assert_allclose(
    item_similarity_matrix_np.toarray(),
    item_similarity_matrix.to_numpy(),
    rtol=0,
    atol=1e-6,
)

In [12]:
# pandas implementation: user-user cosine similarity computed from the
# user-item matrix and checked against the fixture.
user_similarity_matrix_transformer = SimilarityTransformer(kind="user", metric="cosine", normalise=False)
user_similarity_matrix = user_similarity_matrix_transformer.transform(user_item_matrix)

assert user_similarity_matrix.shape == user_similarity_matrix_toy.shape
assert user_similarity_matrix.index.equals(user_similarity_matrix_toy.index)
assert user_similarity_matrix.columns.equals(user_similarity_matrix_toy.columns)
# Compare to six decimals with an absolute tolerance. Rounding both sides and
# demanding exact equality can fail spuriously when two nearly identical
# values straddle a rounding boundary.
np.testing.assert_allclose(
    user_similarity_matrix.to_numpy(),
    user_similarity_matrix_toy.to_numpy(),
    rtol=0,
    atol=1e-6,
)

In [13]:
# numpy implementation: user-user similarity, computed from the untransposed
# user-item matrix.
user_similarity_matrix_transformer_np = SimilarityTransformerNP()
user_similarity_matrix_np = user_similarity_matrix_transformer_np.transform(user_item_matrix_np)

assert user_similarity_matrix_np.shape == user_similarity_matrix.shape
# Check agreement with the pandas result to six decimals using an absolute
# tolerance. Rounding both sides and demanding exact equality can fail
# spuriously when two nearly identical values straddle a rounding boundary.
np.testing.assert_allclose(
    user_similarity_matrix_np.toarray(),
    user_similarity_matrix.to_numpy(),
    rtol=0,
    atol=1e-6,
)

In [14]:
# Recommendations from the pandas item-similarity matrix for every toy item.
item_ids = [
    'I00001', 'I00002', 'I00003',
    'I00004', 'I00005', 'I00006',
]
# NOTE(review): 5 is presumably the maximum number of recommendations per
# item - confirm against SimilarityRecommender.
rec_pd = SimilarityRecommender(5)
rec_pd.fit(item_similarity_matrix)
item_recs_pd = rec_pd.predict(item_ids)
print(item_recs_pd)

[array(['I00002', 'I00006', 'I00003', 'I00005'], dtype=object)
 array(['I00001', 'I00003', 'I00004', 'I00006'], dtype=object)
 array(['I00002', 'I00004', 'I00001', 'I00005'], dtype=object)
 array(['I00003', 'I00005', 'I00002', 'I00006'], dtype=object)
 array(['I00006', 'I00004', 'I00001', 'I00003'], dtype=object)
 array(['I00001', 'I00005', 'I00002', 'I00004'], dtype=object)]


In [82]:
# Prototype of the NumPy recommendation logic: for each requested item, rank
# the other items by similarity (highest first), keeping only items with a
# strictly positive similarity and excluding the item itself.
item_ids_encoded = item_encoder.transform(item_ids)

item_similarity_matrix_np_dense = item_similarity_matrix_np.toarray()

for item_id in item_ids_encoded:
    # Look the row up by encoded id. Pairing ids with rows positionally (zip)
    # is only correct when item_ids lists every item in encoded order.
    item_similarity = item_similarity_matrix_np_dense[item_id]
    print(item_id, item_similarity)
    mask = item_similarity > 0
    mask[item_id] = False  # never recommend the item to itself
    # Indices ordered by descending similarity. NOTE: the order among tied
    # similarities is whatever argsort yields and can differ from the
    # ordering printed by the pandas recommender.
    sorter = np.argsort(item_similarity)[::-1]
    print('sorter', sorter)
    sorted_mask = mask[sorter]
    print('sorted_mask', sorted_mask)
    results = sorter[sorted_mask]
    print(results)
    print(item_encoder.inverse_transform(results))

# TODO: replace this prototype with SimilarityRecommenderNP once that class is
# available (its import is currently commented out at the top of the notebook).

0 [1.       0.5      0.333333 0.       0.333333 0.5     ]
sorter [0 5 1 4 2 3]
sorted_mask [False  True  True  True  True False]
[5 1 4 2]
['I00006' 'I00002' 'I00005' 'I00003']
1 [0.5      1.       0.5      0.333333 0.       0.333333]
sorter [1 2 0 5 3 4]
sorted_mask [False  True  True  True  True False]
[2 0 5 3]
['I00003' 'I00001' 'I00006' 'I00004']
2 [0.333333 0.5      1.       0.5      0.333333 0.      ]
sorter [2 3 1 4 0 5]
sorted_mask [False  True  True  True  True False]
[3 1 4 0]
['I00004' 'I00002' 'I00005' 'I00001']
3 [0.       0.333333 0.5      1.       0.5      0.333333]
sorter [3 4 2 5 1 0]
sorted_mask [False  True  True  True  True False]
[4 2 5 1]
['I00005' 'I00003' 'I00006' 'I00002']
4 [0.333333 0.       0.333333 0.5      1.       0.5     ]
sorter [4 5 3 2 0 1]
sorted_mask [False  True  True  True  True False]
[5 3 2 0]
['I00006' 'I00004' 'I00003' 'I00001']
5 [0.5      0.333333 0.       0.333333 0.5      1.      ]
sorter [5 4 0 3 1 2]
sorted_mask [False  True  True  True