In [157]:
%%capture
%load_ext autoreload
%autoreload 1

In [158]:
import pandas as pd
import numpy as np
import matplotlib

%matplotlib inline

In [159]:
user_item_interactions_path = "../tests/test_data/user_item_interactions.csv"
data_types = {"user_id": str, "item_id": str, "count": np.int64}

user_item_interactions_daily_df = pd.read_csv(
    user_item_interactions_path, dtype=data_types, parse_dates=["date"], engine="python"
)

user_item_interactions_df = (
    user_item_interactions_daily_df[["user_id", "item_id", "count"]]
    .groupby(["user_id", "item_id"])
    .sum()
    .sort_values(by="count", ascending=False)
    .reset_index()
)
user_item_interactions_df.head(3)

Unnamed: 0,user_id,item_id,count
0,U000662,I00005060,11
1,U001462,I00009604,10
2,U003780,I00009683,9


In [160]:
user_interactions_df = (
    user_item_interactions_df[["user_id", "count"]]
    .groupby(["user_id"])
    .sum()
    .sort_values(by="count", ascending=False)
    .reset_index()
)
user_interactions_df.head(3)

Unnamed: 0,user_id,count
0,U004863,146
1,U001804,110
2,U003638,74


In [161]:
item_interactions_df = (
    user_item_interactions_df[["item_id", "count"]]
    .groupby(["item_id"])
    .sum()
    .sort_values(by="count", ascending=False)
    .reset_index()
)
item_interactions_df.head(3)

Unnamed: 0,item_id,count
0,I00005060,12
1,I00009604,10
2,I00008211,9


In [162]:
user_item_interaction_counts = (
    user_item_interactions_df["count"].value_counts().to_frame("user_item")
)
user_interaction_counts = user_interactions_df["count"].value_counts().to_frame("user")
item_interaction_counts = item_interactions_df["count"].value_counts().to_frame("item")

all_counts = pd.concat(
    [user_item_interaction_counts, user_interaction_counts, item_interaction_counts], axis=1
).fillna(0).astype(np.int64).sort_index()

all_counts.head(10)

Unnamed: 0_level_0,user_item,user,item
count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,8700,3162,8345
2,975,955,1058
3,215,413,242
4,65,202,78
5,22,105,27
6,12,60,12
7,2,39,4
8,5,47,7
9,2,29,2
10,1,13,1


In [163]:
unique_users = (
    user_item_interactions_df[["user_id"]]
    .drop_duplicates()
    .sort_values("user_id")
    .reset_index(drop=True)
)
num_users = unique_users.shape[0]
print(f"num_users: {num_users}")

num_users: 5148


In [164]:
user_id_value_counts = user_item_interactions_df.user_id.value_counts()
user_id_low_counts_percent = (
    (user_id_value_counts.value_counts() / num_users * 100)
    .round(1)
    .to_frame(name="percent")
)
print("% of users with the lowest number of interactions.")
user_id_low_counts_percent.head(5)

% of users with the lowest number of interactions.


Unnamed: 0_level_0,percent
count,Unnamed: 1_level_1
1,71.1
2,15.8
3,5.3
4,2.4
5,1.0


In [165]:
user_id_multiple_interaction = user_id_value_counts[user_id_value_counts >= 2].index.to_series().reset_index(drop=True)
train_test_users = set(user_id_multiple_interaction)
user_id_multiple_interaction.head(5)

0    U004863
1    U001804
2    U003638
3    U004663
4    U005008
Name: user_id, dtype: object

In [166]:
train_test_interactions_df = user_item_interactions_daily_df[user_item_interactions_daily_df.user_id.isin(train_test_users)].sort_values(by=["user_id", "date"])
print(train_test_interactions_df.shape)
train_test_interactions_df.head(5)

(6339, 4)


Unnamed: 0,user_id,item_id,date,count
4186,U000001,I00001622,2024-07-02,1
8503,U000001,I00001645,2024-07-02,1
9170,U000001,I00003796,2024-07-02,1
831,U000004,I00008609,2024-07-02,2
5443,U000004,I00002186,2024-07-02,1


In [167]:
test_data_df = (
    train_test_interactions_df
    .reset_index()
    .groupby(["user_id"], as_index=False)
    .last()
    .set_index("index"))[["user_id", "item_id"]]
test_data_df.index.names = [None]
    
print(test_data_df.shape)
test_data_df.head(5)

(1487, 2)


Unnamed: 0,user_id,item_id
9170,U000001,I00003796
9741,U000004,I00006770
9624,U000005,I00003179
7350,U000013,I00008490
7234,U000014,I00002663


In [168]:
train_data_df = train_test_interactions_df[
    (
        train_test_interactions_df[["user_id", "item_id"]]
        .merge(
            test_data_df.drop_duplicates(),
            on=["user_id", "item_id"],
            how="left",
            indicator=True,
        )
        ._merge
        == "left_only"
    ).values
]

print(train_data_df.shape)
train_data_df.head(5)

(4852, 4)


Unnamed: 0,user_id,item_id,date,count
4186,U000001,I00001622,2024-07-02,1
8503,U000001,I00001645,2024-07-02,1
831,U000004,I00008609,2024-07-02,2
5443,U000004,I00002186,2024-07-02,1
7004,U000004,I00005251,2024-07-02,1


In [169]:
indices = test_data_df.index.union(train_data_df.index)
excluded_data_df = user_item_interactions_daily_df.loc[
    ~user_item_interactions_daily_df.index.isin(indices)
]

print(excluded_data_df.shape)
excluded_data_df.head(5)

(3661, 4)


Unnamed: 0,user_id,item_id,date,count
0,U002120,I00006767,2024-07-02,1
3,U002062,I00000098,2024-07-02,1
4,U001214,I00009416,2024-07-02,1
6,U000356,I00006347,2024-07-02,1
11,U004656,I00008241,2024-07-02,3


In [170]:
# sense check
assert test_data_df.index.intersection(train_data_df.index).shape[0] == 0
assert test_data_df.index.intersection(excluded_data_df.index).shape[0] == 0
assert train_data_df.index.intersection(excluded_data_df.index).shape[0] == 0
assert (
    test_data_df.index.union(train_data_df.index.union(excluded_data_df.index)).shape[0]
    == user_item_interactions_daily_df.index.shape[0]
)

In [171]:
# sum counts of user/item interactions to form rating

train_data_ratings_df = (
    train_data_df[["user_id", "item_id", "count"]]
    .groupby(["user_id", "item_id"])
    .sum()
    .reset_index()
    .rename(columns={"count": "rating"})
)
# train_data_ratings_df["rating"] = 1 + np.log10(train_data_ratings_df["rating"])
train_data_ratings_df["rating"] = (
    train_data_ratings_df["rating"] / train_data_ratings_df["rating"].max()
).round(2)

print(train_data_ratings_df.shape)
train_data_ratings_df.head(5)

(4852, 3)


Unnamed: 0,user_id,item_id,rating
0,U000001,I00001622,0.12
1,U000001,I00001645,0.12
2,U000004,I00001053,0.12
3,U000004,I00002186,0.12
4,U000004,I00005251,0.12


In [172]:
train_data_ratings_df.rating.value_counts()

rating
0.12    4269
0.25     448
0.38      85
0.50      29
0.62      11
0.75       7
1.00       2
0.88       1
Name: count, dtype: int64

In [173]:
unique_train_users = (
    train_data_ratings_df[["user_id"]]
    .drop_duplicates()
    .sort_values("user_id")
    .reset_index(drop=True)
)
unique_train_items = (
    train_data_ratings_df[["item_id"]]
    .drop_duplicates()
    .sort_values("item_id")
    .reset_index(drop=True)
)
num_train_users = unique_train_users.shape[0]
num_train_items = unique_train_items.shape[0]
print(f"num_train_users: {num_train_users}")
print(f"num_train_items: {num_train_items}")

num_train_users: 1487
num_train_items: 4811


In [174]:
from pipeliner.recommendations.transformer import (
    UserItemMatrixTransformerPandas,
    SimilarityTransformerPandas,
)
from pipeliner.recommendations.recommender import UserBasedRecommenderPandas

In [175]:
user_item_matrix_transformer = UserItemMatrixTransformerPandas()
user_item_matrix = user_item_matrix_transformer.transform(train_data_ratings_df)
user_item_matrix.iloc[:5, :5]

item_id,I00000001,I00000002,I00000005,I00000007,I00000008
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
U000001,0.0,0.0,0.0,0.0,0.0
U000004,0.0,0.0,0.0,0.0,0.0
U000005,0.0,0.0,0.0,0.0,0.0
U000013,0.0,0.0,0.0,0.0,0.0
U000014,0.0,0.0,0.0,0.0,0.0


In [176]:
user_similarity_transformer = SimilarityTransformerPandas(
    kind="user", metric="cosine", round=1, normalise=True
)
user_similarity_matrix = user_similarity_transformer.transform(user_item_matrix)
print(user_similarity_matrix.shape)
user_similarity_matrix.iloc[:5, :5]

(1487, 1487)


user_id,U000001,U000004,U000005,U000013,U000014
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
U000001,1.0,0.0,0.0,0.0,0.0
U000004,0.0,1.0,0.0,0.0,0.0
U000005,0.0,0.0,1.0,0.0,0.0
U000013,0.0,0.0,0.0,1.0,0.0
U000014,0.0,0.0,0.0,0.0,1.0


In [177]:
# the vast majority of similarity scores are 0.0
# and almost all the scores of 1.0 are for users
# compared with themselves in the matrix
# making it difficult to generate any recommendations at all
user_similarity_rating_counts = user_similarity_matrix.stack().value_counts()

assert user_similarity_rating_counts[1.0] >= num_train_users

user_similarity_rating_counts

0.0    2209608
1.0       1495
0.1         28
0.2         20
0.3          8
0.6          4
0.5          2
0.4          2
0.7          2
Name: count, dtype: int64

In [183]:
useful_user_similarities = pd.DataFrame(
    np.count_nonzero(
        (user_similarity_matrix > 0.0) & (user_similarity_matrix < 1.0), axis=1
    ),
    index=user_similarity_matrix.index,
    columns=["count"],
)
useful_user_similarities.value_counts()


useful_user_similarity_counts = useful_user_similarities.value_counts()

print(
    f"useful_user_similarities: {(useful_user_similarities["count"].sum() / num_train_users * 100).round(2)}%"
)
useful_user_similarity_counts

useful_user_similarities: 4.44


count
0        1424
1          60
2           3
Name: count, dtype: int64

In [179]:
item_similarity_transformer = SimilarityTransformerPandas(
    kind="item", metric="cosine", round=1, normalise=True
)
item_similarity_matrix = item_similarity_transformer.transform(user_item_matrix)
print(item_similarity_matrix.shape)
item_similarity_matrix.iloc[:5, :5]

(4811, 4811)


item_id,I00000001,I00000002,I00000005,I00000007,I00000008
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
I00000001,1.0,0.0,0.0,0.0,0.0
I00000002,0.0,1.0,0.0,0.0,0.0
I00000005,0.0,0.0,1.0,0.0,0.0
I00000007,0.0,0.0,0.0,1.0,0.0
I00000008,0.0,0.0,0.0,0.0,1.0


In [180]:
item_similarity_rating_counts = item_similarity_matrix.stack().value_counts()

assert item_similarity_rating_counts[1.0] >= num_train_items

item_similarity_rating_counts

0.0    23057232
1.0       87025
0.7         988
0.4         352
0.9         114
0.3           6
0.5           4
Name: count, dtype: int64

In [184]:
useful_item_similarities = pd.DataFrame(
    np.count_nonzero(
        (item_similarity_matrix > 0.0) & (item_similarity_matrix < 1.0), axis=1
    ),
    index=item_similarity_matrix.index,
    columns=["count"],
)
useful_item_similarity_counts = useful_item_similarities.value_counts()

print(
    f"useful_item_similarities: {(useful_item_similarities["count"].sum() / num_train_items * 100).round(2)}%"
)
useful_item_similarity_counts

useful_item_similarities: 30.43%


count
0        4122
1         579
2          77
11          4
12          3
21          3
7           2
4           2
17          2
19          2
9           2
44          1
23          1
43          1
57          1
70          1
30          1
13          1
14          1
10          1
6           1
5           1
3           1
158         1
Name: count, dtype: int64