In [98]:
import pandas as pd
import numpy as np
from IPython.display import display

In [99]:
# Read the user/item interaction fixture into a DataFrame, converting the
# `date` column to datetimes while loading.
user_item_interactions = pd.read_csv(
    filepath_or_buffer="../../tests/test_data/user_item_interactions_30_days.csv",
    parse_dates=["date"],
)
# Show (n_rows, n_columns) of the loaded frame.
user_item_interactions.shape

(1827358, 4)

In [100]:
# Preview the first five rows of the raw interactions.
display(user_item_interactions.head(n=5))

Unnamed: 0,user_id,item_id,date,interactions
0,U007714,I00372373,2024-09-08,1.0
1,U007714,I00605528,2024-09-08,1.0
2,U013522,I01182960,2024-09-08,2.0
3,U008840,I01218693,2024-09-08,1.0
4,U002036,I00704228,2024-09-08,1.0


In [101]:
# Collapse the per-date rows into a single row per (user_id, item_id) pair:
# drop `date`, then sum whatever columns remain within each pair.
# `as_index=False` keeps the group keys as ordinary columns.
user_item_interactions_grouped = (
    user_item_interactions
    .drop(columns=["date"])
    .groupby(["user_id", "item_id"], as_index=False)
    .sum()
)
user_item_interactions_grouped.head(n=5)

Unnamed: 0,user_id,item_id,interactions
0,U000003,I00037925,1.0
1,U000003,I00189384,1.0
2,U000003,I00256366,1.0
3,U000003,I00267268,3.0
4,U000003,I00298191,1.0


In [102]:
# Limit each pair's summed interactions to at most INTERACTION_CAP so the
# values that ratings are derived from span a narrower, more useful range.
INTERACTION_CAP = 5
user_item_interactions_grouped["interactions_capped"] = (
    user_item_interactions_grouped["interactions"].clip(upper=INTERACTION_CAP)
)
# Frequency of each capped value, ordered by the value itself.
user_item_interactions_grouped["interactions_capped"].value_counts().sort_index()

interactions_capped
1.0    1163526
2.0     253237
3.0      68800
4.0      55743
5.0      33575
Name: count, dtype: int64

In [112]:
# Turn the capped interaction counts into a rating:
#   1. start from the (user_id, item_id) key columns only,
#   2. apply a log transform, ln(1 + capped) + 1,
#   3. divide by the largest transformed value so the top rating is 1.0,
#   4. round to two decimal places.
user_item_ratings = user_item_interactions_grouped.loc[:, ["user_id", "item_id"]].copy()

user_item_ratings["rating"] = (
    np.log1p(user_item_interactions_grouped["interactions_capped"]) + 1
)
user_item_ratings["rating"] = (
    user_item_ratings["rating"].div(user_item_ratings["rating"].max()).round(2)
)
user_item_ratings.head()

Unnamed: 0,user_id,item_id,rating
0,U000003,I00037925,0.61
1,U000003,I00189384,0.61
2,U000003,I00256366,0.61
3,U000003,I00267268,0.85
4,U000003,I00298191,0.61


In [None]:
# Average rating per item and per user, kept around for later use.
item_ratings_mean = user_item_ratings.groupby("item_id")["rating"].agg("mean")
user_ratings_mean = user_item_ratings.groupby("user_id")["rating"].agg("mean")

# Preview both series (users first, then items).
display(user_ratings_mean.head(), item_ratings_mean.head())

user_id
U000003    0.677826
U000004    0.794000
U000005    0.630000
U000006    0.661538
U000007    0.692917
Name: rating, dtype: float64

In [138]:
# How many distinct users and items are there, and how are ratings spread
# across them?
# Each series below is indexed by user_id / item_id and holds the number of
# (non-null) ratings recorded for that user / item.
unique_user_counts = (
    user_item_ratings.groupby("user_id")["rating"].agg("count").rename("rating_count")
)
unique_item_counts = (
    user_item_ratings.groupby("item_id")["rating"].agg("count").rename("rating_count")
)

# One entry per distinct user / item, so the length is the unique count.
print("users", len(unique_user_counts))
print("items", len(unique_item_counts))

# Histogram tables: for each rating_count value, how many users / items
# have exactly that many ratings.
user_rating_counts = (
    unique_user_counts.value_counts().sort_index().rename("n_users").reset_index()
)
item_rating_counts = (
    unique_item_counts.value_counts().sort_index().rename("n_items").reset_index()
)

# Sanity check: the histogram buckets must account for every user / item.
assert user_rating_counts["n_users"].sum() == len(unique_user_counts)
assert item_rating_counts["n_items"].sum() == len(unique_item_counts)

display(user_rating_counts.head())
display(item_rating_counts.head())

users 62719
items 792547


Unnamed: 0,rating_count,n_users
0,1,9933
1,2,6298
2,3,4794
3,4,3954
4,5,3094


Unnamed: 0,rating_count,n_items
0,1,532283
1,2,124846
2,3,50916
3,4,26752
4,5,16044
