In [76]:
import pandas as pd
import numpy as np
import matplotlib

%matplotlib inline

In [77]:
user_item_interactions_path = "../tests/test_data/user_item_interactions.csv"
data_types = {"user_id": str, "item_id": str, "count": np.int64}

user_item_interactions_daily_df = pd.read_csv(
    user_item_interactions_path, dtype=data_types, parse_dates=["date"], engine="python"
)

user_item_interactions_df = (
    user_item_interactions_daily_df[["user_id", "item_id", "count"]]
    .groupby(["user_id", "item_id"])
    .sum()
    .sort_values(by="count", ascending=False)
    .reset_index()
)
user_item_interactions_df.head(3)

Unnamed: 0,user_id,item_id,count
0,U000662,I00005060,11
1,U001462,I00009604,10
2,U003780,I00009683,9


In [78]:
user_interactions_df = (
    user_item_interactions_df[["user_id", "count"]]
    .groupby(["user_id"])
    .sum()
    .sort_values(by="count", ascending=False)
    .reset_index()
)
user_interactions_df.head(3)

Unnamed: 0,user_id,count
0,U004863,146
1,U001804,110
2,U003638,74


In [79]:
item_interactions_df = (
    user_item_interactions_df[["item_id", "count"]]
    .groupby(["item_id"])
    .sum()
    .sort_values(by="count", ascending=False)
    .reset_index()
)
item_interactions_df.head(3)

Unnamed: 0,item_id,count
0,I00005060,12
1,I00009604,10
2,I00008211,9


In [80]:
user_item_interaction_counts = (
    user_item_interactions_df["count"].value_counts().to_frame("user_item")
)
user_interaction_counts = user_interactions_df["count"].value_counts().to_frame("user")
item_interaction_counts = item_interactions_df["count"].value_counts().to_frame("item")

all_counts = pd.concat(
    [user_item_interaction_counts, user_interaction_counts, item_interaction_counts], axis=1
).fillna(0).astype(np.int64).sort_index()

all_counts.head(10)

Unnamed: 0_level_0,user_item,user,item
count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,8700,3162,8345
2,975,955,1058
3,215,413,242
4,65,202,78
5,22,105,27
6,12,60,12
7,2,39,4
8,5,47,7
9,2,29,2
10,1,13,1


In [81]:
unique_users = (
    user_item_interactions_df[["user_id"]]
    .drop_duplicates()
    .sort_values("user_id")
    .reset_index(drop=True)
)
num_users = unique_users.size
print(f"num_users: {num_users}")

num_users: 5148


In [82]:
user_id_value_counts = user_item_interactions_df.user_id.value_counts()
user_id_low_counts_percent = (user_id_value_counts.value_counts() / num_users * 100).round(1).to_frame(name="percent")
print("% of users with the lowest number of interactions.")
user_id_low_counts_percent.head(10)

% of users with the lowest number of interactions.


Unnamed: 0_level_0,percent
count,Unnamed: 1_level_1
1,71.1
2,15.8
3,5.3
4,2.4
5,1.0
7,0.8
6,0.7
8,0.5
10,0.3
12,0.2


In [83]:
user_id_multiple_interaction = user_id_value_counts[user_id_value_counts > 1].index.to_series().reset_index(drop=True)
train_test_users = set(user_id_multiple_interaction)
user_id_multiple_interaction.head(5)

0    U004863
1    U001804
2    U003638
3    U004663
4    U005008
Name: user_id, dtype: object

In [84]:
train_test_interactions_df = user_item_interactions_daily_df[user_item_interactions_daily_df.user_id.isin(train_test_users)].sort_values(by=["user_id", "date"])
print(train_test_interactions_df.shape)
train_test_interactions_df.head(5)

(6339, 4)


Unnamed: 0,user_id,item_id,date,count
4186,U000001,I00001622,2024-07-02,1
8503,U000001,I00001645,2024-07-02,1
9170,U000001,I00003796,2024-07-02,1
831,U000004,I00008609,2024-07-02,2
5443,U000004,I00002186,2024-07-02,1


In [85]:
test_data_df = (
    train_test_interactions_df
    .reset_index()
    .groupby(["user_id"], as_index=False)
    .last()
    .set_index("index"))[["user_id", "item_id"]]
test_data_df.index.names = [None]
    
print(test_data_df.shape)
test_data_df.head(5)

(1487, 2)


Unnamed: 0,user_id,item_id
9170,U000001,I00003796
9741,U000004,I00006770
9624,U000005,I00003179
7350,U000013,I00008490
7234,U000014,I00002663


In [86]:
train_data_df = train_test_interactions_df[(train_test_interactions_df[["user_id", "item_id"]].merge(
    test_data_df.drop_duplicates(), 
    on=["user_id", "item_id"],
    how="left",
    indicator=True
)._merge == "left_only").values]

train_data_df

Unnamed: 0,user_id,item_id,date,count
4186,U000001,I00001622,2024-07-02,1
8503,U000001,I00001645,2024-07-02,1
831,U000004,I00008609,2024-07-02,2
5443,U000004,I00002186,2024-07-02,1
7004,U000004,I00005251,2024-07-02,1
...,...,...,...,...
6132,U005139,I00008285,2024-07-02,5
745,U005142,I00004981,2024-07-02,8
2708,U005142,I00006788,2024-07-02,4
1559,U005143,I00002966,2024-07-02,3


In [87]:
indices = test_data_df.index.union(train_data_df.index)
excluded_data_df = train_test_interactions_df.loc[~train_test_interactions_df.index.isin(indices)]
excluded_data_df

Unnamed: 0,user_id,item_id,date,count


In [88]:
# sense check
assert test_data_df.index.intersection(train_data_df.index).shape[0] == 0
assert test_data_df.index.intersection(excluded_data_df.index).shape[0] == 0
assert train_data_df.index.intersection(excluded_data_df.index).shape[0] == 0
assert test_data_df.index.union(train_data_df.index.union(excluded_data_df.index)).shape[0] == train_test_interactions_df.index.shape[0]


In [89]:
# sum counts of user/item interactions to form rating

train_data_ratings_df = (
    train_data_df[["user_id", "item_id", "count"]]
        .groupby(["user_id", "item_id"])
        .sum()
        .reset_index()
        .rename(columns={"count": "rating"})
)
train_data_ratings_df["rating"] = 1 + np.log10(train_data_ratings_df["rating"])
train_data_ratings_df["rating"] = (
    train_data_ratings_df["rating"] / train_data_ratings_df["rating"].max()
).round(2)
train_data_ratings_df.head(5)

Unnamed: 0,user_id,item_id,rating
0,U000001,I00001622,0.53
1,U000001,I00001645,0.53
2,U000004,I00001053,0.53
3,U000004,I00002186,0.53
4,U000004,I00005251,0.53
