In [43]:
import pandas as pd
import numpy as np
import matplotlib

In [44]:
# Load the raw user/item interaction log.
# Ids are read as strings, "count" as float64, and "date" is parsed to datetimes.
user_item_interactions_path = "../tests/test_data/user_item_interactions.csv"
data_types = {
    "user_id": str,
    "item_id": str,
    "count": np.float64,
}

user_item_interactions_df = pd.read_csv(
    user_item_interactions_path,
    engine="python",
    parse_dates=["date"],
    dtype=data_types,
)
user_item_interactions_df.head(3)

Unnamed: 0,user_id,item_id,date,count
0,U000160,I00180663,2024-07-21,1.0
1,U003885,I00568991,2024-07-21,1.0
2,U048327,I00645036,2024-07-21,1.0


In [45]:
# Distinct user ids, sorted and re-indexed from zero.
unique_users = (
    user_item_interactions_df.loc[:, ["user_id"]]
    .drop_duplicates()
    .sort_values(by="user_id")
    .reset_index(drop=True)
)
# The frame has exactly one column, so its element count (.size) is also the
# number of distinct users.
num_users = unique_users.size
print(f"num_users: {num_users}")

num_users: 65450


In [46]:
# What share of all users have at most 5 interactions, broken down by the exact
# number of interactions (1, 2, ..., 5)? Percentages are relative to num_users.
user_id_value_counts = user_item_interactions_df.user_id.value_counts()
user_id_low_counts_percent = (
    user_id_value_counts[user_id_value_counts <= 5]
    .value_counts()
    # value_counts() orders rows by frequency; sort by the interaction count
    # itself so the table always reads 1, 2, ..., 5 regardless of the data.
    .sort_index()
    .div(num_users)
    .mul(100)
    .round(1)
    .to_frame(name="percent")
)
print("lowest user interaction counts")
user_id_low_counts_percent

lowest user interaction counts


Unnamed: 0_level_0,percent
count,Unnamed: 1_level_1
1,15.6
2,9.6
3,7.1
4,5.9
5,4.8


In [47]:
# Split user ids by activity level: exactly one interaction vs. more than one.
user_id_one_interaction = (
    user_id_value_counts.loc[user_id_value_counts.eq(1)]
    .index.to_series()
    .reset_index(drop=True)
)
user_id_multiple_interaction = (
    user_id_value_counts.loc[user_id_value_counts.gt(1)]
    .index.to_series()
    .reset_index(drop=True)
)
# Set form of the multi-interaction users, for fast membership checks.
train_test_users = set(user_id_multiple_interaction)
user_id_multiple_interaction.head(5)

0    U061181
1    U058603
2    U009029
3    U058601
4    U021942
Name: user_id, dtype: object

In [48]:
# Keep only the interactions of users in train_test_users, ordered per user by date.
train_test_interactions_df = (
    user_item_interactions_df
    .loc[user_item_interactions_df["user_id"].isin(train_test_users)]
    .sort_values(by=["user_id", "date"])
)
print(train_test_interactions_df.shape)
train_test_interactions_df.head(5)

(1810267, 4)


Unnamed: 0,user_id,item_id,date,count
1372115,U000001,I00240761,2024-06-03,1.0
80173,U000001,I00675851,2024-06-13,1.0
516136,U000001,I00583892,2024-06-13,1.0
921402,U000001,I00695137,2024-06-13,1.0
1380058,U000001,I00034062,2024-06-13,1.0


In [49]:
# Hold out the last row of every user as that user's test example.
#
# drop_duplicates(keep="last") returns each user's final row intact and keeps its
# original index label. GroupBy.last() instead takes the last *non-null* value of
# every column separately, so with any null present it could pair an index label
# and an item_id that come from different rows.
test_data_df = (
    train_test_interactions_df
    .drop_duplicates(subset="user_id", keep="last")
    .loc[:, ["user_id", "item_id"]]
    .rename_axis(None)  # index stays unnamed
)

print(test_data_df.shape)
test_data_df.head(5)

(55237, 2)


Unnamed: 0,user_id,item_id
1352958,U000001,I00799344
1124376,U000002,I00015013
1107055,U000003,I00014289
773287,U000004,I00173512
368580,U000005,I00773045


In [50]:
# Anti-join: drop every interaction whose (user_id, item_id) pair appears in
# test_data_df. A left merge keeps the left row order, so the indicator
# column can be used as a positional boolean mask on the original frame.
pair_membership = train_test_interactions_df[["user_id", "item_id"]].merge(
    test_data_df.drop_duplicates(),
    how="left",
    on=["user_id", "item_id"],
    indicator=True,
)
is_train_row = (pair_membership["_merge"] == "left_only").values
train_data_df = train_test_interactions_df[is_train_row]

train_data_df

Unnamed: 0,user_id,item_id,date,count
1372115,U000001,I00240761,2024-06-03,1.0
80173,U000001,I00675851,2024-06-13,1.0
516136,U000001,I00583892,2024-06-13,1.0
921402,U000001,I00695137,2024-06-13,1.0
1380058,U000001,I00034062,2024-06-13,1.0
...,...,...,...,...
242735,U065450,I00787585,2024-06-19,1.0
699221,U065450,I00081583,2024-06-19,1.0
167671,U065450,I00575585,2024-07-11,1.0
10265,U065450,I00728960,2024-07-17,1.0


In [51]:
# Rows that ended up in neither the test split nor the train split.
indices = test_data_df.index.union(train_data_df.index)
excluded_data_df = train_test_interactions_df.drop(index=indices)
excluded_data_df

Unnamed: 0,user_id,item_id,date,count
10575,U000002,I00015013,2024-07-03,1.0
360947,U000013,I00090110,2024-07-17,1.0
1381297,U000013,I00090110,2024-07-21,1.0
387737,U000025,I00108382,2024-07-17,1.0
68007,U000039,I00262060,2024-06-19,1.0
...,...,...,...,...
752217,U065322,I00451649,2024-07-23,1.0
1516601,U065326,I00552570,2024-07-23,1.0
710573,U065331,I00654286,2024-07-23,1.0
952763,U065359,I00385576,2024-07-23,1.0


In [52]:
# sense check: test, train and excluded rows are pairwise disjoint and together
# account for every row label of train_test_interactions_df
assert test_data_df.index.intersection(train_data_df.index).empty
assert test_data_df.index.intersection(excluded_data_df.index).empty
assert train_data_df.index.intersection(excluded_data_df.index).empty
assert len(
    test_data_df.index.union(train_data_df.index).union(excluded_data_df.index)
) == len(train_test_interactions_df.index)


In [53]:
# Build one rating per (user_id, item_id) pair from the summed interaction counts.
train_data_ratings_df = (
    train_data_df
    .groupby(["user_id", "item_id"], as_index=False)["count"]
    .sum()
    .rename(columns={"count": "rating"})
)

# Dampen large counts with 1 + log10(count), then divide by the largest value
# so the top rating is 1.0, rounded to two decimals.
train_data_ratings_df["rating"] = 1 + np.log10(train_data_ratings_df["rating"])
max_rating = train_data_ratings_df["rating"].max()
train_data_ratings_df["rating"] = (train_data_ratings_df["rating"] / max_rating).round(2)
train_data_ratings_df.head(5)

Unnamed: 0,user_id,item_id,rating
0,U000001,I00034062,0.38
1,U000001,I00036931,0.38
2,U000001,I00240761,0.38
3,U000001,I00474940,0.38
4,U000001,I00520152,0.38
