In [None]:
import ast

import pandas as pd

In [None]:
# Set the path to the data directory
data_path = "../data/KuaiRec/data/"

# Load main user-item interaction matrix
print("Loading big matrix...")
big_matrix = pd.read_csv(data_path + "big_matrix.csv")

# Load the smaller matrix used for testing/validation
print("Loading small matrix...")
small_matrix = pd.read_csv(data_path + "small_matrix.csv")

# Load social network data and convert friend_list from string to list.
# ast.literal_eval only accepts Python literals, so unlike eval() it cannot
# execute code that happens to be embedded in the CSV file.
print("Loading social network...")
social_network = pd.read_csv(data_path + "social_network.csv")
social_network["friend_list"] = social_network["friend_list"].map(ast.literal_eval)

# Load item features and convert feat from string to list (same safe parsing)
print("Loading item features...")
item_categories = pd.read_csv(data_path + "item_categories.csv")
item_categories["feat"] = item_categories["feat"].map(ast.literal_eval)

# Load user features
print("Loading user features...")
user_features = pd.read_csv(data_path + "user_features.csv")

# Load item daily features
print("Loading items' daily features...")
item_daily_features = pd.read_csv(data_path + "item_daily_features.csv")

print("All data loaded.")

Loading big matrix...
Loading small matrix...
Loading social network...
Loading item features...
Loading user features...
Loading items' daily features...
All data loaded.


In [None]:
# Prepare the interaction matrices for training and testing
def prepare_interactions(matrix, max_watch_ratio=3):
    """Select the interaction columns, drop outliers and min-max scale watch_ratio.

    Args:
        matrix: DataFrame with at least ``user_id``, ``video_id`` and
            ``watch_ratio`` columns.
        max_watch_ratio: Rows whose ``watch_ratio`` exceeds this value are
            dropped. Rows with a missing ``watch_ratio`` are dropped as well,
            because the comparison evaluates to False for NaN.

    Returns:
        A new DataFrame (the input is not modified) containing only the three
        columns above, with ``watch_ratio`` rescaled to [0, 1] using the
        minimum and maximum of the rows that were kept.
    """
    columns = ["user_id", "video_id", "watch_ratio"]
    # .copy() makes the result independent of `matrix`, so the assignment below
    # does not write into a slice (avoids pandas' SettingWithCopyWarning).
    kept = matrix.loc[matrix["watch_ratio"] <= max_watch_ratio, columns].copy()

    lowest = kept["watch_ratio"].min()
    span = kept["watch_ratio"].max() - lowest
    if span > 0:
        kept["watch_ratio"] = (kept["watch_ratio"] - lowest) / span
    else:
        # All remaining values are identical (or there are no rows): dividing
        # by a zero range would yield NaN, so map everything to 0 instead.
        kept["watch_ratio"] = 0.0
    return kept


# Training interactions come from the big matrix.
# An alternative to min-max scaling would be a log1p transform of watch_ratio
# (that would additionally require importing numpy).
interaction_matrix = prepare_interactions(big_matrix)

# Prepare the test matrix in the same way.
# NOTE(review): the test matrix is scaled with its own min/max rather than the
# training statistics - confirm this is intended.
test_matrix = prepare_interactions(small_matrix)


In [None]:
# Build the user feature table.
# Only the 18 one-hot encoded columns are kept next to user_id. Missing values
# become -1 and every value is then shifted up by one, so a missing entry ends
# up as index 0 (embedding compatibility).
onehot_cols = [f"onehot_feat{idx}" for idx in range(18)]
users = user_features.loc[:, ["user_id", *onehot_cols]].copy()
users["user_id"] = users["user_id"].astype(int)
users[onehot_cols] = users[onehot_cols].fillna(-1).astype(int).add(1)


In [None]:
# Prepare video/item features
# Helper function to shift and pad tag lists for consistent input length
def shift_and_pad(tags, max_len=31):
    """Shift tag ids up by one and pad/truncate the result to ``max_len`` entries.

    Every tag is incremented by 1 so that 0 stays free as the padding value.
    Inputs longer than ``max_len`` are truncated; shorter ones are right-padded
    with 0.

    Args:
        tags: Iterable of integer tag ids, or a missing value (``None`` or a
            float NaN, e.g. an unmatched row after a left merge).
        max_len: Length of the returned list.

    Returns:
        A list of ints of length ``max_len`` (for non-negative ``max_len``).
        A missing ``tags`` value yields a list made of padding only.
    """
    # `tags != tags` is True only for NaN; checked this way to avoid an import.
    if tags is None or (isinstance(tags, float) and tags != tags):
        return [0] * max_len
    shifted = [tag + 1 for tag in tags][:max_len]
    # After truncation len(shifted) <= max_len, so the padding count is >= 0.
    return shifted + [0] * (max_len - len(shifted))

# Aggregate daily features for each video: identifying attributes take the
# last value of the group, play_progress is averaged and counters are summed.
agg_funcs = {
    "author_id": "last",
    "video_duration": "last",
    "play_progress": "mean",
    "video_tag_id": "last",
    "play_cnt": "sum",
    "like_cnt": "sum",
    "share_cnt": "sum",
    "comment_cnt": "sum"
}
videos = item_daily_features.groupby("video_id").agg(agg_funcs)
# Merge with static item features
videos = videos.merge(item_categories, on="video_id", how="left").set_index("video_id")

# Upper bound on video_duration, in the units used by item_daily_features
# (presumably milliseconds - TODO confirm against the dataset description).
MAX_VIDEO_DURATION = 20000

# Fill missing durations BEFORE filtering: `NaN <= MAX_VIDEO_DURATION` is False,
# so filtering first would silently drop every video without a duration and
# leave the fillna with nothing to do.
videos["video_duration"] = videos["video_duration"].fillna(0)
# Filter out videos with extremely long durations. The explicit copy keeps the
# assignments below from writing into a slice (SettingWithCopyWarning).
videos = videos[videos["video_duration"] <= MAX_VIDEO_DURATION].copy()
videos["video_duration"] = videos["video_duration"].astype(int)
# video_duration is left unscaled; min-max normalisation would be an option.

# Fill missing tag IDs and shift for embedding compatibility (missing -> 0)
videos["video_tag_id"] = videos["video_tag_id"].fillna(-1).astype(int) + 1

# Not using play, like, and share counts for now

# Pad and shift feature lists for each video
videos["feat"] = videos["feat"].apply(shift_and_pad)

(10728, 9)
