In [8]:
# Import necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model

In [9]:
# Set the path to the data directory
data_path = "../data/KuaiRec/data/"

# Load main user-item interaction matrix
print("Loading big matrix...")
big_matrix = pd.read_csv(data_path + "big_matrix.csv")

# Load a smaller matrix for testing/validation
print("Loading small matrix...")
small_matrix = pd.read_csv(data_path + "small_matrix.csv")

# Load social network data and convert friend_list from string to list
print("Loading social network...")
social_network = pd.read_csv(data_path + "social_network.csv")
social_network["friend_list"] = social_network["friend_list"].map(eval)

# Load item features and convert feat from string to list
print("Loading item features...")
item_categories = pd.read_csv(data_path + "item_categories.csv")
item_categories["feat"] = item_categories["feat"].map(eval)

# Load user features
print("Loading user features...")
user_features = pd.read_csv(data_path + "user_features.csv")

# Load item daily features
print("Loading items' daily features...")
item_daily_features = pd.read_csv(data_path + "item_daily_features.csv")

print("All data loaded.")

Loading big matrix...
Loading small matrix...
Loading social network...
Loading item features...
Loading user features...
Loading items' daily features...
All data loaded.


In [10]:
# Prepare the interaction matrix for training
interaction_matrix = big_matrix[["user_id", "video_id", "watch_ratio"]]
# Filter out interactions with watch_ratio > 3
interaction_matrix = interaction_matrix[interaction_matrix['watch_ratio'] <= 3]
# Normalize watch_ratio to [0, 1]
interaction_matrix['watch_ratio'] = (
    (interaction_matrix['watch_ratio'] - interaction_matrix['watch_ratio'].min()) /
    (interaction_matrix['watch_ratio'].max() - interaction_matrix['watch_ratio'].min())
)

# Prepare the test matrix in the same way
test_matrix = small_matrix[["user_id", "video_id", "watch_ratio"]]
test_matrix = test_matrix[test_matrix["watch_ratio"] <= 3]
test_matrix['watch_ratio'] = (
    (test_matrix['watch_ratio'] - test_matrix['watch_ratio'].min()) /
    (test_matrix['watch_ratio'].max() - test_matrix['watch_ratio'].min())
)

In [11]:
# Define a simple matrix factorization model using Keras subclassing API
class MatrixFactorizationModel(Model):
    def __init__(self, num_users, num_items, latent_dim=32):
        super(MatrixFactorizationModel, self).__init__()
        # Embedding layer for users
        self.user_embedding = layers.Embedding(num_users, latent_dim)
        # Embedding layer for items
        self.item_embedding = layers.Embedding(num_items, latent_dim)
    
    def call(self, inputs):
        user_input, item_input = inputs
        # Get user and item embeddings
        user_vec = self.user_embedding(user_input)
        item_vec = self.item_embedding(item_input)
        # Compute dot product between user and item embeddings
        dot_product = tf.reduce_sum(user_vec * item_vec, axis=1, keepdims=True)
        return dot_product

In [12]:
# Helper function to generate TensorFlow datasets from pandas DataFrames
def generate_dataset(matrix):
    return tf.data.Dataset.from_tensor_slices(
        ((matrix["user_id"].values, matrix["video_id"].values), matrix["watch_ratio"].values)
    )

# Create training and testing datasets
train_dataset = generate_dataset(big_matrix).shuffle(buffer_size=len(big_matrix)).batch(256).prefetch(tf.data.AUTOTUNE)
test_dataset = generate_dataset(small_matrix).shuffle(buffer_size=len(small_matrix)).batch(256).prefetch(tf.data.AUTOTUNE)

In [13]:
# Define model parameters
num_users = big_matrix['user_id'].max() + 1
num_items = big_matrix['video_id'].max() + 1
# Instantiate the matrix factorization model
model = MatrixFactorizationModel(num_users=num_users, num_items=num_items, latent_dim=64)

# Compile the model with Adam optimizer and mean squared error loss
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=["mae"]
)

In [14]:
# Train the model
model.fit(train_dataset, validation_data=test_dataset, epochs=1, batch_size=256)

# Evaluate the model on the test dataset
loss, mae = model.evaluate(test_dataset)
print(f"Test Loss: {loss:.4f}, MAE: {mae:.4f}")

# Predict watch ratios for user 14 and display results
user_14_df = small_matrix[small_matrix['user_id'] == 14]
video_ids = user_14_df['video_id'].values
user_ids = np.full(len(video_ids), 14)
predictions = model.predict([user_ids, video_ids], batch_size=256).flatten()

user_14_df['predicted_watch_ratio'] = predictions
user_14_df.dropna(inplace=True)

display(user_14_df)

2025-05-17 16:18:20.771649: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:452] ShuffleDatasetV3:11: Filling up shuffle buffer (this may take a while): 8025889 of 12530806
2025-05-17 16:18:26.620711: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:482] Shuffle buffer filled.


[1m48949/48949[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 2ms/step - loss: 3.2874 - mae: 0.8193 - val_loss: 3.0763 - val_mae: 1.0074
[1m18268/18268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 972us/step - loss: 2.9860 - mae: 1.0070
Test Loss: 3.0763, MAE: 1.0074
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_14_df['predicted_watch_ratio'] = predictions
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_14_df.dropna(inplace=True)


Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio,predicted_watch_ratio
0,14,148,4381,6067,2020-07-05 05:27:48.378,20200705.0,1.593898e+09,0.722103,2.945696
1,14,183,11635,6100,2020-07-05 05:28:00.057,20200705.0,1.593898e+09,1.907377,3.038282
2,14,3649,22422,10867,2020-07-05 05:29:09.479,20200705.0,1.593898e+09,2.063311,1.575294
3,14,5262,4479,7908,2020-07-05 05:30:43.285,20200705.0,1.593898e+09,0.566388,1.938938
4,14,8234,4602,11000,2020-07-05 05:35:43.459,20200705.0,1.593899e+09,0.418364,0.506174
...,...,...,...,...,...,...,...,...,...
3229,14,5027,11446,7167,2020-09-03 20:44:33.108,20200903.0,1.599137e+09,1.597042,0.879346
3230,14,4222,4479,7967,2020-09-04 01:57:35.117,20200904.0,1.599156e+09,0.562194,1.822003
3231,14,6316,8499,58267,2020-09-05 02:44:26.093,20200905.0,1.599245e+09,0.145863,0.365742
3232,14,2759,8442,13067,2020-09-05 03:19:20.009,20200905.0,1.599247e+09,0.646055,0.369776
