In [2]:
# Import required libraries
import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
from collections import defaultdict
import joblib
import os

# Define the path to your preprocessed data
preprocessed_data_dir = "../data/preprocessed_data/"

# Load user-item interaction data
user_ids = pd.read_csv(os.path.join(preprocessed_data_dir, "user_ids.csv"), header=None, index_col=0).to_dict()[0]
item_ids = pd.read_csv(os.path.join(preprocessed_data_dir, "item_ids.csv"), header=None, index_col=0).to_dict()[0]

# Load the sparse matrix using joblib
with open(os.path.join(preprocessed_data_dir, "user_item_sparse.pkl"), "rb") as f:
    user_item_matrix = joblib.load(f)

# Convert sparse matrix to DataFrame for surprise
from scipy.sparse import coo_matrix
user_item_coo = coo_matrix(user_item_matrix)
rows, cols, data = user_item_coo.row, user_item_coo.col, user_item_coo.data

# Create a list of (user, item, rating) tuples
interactions_list = [
    (list(user_ids.keys())[list(user_ids.values()).index(row)],
     list(item_ids.keys())[list(item_ids.values()).index(col)],
     rating)
    for row, col, rating in zip(rows, cols, data)
]

# Create a DataFrame
interactions_df = pd.DataFrame(interactions_list, columns=['user_id', 'item_id', 'rating'])

# Define the reader for surprise
reader = Reader(rating_scale=(1, 5))

# Load data into surprise Dataset
data = Dataset.load_from_df(interactions_df[['user_id', 'item_id', 'rating']], reader)

# Split into train and test sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Train SVD model
svd = SVD(n_factors=50, random_state=42)
svd.fit(trainset)

# Evaluate on test set
predictions = svd.test(testset)

# Calculate RMSE
rmse = accuracy.rmse(predictions)
print(f"RMSE on Test Set: {rmse:.4f}")

# Calculate Precision@K (K=5)
def precision_recall_at_k(predictions, k=5, threshold=3):
    """Return precision and recall at k metrics for each user."""
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=3)
precision_at_k = sum(prec for prec in precisions.values()) / len(precisions)
print(f"Precision@5: {precision_at_k:.4f}")

KeyError: 0