In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate
from surprise import accuracy
import joblib
import os

# Define paths
preprocessed_data_dir = "../data/preprocessed_data/"

# Load user and item ID mappings
try:
    user_ids = pd.read_csv(os.path.join(preprocessed_data_dir, "user_ids.csv"), index_col="Unnamed: 0")["0"].to_dict()
    item_ids = pd.read_csv(os.path.join(preprocessed_data_dir, "item_ids.csv"), index_col="Unnamed: 0")["0"].to_dict()
    user_idx_to_id = {v: k for k, v in user_ids.items()}
    item_idx_to_id = {v: k for k, v in item_ids.items()}
except KeyError as e:
    print(f"Error loading ID mappings: {e}. Check CSV structure.")
    raise

# Load the sparse matrix using joblib
try:
    with open(os.path.join(preprocessed_data_dir, "user_item_sparse.pkl"), "rb") as f:
        user_item_matrix = joblib.load(f)
except Exception as e:
    print(f"Error loading sparse matrix: {e}. Ensure it’s regenerated with current NumPy.")
    raise

# Convert sparse matrix to DataFrame for surprise
user_item_coo = coo_matrix(user_item_matrix)
rows, cols, data = user_item_coo.row, user_item_coo.col, user_item_coo.data
interactions_list = [(user_idx_to_id[row], item_idx_to_id[col], rating) for row, col, rating in zip(rows, cols, data)]
interactions_df = pd.DataFrame(interactions_list, columns=['user_id', 'item_id', 'rating'])

# Define the reader for surprise
reader = Reader(rating_scale=(1, 5))

# Load data into surprise Dataset
data = Dataset.load_from_df(interactions_df[['user_id', 'item_id', 'rating']], reader)

# Tune model with cross-validation
param_grid = {'n_factors': [20, 50, 100], 'n_epochs': [20, 30], 'lr_all': [0.002, 0.005]}
from surprise.model_selection import GridSearchCV

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# Best RMSE score
print(f"Best RMSE: {gs.best_score['rmse']}")
print(f"Best parameters: {gs.best_params['rmse']}")

# Train with best parameters
best_svd = SVD(n_factors=gs.best_params['rmse']['n_factors'],
               n_epochs=gs.best_params['rmse']['n_epochs'],
               lr_all=gs.best_params['rmse']['lr_all'],
               random_state=42)
trainset = data.build_full_trainset()
best_svd.fit(trainset)

# Evaluate on a split (for quick check)
trainset_split, testset = train_test_split(data, test_size=0.1, random_state=42)
predictions = best_svd.test(testset)
rmse = accuracy.rmse(predictions)
print(f"Tuned RMSE on Test Set: {rmse:.4f}")

# Simplified Precision@K
from collections import defaultdict

def precision_at_k(predictions, k=5, threshold=2):  # Lowered threshold to 2
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = []
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel_and_rec_k = sum((true_r >= threshold and est >= threshold)
                            for est, true_r in user_ratings[:k])
        n_rec_k = min(k, len(user_ratings))
        precisions.append(n_rel_and_rec_k / n_rec_k if n_rec_k > 0 else 0)

    return np.mean(precisions) if precisions else 0

precision_at_k = precision_at_k(predictions, k=5, threshold=2)
print(f"Tuned Precision@5: {precision_at_k:.4f}")

Best RMSE: 1.60040605054629
Best parameters: {'n_factors': 20, 'n_epochs': 20, 'lr_all': 0.002}
RMSE: 1.3819
Tuned RMSE on Test Set: 1.3819
Tuned Precision@5: 0.0219
