In [3]:
# Import required libraries
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
import joblib
import os

# Define paths
preprocessed_data_dir = "../data/preprocessed_data/"
raw_data_dir = "../data/"

# Load user behavior for anomaly filtering
user_behavior = pd.read_csv(os.path.join(preprocessed_data_dir, "user_behavior.csv"))
normal_users = user_behavior[user_behavior['events_per_day'] <= 100]['visitorid'].tolist()  # Changed 'user_id' to 'visitorid'

# Load events data for time features
events = pd.read_csv(os.path.join(raw_data_dir, "events.csv"))
events['timestamp'] = pd.to_datetime(events['timestamp'], unit='ms')
events['month'] = events['timestamp'].dt.month
events['day_of_week'] = events['timestamp'].dt.dayofweek

# Filter out anomalous users
events = events[events['visitorid'].isin(normal_users)]

# Load user and item ID mappings
user_ids = pd.read_csv(os.path.join(preprocessed_data_dir, "user_ids.csv"), index_col="Unnamed: 0")["0"].to_dict()
item_ids = pd.read_csv(os.path.join(preprocessed_data_dir, "item_ids.csv"), index_col="Unnamed: 0")["0"].to_dict()
user_idx_to_id = {v: k for k, v in user_ids.items()}
item_idx_to_id = {v: k for k, v in item_ids.items()}

# Load the sparse matrix
with open(os.path.join(preprocessed_data_dir, "user_item_sparse.pkl"), "rb") as f:
    user_item_matrix = joblib.load(f)

# Convert sparse matrix to DataFrame
user_item_coo = coo_matrix(user_item_matrix)
rows, cols, data = user_item_coo.row, user_item_coo.col, user_item_coo.data
interactions_list = [(user_idx_to_id[row], item_idx_to_id[col], rating) for row, col, rating in zip(rows, cols, data)]
interactions_df = pd.DataFrame(interactions_list, columns=['user_id', 'item_id', 'rating'])

# Merge with time features
interactions_df = interactions_df.merge(events[['visitorid', 'itemid', 'month', 'day_of_week']], 
                                        left_on=['user_id', 'item_id'], 
                                        right_on=['visitorid', 'itemid'], 
                                        how='left')

# For simplicity, use month as a feature (can expand to day_of_week, etc.)
# Create a new rating adjusted by month (example: boost ratings in December)
interactions_df['adjusted_rating'] = interactions_df['rating']
interactions_df.loc[interactions_df['month'] == 12, 'adjusted_rating'] *= 1.2  # Boost December ratings

# Define the reader for surprise
reader = Reader(rating_scale=(1, 6))  # Adjusted for potential boost

# Load data into surprise Dataset
data = Dataset.load_from_df(interactions_df[['user_id', 'item_id', 'adjusted_rating']], reader)

# Train-test split
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Train SVD model with tuned parameters
svd = SVD(n_factors=20, n_epochs=20, lr_all=0.002, random_state=42)
svd.fit(trainset)

# Evaluate
predictions = svd.test(testset)
rmse = accuracy.rmse(predictions)
print(f"RMSE on Test Set: {rmse:.4f}")

# Precision@K
from collections import defaultdict

def precision_at_k(predictions, k=5, threshold=2):
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    precisions = []
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel_and_rec_k = sum((true_r >= threshold and est >= threshold) for est, true_r in user_ratings[:k])
        n_rec_k = min(k, len(user_ratings))
        precisions.append(n_rel_and_rec_k / n_rec_k if n_rec_k > 0 else 0)
    return np.mean(precisions) if precisions else 0

precision_at_k = precision_at_k(predictions, k=5, threshold=2)
print(f"Precision@5: {precision_at_k:.4f}")

RMSE: 8.0135
RMSE on Test Set: 8.0135
Precision@5: 0.1335
