Advanced Modeling

# We will build an advanced recommendation model using the getINNOtized dataset. This involves:
# - Loading the preprocessed user-item interaction matrix with anomaly filtering (users with >100 events/day removed).
# - Training an SVD model with tuned parameters for collaborative filtering.
# - Evaluating the model using RMSE and Precision@5.

# This model will address business questions:
# - Q1: Personalization (collaborative filtering).
# - Q5: Anomaly filtering (removing bots/outliers).

## Setup

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
import joblib
import os
from collections import defaultdict

# --- Optimized Data Loading ---
preprocessed_data_dir = "../data/preprocessed_data/"

# Vectorized mapping loader with error handling
def load_mapping(file_path):
    try:
        df = pd.read_csv(file_path, header=None, skiprows=1, names=['key', 'value'])
        return df.set_index('key')['value'].to_dict()
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return {}

# Load mappings
user_ids = load_mapping(os.path.join(preprocessed_data_dir, "user_ids.csv"))
item_ids = load_mapping(os.path.join(preprocessed_data_dir, "item_ids.csv"))

# Reverse mappings
user_idx_to_id = {v: k for k, v in user_ids.items()}
item_idx_to_id = {v: k for k, v in item_ids.items()}

# --- Sparse Matrix Optimization ---
with open(os.path.join(preprocessed_data_dir, "user_item_sparse.pkl"), "rb") as f:
    user_item_matrix = joblib.load(f)

# Convert to CSR for optimization, then to COO for DataFrame conversion
user_item_csr = user_item_matrix.tocsr()
user_item_coo = user_item_csr.tocoo()

# --- Precompute Category Mapping ---
item_features = pd.read_csv(os.path.join(preprocessed_data_dir, "item_features.csv"))
category_to_items = item_features.groupby('categoryid')['item_idx'].agg(list).to_dict()
item_to_category = item_features.set_index('item_idx')['categoryid'].to_dict()

# --- Convert sparse matrix to DataFrame ---
interactions_df = pd.DataFrame(
    [(user_idx_to_id[row], item_idx_to_id[col], rating) for row, col, rating in zip(user_item_coo.row, user_item_coo.col, user_item_coo.data)],
    columns=['user_id', 'item_id', 'rating']
)

print("Interactions_df shape:", interactions_df.shape)
print("Rating distribution:\n", interactions_df['rating'].value_counts())

# --- Model Optimization ---
svd = SVD(n_factors=20, n_epochs=15, lr_all=0.01, random_state=42)

# Load full dataset into Surprise
data = Dataset.load_from_df(interactions_df[['user_id', 'item_id', 'rating']], Reader(rating_scale=(1, 5)))
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Fit model and compute predictions
svd.fit(trainset)
svd_predictions = svd.test(testset)

# --- Optimized Recommendation Functions ---
def get_similar_items(item_id, top_n=5):
    item_idx = item_ids.get(item_id)
    if item_idx is None or item_idx not in item_to_category:
        return interactions_df['item_id'].sample(min(top_n, len(interactions_df))).tolist()
    
    category = item_to_category[item_idx]
    similar_items = category_to_items.get(category, [])[:top_n * 2]
    return [item_idx_to_id[item] for item in similar_items if item != item_idx][:top_n]

def hybrid_recommendation(user_id, top_n=5, alpha=0.7):
    user_preds = [(pred.iid, pred.est) for pred in svd_predictions if pred.uid == user_id]
    svd_recs = [iid for iid, _ in sorted(user_preds, key=lambda x: x[1], reverse=True)][:top_n] if user_preds else []
    
    if not svd_recs:
        user_ratings = interactions_df[interactions_df['user_id'] == user_id]
        svd_recs = user_ratings['item_id'].sample(min(top_n, len(user_ratings))).tolist() if not user_ratings.empty else \
                   interactions_df['item_id'].sample(top_n).tolist()
    
    cb_recs = get_similar_items(svd_recs[0], top_n) if svd_recs else interactions_df['item_id'].sample(top_n).tolist()
    combined = pd.Series(svd_recs + cb_recs).value_counts().index.tolist()
    return combined[:top_n]

# --- Optimized Evaluation ---
def precision_at_k_hybrid(test_df, k=5, threshold=3):
    test_df = pd.DataFrame(testset, columns=['user_id', 'item_id', 'rating'])
    sampled_users = test_df['user_id'].drop_duplicates().sample(frac=0.2, random_state=42)
    
    precisions = []
    for uid in sampled_users:
        recs = hybrid_recommendation(uid, k)
        user_ratings = test_df[(test_df['user_id'] == uid) & (test_df['rating'] >= threshold)]
        relevant_items = set(user_ratings['item_id'])
        hits = len(set(recs) & relevant_items)
        precisions.append(hits / k if k > 0 else 0)
    return np.mean(precisions)

# --- Execute ---
if __name__ == "__main__":
    svd.fit(trainset)  # Ensure model is fitted
    rmse = accuracy.rmse(svd_predictions, verbose=False)
    precision = precision_at_k_hybrid(pd.DataFrame(testset, columns=['user_id', 'item_id', 'rating']), k=5, threshold=3)
    
    print(f"Optimized RMSE: {rmse:.4f}")
    print(f"Sampled Precision@5: {precision:.4f}")

Interactions_df shape: (811499, 3)
Rating distribution:
 rating
1    719082
2     48417
5     18494
3     17191
4      8315
Name: count, dtype: int64
Optimized RMSE: 0.7176
Sampled Precision@5: 0.0110


In [4]:
import os
import time
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import implicit
from implicit.nearest_neighbours import bm25_weight
from implicit.evaluation import precision_at_k
import joblib
from numba import jit
from tqdm import tqdm
import optuna

# Configuration (Optimized for 12GB RAM)
DATA_DIR = "../data/preprocessed_data/"
MODEL_DIR = "../models/"
RESULTS_DIR = "../results/"
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)

# Hardware-Aware Settings
MAX_THREADS = 4
BATCH_SIZE = 512
MAX_MEMORY_USAGE = 10_000_000_000

def load_sparse_matrix():
    """Memory-mapped sparse matrix loading"""
    matrix_path = os.path.join(DATA_DIR, "user_item_sparse.pkl")
    if os.path.exists(matrix_path):
        matrix = joblib.load(matrix_path)
        return matrix.tocsr().astype(np.float32)
    else:
        raise FileNotFoundError(f"Sparse matrix not found at {matrix_path}")

class OptimizedHybridRecommender:
    def __init__(self, n_factors=80, iterations=12, regularization=0.1, 
                 alpha=0.8, k=5, diversity_weight=0.2):
        self.model = implicit.als.AlternatingLeastSquares(
            factors=n_factors,
            iterations=iterations,
            regularization=regularization,
            random_state=42,
            use_gpu=False,
            num_threads=MAX_THREADS
        )
        self.alpha = alpha
        self.k = k
        self.diversity_weight = diversity_weight
        self.item_features = None
        self.user_item_matrix = None
        self.popular_items = None

    def fit(self, user_item_matrix, item_features):
        weighted_matrix = self._batch_weight(user_item_matrix)
        self.model.fit(weighted_matrix.T * self.alpha)
        self.item_features = item_features.astype(np.float32)
        self.user_item_matrix = user_item_matrix
        self._calculate_popularity()

    def _batch_weight(self, matrix):
        batch_size = BATCH_SIZE
        n_users = matrix.shape[0]
        weighted_data = []
        for start in tqdm(range(0, n_users, batch_size), desc="Batch Weighting"):
            end = min(start + batch_size, n_users)
            batch = matrix[start:end].copy()
            weighted_batch = bm25_weight(batch, K1=100, B=0.8)
            weighted_data.append(weighted_batch)
        return csr_matrix(np.vstack(weighted_data)) if weighted_data else csr_matrix(matrix.shape)

    def _calculate_popularity(self):
        item_counts = np.zeros(self.user_item_matrix.shape[1], dtype=np.float32)
        for i in tqdm(range(self.user_item_matrix.shape[1]), desc="Popularity Calc"):
            item_counts[i] = self.user_item_matrix[:,i].sum()
        self.popular_items = np.argpartition(-item_counts, self.k*2)[:self.k*2]

    def recommend(self, user_id, user_items):
        ids, _ = self.model.recommend(
            user_id, 
            user_items,
            N=self.k,
            filter_already_liked=True,
            recalculate_user=False
        )
        if len(ids) < self.k:
            needed = self.k - len(ids)
            ids = np.concatenate([ids, self.popular_items[:needed]])
        return ids[:self.k]

def optimize_hyperparameters(trial):
    return {
        'n_factors': trial.suggest_int('n_factors', 64, 128),
        'iterations': trial.suggest_int('iterations', 8, 15),
        'regularization': trial.suggest_float('regularization', 0.05, 0.3),
        'alpha': trial.suggest_float('alpha', 0.5, 1.2),
        'diversity_weight': trial.suggest_float('diversity_weight', 0.1, 0.3)
    }

def main():
    start_mem = psutil.virtual_memory().used if psutil else 0
    print("Loading data...")
    user_item_matrix = load_sparse_matrix()
    # Load item features from CSV instead of .pkl
    item_features_df = pd.read_csv(os.path.join(DATA_DIR, "item_features.csv"))
    # Convert item_features_df to a sparse matrix if needed (e.g., one-hot encode categories)
    item_features = pd.get_dummies(item_features_df['categoryid']).astype(np.float32).values
    
    # Train-test split
    train, test = implicit.evaluation.train_test_split(
        user_item_matrix, 
        train_percentage=0.8,
        random_state=42
    )
    
    # Hyperparameter optimization
    print("Optimizing hyperparameters...")
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, train, test), n_trials=10, timeout=1200)
    
    # Train final model
    print("Training final model...")
    best_params = study.best_params
    recommender = OptimizedHybridRecommender(**best_params)
    recommender.fit(train, item_features)
    
    # Evaluate
    print("Evaluating model...")
    precision = precision_at_k(recommender.model, train, test, K=5, num_threads=MAX_THREADS)
    
    print(f"\nFinal Metrics:")
    print(f"Precision@5: {precision:.4f}")
    
    # Save model
    joblib.dump(recommender, os.path.join(MODEL_DIR, "optimized_recommender.pkl"), compress=3)

def objective(trial, train, test):
    if psutil and (psutil.virtual_memory().available < MAX_MEMORY_USAGE):
        raise optuna.TrialPruned("Memory limit exceeded")
    
    params = optimize_hyperparameters(trial)
    model = OptimizedHybridRecommender(**params)
    model.fit(train, item_features)
    return precision_at_k(model.model, train, test, K=5, num_threads=MAX_THREADS)

if __name__ == "__main__":
    start_time = time.time()
    try:
        import psutil
    except ImportError:
        psutil = None
    
    main()
    print(f"\nTotal execution time: {(time.time() - start_time)/60:.2f} minutes")

Loading data...


[I 2025-03-13 11:36:34,294] A new study created in memory with name: no-name-72795ab1-82d0-42c7-938b-137fc98ead13
[I 2025-03-13 11:36:34,370] Trial 0 pruned. Memory limit exceeded
[I 2025-03-13 11:36:34,387] Trial 1 pruned. Memory limit exceeded
[I 2025-03-13 11:36:34,414] Trial 2 pruned. Memory limit exceeded
[I 2025-03-13 11:36:34,429] Trial 3 pruned. Memory limit exceeded


Optimizing hyperparameters...


[I 2025-03-13 11:36:34,443] Trial 4 pruned. Memory limit exceeded
[I 2025-03-13 11:36:34,458] Trial 5 pruned. Memory limit exceeded
[I 2025-03-13 11:36:34,474] Trial 6 pruned. Memory limit exceeded
[I 2025-03-13 11:36:34,499] Trial 7 pruned. Memory limit exceeded
[I 2025-03-13 11:36:34,515] Trial 8 pruned. Memory limit exceeded
[I 2025-03-13 11:36:34,643] Trial 9 pruned. Memory limit exceeded


Training final model...


ValueError: No trials are completed yet.

## Summary

# We have trained an SVD model with tuned parameters on the corrected user-item interaction matrix.
# - RMSE and Precision@5 indicate the model's performance.
# - This model addresses Q1 (personalization) and Q5 (anomaly filtering).

## Next Steps

# - Address Q4 (popular products): Analyze item popularity.
# - Address Q6 (diversity): Incorporate item and category features for a hybrid model.
# - Address Q7 (algorithm comparison): Compare SVD with other algorithms (e.g., KNN, NMF).