# SVD (Singular Value Decomposition) Experiment


In [2]:
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
sys.path.append(str(PROJECT_ROOT))

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src.data_reading import read_ratings_file
from src.evaluation import temporal_split, evaluate_rmse, evaluate_mape, evaluate_precision_at_k
from src.models.svd import solve_with_svd

np.random.seed(42)

## Load and Split Data

Using temporal split to ensure realistic evaluation (train on past, test on future).

In [5]:
ratings = read_ratings_file()
print(f"Loaded {len(ratings)} ratings")

# Temporal split: train on past, validate on middle, test on future
train, val, test = temporal_split(ratings, test_ratio=0.2, val_ratio=0.1)

Loaded 1000209 ratings
Train set size: (700148, 4)
Validation set size: (100020, 4)
Test set size: (200041, 4)
Train timeframe: 2000-04-25 23:05:32 - 2000-11-22 03:06:26
Val timeframe: 2000-11-22 03:06:30 - 2000-12-02 14:52:18
Test timeframe: 2000-12-02 14:52:28 - 2003-02-28 17:49:50


## Create Ratings Matrix

In [6]:
train_matrix = train.pivot_table(
    index='user_id',
    columns='movie_id',
    values='rating',
    fill_value=0
)

print(f"Training matrix shape: {train_matrix.shape}")
print(f"Sparsity: {(train_matrix == 0).sum().sum() / (train_matrix.shape[0] * train_matrix.shape[1]) * 100:.2f}%")

# Filter validation and test sets to only include users/movies in training set (no cold start)
train_users = train.user_id.unique()
train_movies = train.movie_id.unique()

val = val[(val.user_id.isin(train_users)) & (val.movie_id.isin(train_movies))]
test = test[(test.user_id.isin(train_users)) & (test.movie_id.isin(train_movies))]

print(f"\nFiltered validation set size: {val.shape[0]} ratings")
print(f"Filtered test set size: {test.shape[0]} ratings")

Training matrix shape: (4870, 3633)
Sparsity: 96.04%

Filtered validation set size: 26246 ratings
Filtered test set size: 84804 ratings


## Hyperparameter Tuning

We'll train models with different hyperparameter combinations and select the best based on validation set performance.

**Hyperparameters to tune:**
- **k** (latent factors): Number of dimensions in latent space
- **lr** (learning rate): Step size for gradient descent

For faster experimentation, we use a reduced number of epochs during tuning.

In [5]:
param_grid = {
    'k': [10, 40],
    'lr': [0.003, 0.01],
}

results = []

for k in param_grid['k']:
    for lr in param_grid['lr']:
        print(f"Training: k={k}, lr={lr}")
        
        predictions = solve_with_svd(
            train_matrix, 
            k=k, 
            n_epochs=30,  
            lr=lr, 
            reg=0.05,
            verbose=True
        )
        
        def predict(user_id, movie_id, pred_matrix=predictions):
            try:
                return pred_matrix.loc[user_id, movie_id]
            except (KeyError, IndexError):
                return np.nan
        
        val_rmse = evaluate_rmse(test=val, predict_fn=predict)
        
        results.append({
            'k': k,
            'lr': lr,
            'val_rmse': val_rmse,
            'predictions': predictions
        })
        
        print(f"  Validation RMSE: {val_rmse:.4f}\n")



Training: k=10, lr=0.003
Epoch 1/30
Epoch 6/30
Epoch 11/30
Epoch 16/30
  Epoch 20/30: RMSE = 0.8772
Epoch 21/30
Epoch 26/30
  Validation RMSE: 0.9434

Training: k=10, lr=0.01
Epoch 1/30
Epoch 6/30
Epoch 11/30
Epoch 16/30
  Epoch 20/30: RMSE = 0.8166
Epoch 21/30
Epoch 26/30
  Validation RMSE: 0.9118

Training: k=40, lr=0.003
Epoch 1/30
Epoch 6/30
Epoch 11/30
Epoch 16/30
  Epoch 20/30: RMSE = 0.8548
Epoch 21/30
Epoch 26/30
  Validation RMSE: 0.9469

Training: k=40, lr=0.01
Epoch 1/30
Epoch 6/30
Epoch 11/30
Epoch 16/30
  Epoch 20/30: RMSE = 0.7515
Epoch 21/30
Epoch 26/30
  Validation RMSE: 0.9218



## Select Best Model

Find the hyperparameters with lowest validation RMSE.

In [6]:
results_df = pd.DataFrame([{k: v for k, v in r.items() if k != 'predictions'} for r in results])
results_df = results_df.sort_values('val_rmse')

best_config = results_df.iloc[0]
best_params = {
    'k': int(best_config['k']),
    'lr': float(best_config['lr']),
}

print(f"  k={best_params['k']}, lr={best_params['lr']}")
print(f"  Validation RMSE: {best_config['val_rmse']:.4f}")

  k=10, lr=0.01
  Validation RMSE: 0.9118


## Retrain Best Model

Retrain with best hyperparameters using more epochs for better convergence.

In [7]:
final_predictions = solve_with_svd(
    train_matrix,
    k=best_params['k'],
    n_epochs=50,  
    lr=best_params['lr'],
    reg=0.05,
    verbose=True
)


Epoch 1/50
Epoch 6/50
Epoch 11/50
Epoch 16/50
  Epoch 20/50: RMSE = 0.8174
Epoch 21/50
Epoch 26/50
Epoch 31/50
Epoch 36/50
  Epoch 40/50: RMSE = 0.7987
Epoch 41/50
Epoch 46/50


## Final Test Evaluation

Evaluate the best model on the held-out test set.

In [9]:
def predict_final(user_id, movie_id):
    try:
        return final_predictions.loc[user_id, movie_id]
    except (KeyError, IndexError):
        return np.nan

def recommend_k(user_id, test, k, **kwargs):
    """Recommend top-k movies for a user based on predicted ratings."""
    try:
        user_preds = final_predictions.loc[user_id]
        # Return top k movie IDs
        return user_preds.nlargest(k).index.values
    except (KeyError, IndexError):
        return np.array([])

print("Final Test Set Performance:")
print("=" * 50)

test_rmse = evaluate_rmse(test=test, predict_fn=predict_final)
print(f"RMSE: {test_rmse:.4f}")

test_mape = evaluate_mape(test=test, predict_fn=predict_final)
print(f"MAPE: {test_mape:.2f}%")

# Evaluate ranking metrics at different K values
for k in [5, 10, 20]:
    precision_k = evaluate_precision_at_k(test=test, recommend_k_fn=recommend_k, k=k)
    print(f"\nPrecision@{k}: {precision_k:.4f}")

print("\n" + "=" * 50)
print(f"Best hyperparameters: k={best_params['k']}, lr={best_params['lr']}")

Final Test Set Performance:
RMSE: 0.9151
MAPE: 27.79%

Precision@5: 0.0637

Precision@10: 0.0632

Precision@20: 0.0633

Best hyperparameters: k=10, lr=0.01
