# ALS (Alternating Least Squares) Experiment

In [2]:
import sys
from pathlib import Path

# Put the parent of the current working directory on the import path so the
# `src.*` imports in the next cell can resolve.
# NOTE(review): assumes the notebook is run from a folder one level below the
# project root — confirm.
PROJECT_ROOT = Path.cwd().parent
# Guard the append so re-running this cell does not stack duplicate entries.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src.data_reading import read_ratings_file
from src.evaluation import temporal_split, evaluate_rmse, evaluate_mape, evaluate_precision_at_k
from src.models.als import solve_with_als

# Fix the seed of NumPy's global RNG so repeated runs draw the same numbers.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## Load and Split Data

We use a temporal split so that the evaluation is realistic: the model is trained on past ratings, tuned on the ratings that follow, and tested on the most recent ones.

In [4]:
# Read the complete ratings table and report how many rows it holds.
ratings = read_ratings_file()
print(f"Loaded {len(ratings)} ratings")

# Chronological three-way split: train on the past, validate on the middle
# slice, test on the future.
split_ratios = {"test_ratio": 0.2, "val_ratio": 0.1}
train, val, test = temporal_split(ratings, **split_ratios)

Loaded 1000209 ratings
Train set size: (700148, 4)
Validation set size: (100020, 4)
Test set size: (200041, 4)
Train timeframe: 2000-04-25 23:05:32 - 2000-11-22 03:06:26
Val timeframe: 2000-11-22 03:06:30 - 2000-12-02 14:52:18
Test timeframe: 2000-12-02 14:52:28 - 2003-02-28 17:49:50


## Create Ratings Matrix

In [5]:
# User x movie table of training ratings; pairs without a rating are filled with 0.
train_matrix = train.pivot_table(
    values='rating',
    index='user_id',
    columns='movie_id',
    fill_value=0,
)

# Sparsity = share of cells equal to 0 (the fill value used above).
n_cells = train_matrix.shape[0] * train_matrix.shape[1]
n_empty = (train_matrix == 0).sum().sum()
print(f"Training matrix shape: {train_matrix.shape}")
print(f"Sparsity: {n_empty / n_cells * 100:.2f}%")

# No cold start: keep only validation/test rows whose user AND movie were
# both seen during training.
train_users = train.user_id.unique()
train_movies = train.movie_id.unique()

val_is_known = val.user_id.isin(train_users) & val.movie_id.isin(train_movies)
val = val[val_is_known]

test_is_known = test.user_id.isin(train_users) & test.movie_id.isin(train_movies)
test = test[test_is_known]

print(f"\nFiltered validation set size: {val.shape[0]} ratings")
print(f"Filtered test set size: {test.shape[0]} ratings")

Training matrix shape: (4870, 3633)
Sparsity: 96.04%

Filtered validation set size: 26246 ratings
Filtered test set size: 84804 ratings


## Hyperparameter Tuning

We'll train models with different hyperparameter combinations and select the best based on validation set performance.

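**Hyperparameters to tune:**
- **factors**: Number of latent factors
- **alpha**: Confidence scaling parameter for implicit feedback

Regularization is held fixed at 0.1, and each candidate is trained for a single ALS iteration during the search.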

In [5]:
# Candidate values for the search; every (factors, alpha) pair is tried once.
param_grid = {
    'factors': [10, 40],
    'alpha': [20, 60],
}

# Regularization is not tuned here; the same value is used for every run.
REGULARIZATION = 0.1

# Flatten the grid up front so the search is a single loop.
grid = [
    (n_factors, confidence)
    for n_factors in param_grid['factors']
    for confidence in param_grid['alpha']
]

results = []

for factors, alpha in grid:
    print(f"Training: factors={factors}, alpha={alpha}")

    # One ALS iteration per candidate during the search.
    predictions = solve_with_als(
        train_matrix,
        factors=factors,
        alpha=alpha,
        regularization=REGULARIZATION,
        iterations=1,
        verbose=False,
    )

    def predict(user_id, movie_id, pred_matrix=predictions):
        """Look up the predicted score for one (user, movie) pair.

        `pred_matrix` is bound as a default argument so this function keeps
        the matrix of the current grid point. Returns NaN when the lookup
        raises KeyError or IndexError.
        """
        try:
            score = pred_matrix.loc[user_id, movie_id]
        except (KeyError, IndexError):
            score = np.nan
        return score

    val_rmse = evaluate_rmse(test=val, predict_fn=predict)

    results.append(dict(
        factors=factors,
        alpha=alpha,
        regularization=REGULARIZATION,
        val_rmse=val_rmse,
    ))

    print(f"  Validation RMSE: {val_rmse:.4f}\n")


Training: factors=10, alpha=20
  Validation RMSE: 3.6193

Training: factors=10, alpha=60
  Validation RMSE: 3.5392

Training: factors=40, alpha=20
  Validation RMSE: 3.5752

Training: factors=40, alpha=60
  Validation RMSE: 3.4497



## Select Best Model

Find the hyperparameters with the lowest validation RMSE.

In [6]:
# Tabulate the search results (leaving out any 'predictions' entry) and rank
# them so the lowest validation RMSE comes first.
records = [
    {key: value for key, value in run.items() if key != 'predictions'}
    for run in results
]
results_df = pd.DataFrame(records).sort_values('val_rmse')

# The top row is the winning configuration; cast to plain Python types.
best_config = results_df.iloc[0]
best_params = dict(
    factors=int(best_config['factors']),
    alpha=float(best_config['alpha']),
)

print(f"\nBest hyperparameters:")
print(f"  factors={best_params['factors']}, alpha={best_params['alpha']}")
print(f"  Validation RMSE: {best_config['val_rmse']:.4f}")


Best hyperparameters:
  factors=40, alpha=60.0
  Validation RMSE: 3.4497


## Retrain Best Model

Retrain with the best hyperparameters, using more iterations (15 instead of 1) for better convergence.

In [6]:
# Refit on the training matrix with the configuration selected on the
# validation set, running 15 ALS iterations instead of the single iteration
# used during the grid search. Both tuned values are read from `best_params`
# so this cell always matches the selection step.
final_predictions = solve_with_als(
    train_matrix,
    alpha=best_params['alpha'],
    iterations=15,
    factors=best_params['factors'],
    regularization=0.1,
    verbose=True
)



Iteration 1/15
Solving for users (fixed items)...
Solving for items (fixed users)...

Iteration 2/15
Solving for users (fixed items)...
Solving for items (fixed users)...

Iteration 3/15
Solving for users (fixed items)...
Solving for items (fixed users)...

Iteration 4/15
Solving for users (fixed items)...
Solving for items (fixed users)...

Iteration 5/15
Solving for users (fixed items)...
Solving for items (fixed users)...

Iteration 6/15
Solving for users (fixed items)...
Solving for items (fixed users)...

Iteration 7/15
Solving for users (fixed items)...
Solving for items (fixed users)...

Iteration 8/15
Solving for users (fixed items)...
Solving for items (fixed users)...

Iteration 9/15
Solving for users (fixed items)...
Solving for items (fixed users)...

Iteration 10/15
Solving for users (fixed items)...
Solving for items (fixed users)...

Iteration 11/15
Solving for users (fixed items)...
Solving for items (fixed users)...

Iteration 12/15
Solving for users (fixed items)...


## Final Test Evaluation

Evaluate the best model on the held-out test set.

In [8]:
def predict_final(user_id, movie_id):
    """Return the final model's predicted score for one (user, movie) pair.

    The value is read from the module-level `final_predictions` table by
    label. If the lookup raises KeyError or IndexError, NaN is returned.
    """
    try:
        score = final_predictions.loc[user_id, movie_id]
    except (KeyError, IndexError):
        score = np.nan
    return score

def recommend_k(user_id, test, k, **kwargs):
    """Return the labels of the k highest-scoring movies for a user.

    Scores come from the user's row of the module-level `final_predictions`
    table. `test` and any extra keyword arguments are accepted but not used.
    If selecting the top k raises KeyError or IndexError, an empty array is
    returned.
    """
    # NOTE(review): candidates are not filtered against movies the user
    # already rated in training — confirm that is what the evaluator expects.
    try:
        top_scores = final_predictions.loc[user_id].nlargest(k)
    except (KeyError, IndexError):
        return np.array([])
    return top_scores.index.values


# Rating-prediction error of the final model on the held-out test set.
test_rmse = evaluate_rmse(test=test, predict_fn=predict_final)
print(f"RMSE: {test_rmse:.4f}")

test_mape = evaluate_mape(test=test, predict_fn=predict_final)
print(f"MAPE: {test_mape:.2f}%")

# Precision of the top-k recommendations at several list lengths.
for k in (5, 10, 20):
    precision_k = evaluate_precision_at_k(k=k, test=test, recommend_k_fn=recommend_k)
    print(f"\nPrecision@{k}: {precision_k:.4f}")

# Closing summary: separator line, then the configuration that was evaluated.
separator = "=" * 50
print("\n" + separator)
print(f"Best hyperparameters: factors={best_params['factors']}, alpha={best_params['alpha']}")


RMSE: 3.1120
MAPE: 81.38%

Precision@5: 0.0684

Precision@10: 0.0619

Precision@20: 0.0588

Best hyperparameters: factors=40, alpha=60
