# Phase 4: Model Training
## MovieLens 32M Dataset (30% Sample)

**Models to Train:**
1. **Baseline Models** - Popularity, Random, Global Mean
2. **Matrix Factorization** - Custom SVD (NumPy/SciPy)
3. **Content-Based** - Genre similarity
4. **Hybrid** - Weighted combination of MF + Content-Based

**Evaluation Metrics:**
- RMSE, MAE (rating prediction)
- Precision@K, Recall@K, NDCG@K (ranking)
- Coverage, Diversity (beyond accuracy)

In [1]:
import numpy as np
import pandas as pd
import pickle
import os
from scipy.sparse import load_npz, csr_matrix, lil_matrix
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import time

import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)
print("Libraries loaded (no external dependencies needed).")

Libraries loaded (no external dependencies needed).


In [None]:
# ===========================================
# CONFIGURATION
# ===========================================

ML_READY_PATH = 'data/ml_ready'
MODELS_PATH = '/models'

# Evaluation settings
TOP_K = [5, 10, 20]  # For Precision@K, Recall@K, NDCG@K
RELEVANCE_THRESHOLD = 4.0  # Rating >= this is "relevant"

# SVD settings
N_FACTORS = 50  # Number of latent factors (reduced for speed)

os.makedirs(MODELS_PATH, exist_ok=True)

print("CONFIGURATION")
print("=" * 50)
print(f"Data path: {ML_READY_PATH}")
print(f"Models path: {MODELS_PATH}")
print(f"Top-K values: {TOP_K}")
print(f"Relevance threshold: {RELEVANCE_THRESHOLD}")
print(f"SVD factors: {N_FACTORS}")

CONFIGURATION
Data path: D:/Courses/DL INTERNSHIP/THIRD PROJECT/data/ml_ready
Models path: D:/Courses/DL INTERNSHIP/THIRD PROJECT/models
Top-K values: [5, 10, 20]
Relevance threshold: 4.0
SVD factors: 50


---
## 1. Load Preprocessed Data

In [3]:
print("=" * 60)
print("1. LOAD DATA")
print("=" * 60)

# Load splits
train_df = pd.read_parquet(f'{ML_READY_PATH}/train.parquet')
val_df = pd.read_parquet(f'{ML_READY_PATH}/val.parquet')
test_df = pd.read_parquet(f'{ML_READY_PATH}/test.parquet')

print(f"\nTrain: {len(train_df):,} ratings")
print(f"Val: {len(val_df):,} ratings")
print(f"Test: {len(test_df):,} ratings")

1. LOAD DATA

Train: 6,690,428 ratings
Val: 1,437,861 ratings
Test: 1,468,027 ratings


In [4]:
# Load sparse matrices
train_sparse = load_npz(f'{ML_READY_PATH}/train_sparse.npz')

print(f"\nSparse matrix shape: {train_sparse.shape}")
print(f"Density: {train_sparse.nnz / (train_sparse.shape[0] * train_sparse.shape[1]) * 100:.4f}%")


Sparse matrix shape: (60284, 27498)
Density: 0.4036%


In [5]:
# Load mappings and stats
with open(f'{ML_READY_PATH}/mappings.pkl', 'rb') as f:
    mappings = pickle.load(f)

with open(f'{ML_READY_PATH}/stats.pkl', 'rb') as f:
    stats = pickle.load(f)

with open(f'{ML_READY_PATH}/eval_data.pkl', 'rb') as f:
    eval_data = pickle.load(f)

n_users = mappings['n_users']
n_items = mappings['n_items']
global_mean = stats['global_mean']
user_bias = stats['user_bias']
item_bias = stats['item_bias']
item_popularity = stats['item_popularity']

print(f"\nUsers: {n_users:,}")
print(f"Items: {n_items:,}")
print(f"Global mean: {global_mean:.4f}")


Users: 60,284
Items: 27,498
Global mean: 3.5377


In [6]:
# Load genre features
genre_features = np.load(f'{ML_READY_PATH}/genre_features.npy')
genre_names = mappings['genre_names']

print(f"\nGenre features: {genre_features.shape}")
print(f"Genres: {genre_names}")


Genre features: (27498, 19)
Genres: ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']


In [7]:
# Evaluation data
val_user_items = eval_data['val_user_items']
test_user_items = eval_data['test_user_items']
val_relevant = eval_data['val_relevant']
test_relevant = eval_data['test_relevant']
user_positive_items = eval_data['user_positive_items']

print(f"\nVal users: {len(val_user_items):,}")
print(f"Test users: {len(test_user_items):,}")


Val users: 60,284
Test users: 60,284


---
## 2. Evaluation Metrics

In [8]:
def rmse(predictions, actuals):
    """Root Mean Square Error"""
    return np.sqrt(np.mean((np.array(predictions) - np.array(actuals)) ** 2))

def mae(predictions, actuals):
    """Mean Absolute Error"""
    return np.mean(np.abs(np.array(predictions) - np.array(actuals)))

def precision_at_k(recommended, relevant, k):
    """Precision@K"""
    if len(recommended) == 0:
        return 0.0
    recommended_k = set(recommended[:k])
    return len(recommended_k & relevant) / k

def recall_at_k(recommended, relevant, k):
    """Recall@K"""
    if len(relevant) == 0:
        return 0.0
    recommended_k = set(recommended[:k])
    return len(recommended_k & relevant) / len(relevant)

def ndcg_at_k(recommended, relevant, k):
    """Normalized Discounted Cumulative Gain@K"""
    dcg = 0.0
    for i, item in enumerate(recommended[:k]):
        if item in relevant:
            dcg += 1.0 / np.log2(i + 2)
    
    # Ideal DCG
    idcg = sum(1.0 / np.log2(i + 2) for i in range(min(len(relevant), k)))
    
    return dcg / idcg if idcg > 0 else 0.0

def coverage(all_recommended, n_items):
    """Catalog coverage - % of items recommended at least once"""
    unique_items = set()
    for items in all_recommended:
        unique_items.update(items)
    return len(unique_items) / n_items * 100

print("Evaluation functions defined.")

Evaluation functions defined.


In [9]:
def evaluate_ranking(model_name, recommendations, relevant_items, ks=[5, 10, 20]):
    """
    Evaluate ranking metrics for a model.
    """
    results = {'model': model_name}
    
    for k in ks:
        precisions = []
        recalls = []
        ndcgs = []
        
        for user_idx, recs in recommendations.items():
            if user_idx in relevant_items and len(relevant_items[user_idx]) > 0:
                rel = relevant_items[user_idx]
                precisions.append(precision_at_k(recs, rel, k))
                recalls.append(recall_at_k(recs, rel, k))
                ndcgs.append(ndcg_at_k(recs, rel, k))
        
        results[f'P@{k}'] = np.mean(precisions) if precisions else 0.0
        results[f'R@{k}'] = np.mean(recalls) if recalls else 0.0
        results[f'NDCG@{k}'] = np.mean(ndcgs) if ndcgs else 0.0
    
    # Coverage
    results['Coverage'] = coverage(list(recommendations.values()), n_items)
    
    return results

print("Ranking evaluation function defined.")

Ranking evaluation function defined.


In [10]:
# Store all results
all_results = []

---
## 3. Baseline Models

### 3.1 Global Mean Baseline

In [11]:
print("=" * 60)
print("3.1 GLOBAL MEAN BASELINE")
print("=" * 60)

# Predict global mean for all
val_predictions_mean = [global_mean] * len(val_df)
val_actuals = val_df['rating'].values

rmse_mean = rmse(val_predictions_mean, val_actuals)
mae_mean = mae(val_predictions_mean, val_actuals)

print(f"\nGlobal Mean: {global_mean:.4f}")
print(f"Val RMSE: {rmse_mean:.4f}")
print(f"Val MAE: {mae_mean:.4f}")

all_results.append({
    'model': 'Global Mean',
    'RMSE': rmse_mean,
    'MAE': mae_mean,
    'P@10': 0.0,
    'R@10': 0.0,
    'NDCG@10': 0.0,
    'Coverage': 0.0
})

3.1 GLOBAL MEAN BASELINE

Global Mean: 3.5377
Val RMSE: 1.0595
Val MAE: 0.8380


### 3.2 User-Item Bias Baseline

In [12]:
print("=" * 60)
print("3.2 USER-ITEM BIAS BASELINE")
print("=" * 60)

def predict_bias(user_idx, item_idx):
    """Predict: global_mean + user_bias + item_bias"""
    u_bias = user_bias.get(user_idx, 0)
    i_bias = item_bias.get(item_idx, 0)
    pred = global_mean + u_bias + i_bias
    return np.clip(pred, 0.5, 5.0)  # Clip to valid range

# Predict for validation
print("Predicting for validation set...")
val_predictions_bias = [
    predict_bias(row['user_idx'], row['item_idx'])
    for _, row in val_df.iterrows()
]

rmse_bias = rmse(val_predictions_bias, val_actuals)
mae_bias = mae(val_predictions_bias, val_actuals)

print(f"\nVal RMSE: {rmse_bias:.4f}")
print(f"Val MAE: {mae_bias:.4f}")
print(f"Improvement over Global Mean: {(rmse_mean - rmse_bias) / rmse_mean * 100:.1f}%")

all_results.append({
    'model': 'User-Item Bias',
    'RMSE': rmse_bias,
    'MAE': mae_bias,
    'P@10': 0.0,
    'R@10': 0.0,
    'NDCG@10': 0.0,
    'Coverage': 0.0
})

3.2 USER-ITEM BIAS BASELINE
Predicting for validation set...

Val RMSE: 0.8775
Val MAE: 0.6657
Improvement over Global Mean: 17.2%


### 3.3 Popularity Baseline

In [13]:
print("=" * 60)
print("3.3 POPULARITY BASELINE")
print("=" * 60)

# Rank items by popularity (number of ratings)
popularity_ranking = sorted(item_popularity.items(), key=lambda x: x[1], reverse=True)
popular_items = [item for item, _ in popularity_ranking]

print(f"\nTop 10 most popular items: {popular_items[:10]}")

3.3 POPULARITY BASELINE

Top 10 most popular items: [314, 351, 292, 2429, 582, 257, 2812, 472, 519, 4785]


In [None]:
# Generate recommendations (same for all users, excluding seen items)
def get_popularity_recommendations(user_idx, popular_items, user_positive, k=20):
    """Recommend popular items user hasn't seen."""
    seen = user_positive.get(user_idx, set())
    recs = [item for item in popular_items if item not in seen][:k]
    return recs

# Generate for all val users
popularity_recs = {}
for user_idx in val_user_items.keys():
    popularity_recs[user_idx] = get_popularity_recommendations(
        user_idx, popular_items, user_positive_items, k=max(TOP_K)
    )

# Evaluate
popularity_results = evaluate_ranking('Popularity', popularity_recs, val_relevant, TOP_K)
popularity_results['RMSE'] = '-'
popularity_results['MAE'] = '-'

print(f"\nPopularity Baseline Results:")
for k in TOP_K:
    print(f"  P@{k}: {popularity_results[f'P@{k}']:.4f}")
    print(f"  R@{k}: {popularity_results[f'R@{k}']:.4f}")
    print(f"  NDCG@{k}: {popularity_results[f'NDCG@{k}']:.4f}")
print(f"  Coverage: {popularity_results['Coverage']:.2f}%")

all_results.append(popularity_results)

Generating popularity recommendations...

Popularity Baseline Results:
  P@5: 0.0967
  R@5: 0.0574
  NDCG@5: 0.1112
  P@10: 0.0819
  R@10: 0.0933
  NDCG@10: 0.1136
  P@20: 0.0655
  R@20: 0.1461
  NDCG@20: 0.1249
  Coverage: 0.36%


---
## 4. Matrix Factorization (SVD)

Using scipy's truncated SVD on the sparse rating matrix.

In [None]:
print("=" * 60)
print("4. MATRIX FACTORIZATION (SVD)")
print("=" * 60)

# Convert to lil_matrix for efficient modification
train_centered = train_sparse.copy().tolil()
train_centered_csr = train_sparse.copy().tocsr()

# Subtract global mean from non-zero entries
train_centered_csr.data -= global_mean

print(f"Matrix shape: {train_centered_csr.shape}")
print(f"Non-zero entries: {train_centered_csr.nnz:,}")

4. MATRIX FACTORIZATION (SVD)

Preparing matrix for SVD...
Matrix shape: (60284, 27498)
Non-zero entries: 6,690,428


In [16]:
# Perform truncated SVD
print(f"\nPerforming SVD with {N_FACTORS} factors...")
start_time = time.time()

# svds returns U, sigma, Vt
U, sigma, Vt = svds(train_centered_csr.astype(np.float32), k=N_FACTORS)

# Convert sigma to diagonal matrix for reconstruction
sigma_diag = np.diag(sigma)

svd_time = time.time() - start_time
print(f"SVD completed in {svd_time:.1f} seconds")
print(f"\nU shape: {U.shape}")
print(f"Sigma: {sigma.shape}")
print(f"Vt shape: {Vt.shape}")


Performing SVD with 50 factors...
SVD completed in 2.0 seconds

U shape: (60284, 50)
Sigma: (50,)
Vt shape: (50, 27498)


In [17]:
# Create user and item latent factors
# User factors: U @ sqrt(sigma)
# Item factors: sqrt(sigma) @ Vt
sigma_sqrt = np.sqrt(sigma)

user_factors = U * sigma_sqrt  # (n_users, n_factors)
item_factors = (Vt.T * sigma_sqrt).T  # (n_factors, n_items) -> transpose to (n_items, n_factors)
item_factors = item_factors.T

print(f"\nUser factors: {user_factors.shape}")
print(f"Item factors: {item_factors.shape}")


User factors: (60284, 50)
Item factors: (27498, 50)


In [18]:
def predict_svd(user_idx, item_idx):
    """Predict rating using SVD factors + biases."""
    # Convert to int explicitly (fixes float index issue)
    user_idx = int(user_idx)
    item_idx = int(item_idx)
    
    # Base prediction from latent factors
    if user_idx < len(user_factors) and item_idx < len(item_factors):
        latent_pred = np.dot(user_factors[user_idx], item_factors[item_idx])
    else:
        latent_pred = 0
    
    # Add global mean and biases
    u_bias = user_bias.get(user_idx, 0)
    i_bias = item_bias.get(item_idx, 0)
    
    pred = global_mean + u_bias + i_bias + latent_pred
    return np.clip(pred, 0.5, 5.0)

print("SVD prediction function defined.")

SVD prediction function defined.


In [19]:
# Evaluate on validation set - Rating Prediction
print("\nEvaluating rating prediction on validation set...")

val_predictions_svd = []
for i, (_, row) in enumerate(val_df.iterrows()):
    if i % 200000 == 0:
        print(f"  Progress: {i:,}/{len(val_df):,}")
    # Convert to int explicitly
    user_idx = int(row['user_idx'])
    item_idx = int(row['item_idx'])
    pred = predict_svd(user_idx, item_idx)
    val_predictions_svd.append(pred)

rmse_svd = rmse(val_predictions_svd, val_actuals)
mae_svd = mae(val_predictions_svd, val_actuals)

print(f"\nSVD Results (Rating Prediction):")
print(f"  Val RMSE: {rmse_svd:.4f}")
print(f"  Val MAE: {mae_svd:.4f}")
print(f"  Improvement over Bias: {(rmse_bias - rmse_svd) / rmse_bias * 100:.1f}%")


Evaluating rating prediction on validation set...
  Progress: 0/1,437,861
  Progress: 200,000/1,437,861
  Progress: 400,000/1,437,861
  Progress: 600,000/1,437,861
  Progress: 800,000/1,437,861
  Progress: 1,000,000/1,437,861
  Progress: 1,200,000/1,437,861
  Progress: 1,400,000/1,437,861

SVD Results (Rating Prediction):
  Val RMSE: 0.8707
  Val MAE: 0.6568
  Improvement over Bias: 0.8%


In [20]:
# Precompute all predicted ratings for faster recommendation
print("\nPrecomputing predicted ratings matrix...")
start_time = time.time()

# Compute full prediction matrix: U @ sigma @ Vt + global_mean
# This is memory intensive but much faster for recommendations
predicted_ratings = np.dot(user_factors, item_factors.T)  # (n_users, n_items)

# Add biases
user_bias_array = np.array([user_bias.get(i, 0) for i in range(n_users)])
item_bias_array = np.array([item_bias.get(i, 0) for i in range(n_items)])

predicted_ratings += global_mean
predicted_ratings += user_bias_array.reshape(-1, 1)
predicted_ratings += item_bias_array.reshape(1, -1)

# Clip
predicted_ratings = np.clip(predicted_ratings, 0.5, 5.0)

print(f"Prediction matrix shape: {predicted_ratings.shape}")
print(f"Computation time: {time.time() - start_time:.1f} seconds")
print(f"Memory: {predicted_ratings.nbytes / 1024**3:.2f} GB")


Precomputing predicted ratings matrix...
Prediction matrix shape: (60284, 27498)
Computation time: 18.0 seconds
Memory: 6.18 GB


In [21]:
# Generate SVD recommendations
print("\nGenerating SVD recommendations...")

def get_svd_recommendations(user_idx, predicted_ratings, user_positive, k=20):
    """Get top-k recommendations for a user using precomputed SVD predictions."""
    seen = user_positive.get(user_idx, set())
    
    # Get predictions for this user
    user_preds = predicted_ratings[user_idx].copy()
    
    # Set seen items to -inf so they won't be recommended
    for item in seen:
        if item < len(user_preds):
            user_preds[item] = -np.inf
    
    # Get top-k indices
    top_k_items = np.argsort(user_preds)[-k:][::-1]
    
    return top_k_items.tolist()

# Sample users for efficiency
sample_users = list(val_relevant.keys())[:3000]  # Sample 3000 users

svd_recs = {}
for i, user_idx in enumerate(sample_users):
    if i % 1000 == 0:
        print(f"  Processing user {i}/{len(sample_users)}")
    svd_recs[user_idx] = get_svd_recommendations(
        user_idx, predicted_ratings, user_positive_items, k=max(TOP_K)
    )

print(f"\nGenerated recommendations for {len(svd_recs)} users")


Generating SVD recommendations...
  Processing user 0/3000
  Processing user 1000/3000
  Processing user 2000/3000

Generated recommendations for 3000 users


In [22]:
# Evaluate ranking
svd_results = evaluate_ranking('SVD', svd_recs, val_relevant, TOP_K)
svd_results['RMSE'] = rmse_svd
svd_results['MAE'] = mae_svd

print(f"\nSVD Results (Ranking):")
for k in TOP_K:
    print(f"  P@{k}: {svd_results[f'P@{k}']:.4f}")
    print(f"  R@{k}: {svd_results[f'R@{k}']:.4f}")
    print(f"  NDCG@{k}: {svd_results[f'NDCG@{k}']:.4f}")
print(f"  Coverage: {svd_results['Coverage']:.2f}%")

all_results.append(svd_results)


SVD Results (Ranking):
  P@5: 0.0111
  R@5: 0.0034
  NDCG@5: 0.0128
  P@10: 0.0121
  R@10: 0.0073
  NDCG@10: 0.0137
  P@20: 0.0115
  R@20: 0.0133
  NDCG@20: 0.0150
  Coverage: 6.77%


---
## 5. Content-Based Filtering (Genre Similarity)

In [23]:
print("=" * 60)
print("5. CONTENT-BASED FILTERING")
print("=" * 60)

# Compute item-item similarity based on genres
print("\nComputing genre similarity matrix...")
start_time = time.time()

# Normalize genre features (L2 norm)
genre_norms = np.linalg.norm(genre_features, axis=1, keepdims=True)
genre_norms[genre_norms == 0] = 1  # Avoid division by zero
genre_features_normalized = genre_features / genre_norms

# Compute cosine similarity
item_similarity = cosine_similarity(genre_features_normalized)

print(f"Similarity matrix shape: {item_similarity.shape}")
print(f"Computation time: {time.time() - start_time:.1f} seconds")

5. CONTENT-BASED FILTERING

Computing genre similarity matrix...
Similarity matrix shape: (27498, 27498)
Computation time: 3.1 seconds


In [24]:
# Build user-item rating lookup from training data
print("\nBuilding user-item rating lookup...")
user_item_ratings = defaultdict(dict)
for _, row in train_df.iterrows():
    user_item_ratings[row['user_idx']][row['item_idx']] = row['rating']

print(f"Built rating lookup for {len(user_item_ratings)} users")


Building user-item rating lookup...
Built rating lookup for 60284 users


In [None]:
def get_content_recommendations(user_idx, user_positive, item_similarity, user_item_ratings, k=20):
    """
    Content-based recommendations using genre similarity.
    """
    seen = user_positive.get(user_idx, set())
    if len(seen) == 0:
        return []
    
    # Get items the user liked (rating >= threshold)
    liked_items = []
    for item_idx in seen:
        if item_idx in user_item_ratings.get(user_idx, {}):
            if user_item_ratings[user_idx][item_idx] >= RELEVANCE_THRESHOLD:
                liked_items.append(item_idx)
    
    if len(liked_items) == 0:
        liked_items = list(seen)[:10]  # Fall back to some seen items
    
    # Score unseen items by average similarity to liked items
    n_items = item_similarity.shape[0]
    scores = np.zeros(n_items)
    
    for liked in liked_items:
        if liked < n_items:
            scores += item_similarity[liked]
    
    scores /= len(liked_items)
    
    # Set seen items to -inf
    for item in seen:
        if item < n_items:
            scores[item] = -np.inf
    
    # Get top-k
    top_k = np.argsort(scores)[-k:][::-1]
    
    return top_k.tolist()



Content-based recommendation function defined.


In [None]:
cb_recs = {}
for i, user_idx in enumerate(sample_users):
    if i % 1000 == 0:
        print(f"  Processing user {i}/{len(sample_users)}")
    cb_recs[user_idx] = get_content_recommendations(
        user_idx, user_positive_items, item_similarity, user_item_ratings, k=max(TOP_K)
    )

print(f"\nGenerated recommendations for {len(cb_recs)} users")


Generating content-based recommendations...
  Processing user 0/3000
  Processing user 1000/3000
  Processing user 2000/3000

Generated recommendations for 3000 users


In [27]:
# Evaluate
cb_results = evaluate_ranking('Content-Based', cb_recs, val_relevant, TOP_K)
cb_results['RMSE'] = '-'
cb_results['MAE'] = '-'

print(f"\nContent-Based Results:")
for k in TOP_K:
    print(f"  P@{k}: {cb_results[f'P@{k}']:.4f}")
    print(f"  R@{k}: {cb_results[f'R@{k}']:.4f}")
    print(f"  NDCG@{k}: {cb_results[f'NDCG@{k}']:.4f}")
print(f"  Coverage: {cb_results['Coverage']:.2f}%")

all_results.append(cb_results)


Content-Based Results:
  P@5: 0.0017
  R@5: 0.0008
  NDCG@5: 0.0017
  P@10: 0.0022
  R@10: 0.0028
  NDCG@10: 0.0025
  P@20: 0.0022
  R@20: 0.0053
  NDCG@20: 0.0035
  Coverage: 35.75%


---
## 6. Hybrid Model (SVD + Content-Based)

In [None]:
def get_hybrid_recommendations(user_idx, predicted_ratings, user_positive, 
                                item_similarity, user_item_ratings, alpha=0.7, k=20):
    """
    Hybrid recommendations combining SVD and content-based.
    
    Score = alpha * SVD_score + (1-alpha) * CB_score
    """
    seen = user_positive.get(user_idx, set())
    n_items_local = min(predicted_ratings.shape[1], item_similarity.shape[0])
    
    # SVD scores (normalize to 0-1)
    svd_scores = predicted_ratings[user_idx, :n_items_local].copy()
    svd_scores = (svd_scores - 0.5) / 4.5  # Normalize from [0.5, 5] to [0, 1]
    
    # Content-based scores
    liked_items = []
    for item_idx in seen:
        if item_idx in user_item_ratings.get(user_idx, {}):
            if user_item_ratings[user_idx][item_idx] >= RELEVANCE_THRESHOLD:
                liked_items.append(item_idx)
    
    if len(liked_items) == 0:
        liked_items = [item for item in seen if item < n_items_local][:10]
    
    cb_scores = np.zeros(n_items_local)
    if len(liked_items) > 0:
        for liked in liked_items:
            if liked < n_items_local:
                cb_scores += item_similarity[liked, :n_items_local]
        cb_scores /= len(liked_items)
    
    # Combine
    hybrid_scores = alpha * svd_scores + (1 - alpha) * cb_scores
    
    # Set seen items to -inf
    for item in seen:
        if item < n_items_local:
            hybrid_scores[item] = -np.inf
    
    # Get top-k
    top_k = np.argsort(hybrid_scores)[-k:][::-1]
    
    return top_k.tolist()



6. HYBRID MODEL
Hybrid recommendation function defined.


In [29]:
# Test different alpha values
alphas = [0.3, 0.5, 0.7, 0.9]
best_alpha = 0.7
best_ndcg = 0.0

print("\nTuning hybrid alpha on validation set...")

tune_users = sample_users[:1000]  # Smaller sample for tuning

for alpha in alphas:
    hybrid_recs_tune = {}
    for user_idx in tune_users:
        hybrid_recs_tune[user_idx] = get_hybrid_recommendations(
            user_idx, predicted_ratings, user_positive_items, 
            item_similarity, user_item_ratings, alpha=alpha, k=10
        )
    
    # Evaluate
    results = evaluate_ranking(f'Hybrid(α={alpha})', hybrid_recs_tune, val_relevant, [10])
    ndcg = results['NDCG@10']
    print(f"  α={alpha}: NDCG@10={ndcg:.4f}")
    
    if ndcg > best_ndcg:
        best_ndcg = ndcg
        best_alpha = alpha

print(f"\nBest alpha: {best_alpha}")


Tuning hybrid alpha on validation set...
  α=0.3: NDCG@10=0.0274
  α=0.5: NDCG@10=0.0366
  α=0.7: NDCG@10=0.0330
  α=0.9: NDCG@10=0.0271

Best alpha: 0.5


In [30]:
# Generate hybrid recommendations with best alpha
print(f"\nGenerating hybrid recommendations (α={best_alpha})...")

hybrid_recs = {}
for i, user_idx in enumerate(sample_users):
    if i % 1000 == 0:
        print(f"  Processing user {i}/{len(sample_users)}")
    hybrid_recs[user_idx] = get_hybrid_recommendations(
        user_idx, predicted_ratings, user_positive_items,
        item_similarity, user_item_ratings, alpha=best_alpha, k=max(TOP_K)
    )

print(f"\nGenerated recommendations for {len(hybrid_recs)} users")


Generating hybrid recommendations (α=0.5)...
  Processing user 0/3000
  Processing user 1000/3000
  Processing user 2000/3000

Generated recommendations for 3000 users


In [31]:
# Evaluate hybrid
hybrid_results = evaluate_ranking(f'Hybrid (α={best_alpha})', hybrid_recs, val_relevant, TOP_K)
hybrid_results['RMSE'] = '-'
hybrid_results['MAE'] = '-'

print(f"\nHybrid Results (α={best_alpha}):")
for k in TOP_K:
    print(f"  P@{k}: {hybrid_results[f'P@{k}']:.4f}")
    print(f"  R@{k}: {hybrid_results[f'R@{k}']:.4f}")
    print(f"  NDCG@{k}: {hybrid_results[f'NDCG@{k}']:.4f}")
print(f"  Coverage: {hybrid_results['Coverage']:.2f}%")

all_results.append(hybrid_results)


Hybrid Results (α=0.5):
  P@5: 0.0343
  R@5: 0.0167
  NDCG@5: 0.0397
  P@10: 0.0266
  R@10: 0.0271
  NDCG@10: 0.0376
  P@20: 0.0211
  R@20: 0.0404
  NDCG@20: 0.0397
  Coverage: 5.73%


---
## 7. Final Evaluation on Test Set

In [None]:
svd_test_recs = {}
for user_idx in test_sample_users:
    svd_test_recs[user_idx] = get_svd_recommendations(
        user_idx, predicted_ratings, user_positive_items, k=max(TOP_K)
    )

svd_test_results = evaluate_ranking('SVD', svd_test_recs, test_relevant, TOP_K)


SVD on test set...


In [None]:
hybrid_test_recs = {}
for user_idx in test_sample_users:
    hybrid_test_recs[user_idx] = get_hybrid_recommendations(
        user_idx, predicted_ratings, user_positive_items,
        item_similarity, user_item_ratings, alpha=best_alpha, k=max(TOP_K)
    )

hybrid_test_results = evaluate_ranking(f'Hybrid (α={best_alpha})', hybrid_test_recs, test_relevant, TOP_K)


Hybrid on test set...


In [None]:
test_actuals = test_df['rating'].values

test_predictions_svd = []
for i, (_, row) in enumerate(test_df.iterrows()):
    if i % 200000 == 0:
        print(f"  Progress: {i:,}/{len(test_df):,}")
    pred = predict_svd(row['user_idx'], row['item_idx'])
    test_predictions_svd.append(pred)

rmse_svd_test = rmse(test_predictions_svd, test_actuals)
mae_svd_test = mae(test_predictions_svd, test_actuals)

print(f"\nSVD Test Set Rating Prediction:")
print(f"  RMSE: {rmse_svd_test:.4f}")
print(f"  MAE: {mae_svd_test:.4f}")


Calculating test RMSE...
  Progress: 0/1,468,027
  Progress: 200,000/1,468,027
  Progress: 400,000/1,468,027
  Progress: 600,000/1,468,027
  Progress: 800,000/1,468,027
  Progress: 1,000,000/1,468,027
  Progress: 1,200,000/1,468,027
  Progress: 1,400,000/1,468,027

SVD Test Set Rating Prediction:
  RMSE: 0.8727
  MAE: 0.6576


In [36]:
# Print test results comparison
print("\n" + "=" * 60)
print("TEST SET RESULTS")
print("=" * 60)

print(f"\n{'Model':<20} {'NDCG@10':>10} {'P@10':>10} {'R@10':>10} {'Coverage':>10}")
print("-" * 62)
print(f"{'SVD':<20} {svd_test_results['NDCG@10']:>10.4f} {svd_test_results['P@10']:>10.4f} {svd_test_results['R@10']:>10.4f} {svd_test_results['Coverage']:>9.2f}%")
print(f"{'Hybrid':<20} {hybrid_test_results['NDCG@10']:>10.4f} {hybrid_test_results['P@10']:>10.4f} {hybrid_test_results['R@10']:>10.4f} {hybrid_test_results['Coverage']:>9.2f}%")


TEST SET RESULTS

Model                   NDCG@10       P@10       R@10   Coverage
--------------------------------------------------------------
SVD                      0.0152     0.0139     0.0077      6.77%
Hybrid                   0.0399     0.0284     0.0282      5.67%


---
## 8. Save Models and Results

In [37]:
print("=" * 60)
print("8. SAVE MODELS AND RESULTS")
print("=" * 60)

# Save SVD components
svd_model = {
    'user_factors': user_factors,
    'item_factors': item_factors,
    'global_mean': global_mean,
    'user_bias': user_bias,
    'item_bias': item_bias,
    'n_factors': N_FACTORS
}

with open(f'{MODELS_PATH}/svd_model.pkl', 'wb') as f:
    pickle.dump(svd_model, f)
print(f"\nSVD model saved to {MODELS_PATH}/svd_model.pkl")

# Save item similarity
np.save(f'{MODELS_PATH}/item_similarity.npy', item_similarity)
print(f"Item similarity saved to {MODELS_PATH}/item_similarity.npy")

# Save best alpha
hybrid_config = {
    'best_alpha': best_alpha,
    'relevance_threshold': RELEVANCE_THRESHOLD
}
with open(f'{MODELS_PATH}/hybrid_config.pkl', 'wb') as f:
    pickle.dump(hybrid_config, f)
print(f"Hybrid config saved to {MODELS_PATH}/hybrid_config.pkl")

8. SAVE MODELS AND RESULTS

SVD model saved to D:/Courses/DL INTERNSHIP/THIRD PROJECT/models/svd_model.pkl
Item similarity saved to D:/Courses/DL INTERNSHIP/THIRD PROJECT/models/item_similarity.npy
Hybrid config saved to D:/Courses/DL INTERNSHIP/THIRD PROJECT/models/hybrid_config.pkl


In [38]:
# Create and save results dataframe
results_df = pd.DataFrame(all_results)

# Reorder columns
cols = ['model', 'RMSE', 'MAE'] + [f'P@{k}' for k in TOP_K] + [f'R@{k}' for k in TOP_K] + [f'NDCG@{k}' for k in TOP_K] + ['Coverage']
cols = [c for c in cols if c in results_df.columns]
results_df = results_df[cols]

print("\n" + "=" * 100)
print("VALIDATION SET RESULTS SUMMARY")
print("=" * 100)
print(results_df.to_string(index=False))

# Save
results_df.to_csv(f'{MODELS_PATH}/validation_results.csv', index=False)
print(f"\nResults saved to {MODELS_PATH}/validation_results.csv")


VALIDATION SET RESULTS SUMMARY
         model      RMSE       MAE      P@5     P@10     P@20      R@5     R@10     R@20   NDCG@5  NDCG@10  NDCG@20  Coverage
   Global Mean  1.059519  0.838023      NaN 0.000000      NaN      NaN 0.000000      NaN      NaN 0.000000      NaN  0.000000
User-Item Bias  0.877511   0.66571      NaN 0.000000      NaN      NaN 0.000000      NaN      NaN 0.000000      NaN  0.000000
    Popularity         -         - 0.096741 0.081945 0.065525 0.057444 0.093266 0.146111 0.111190 0.113555 0.124933  0.356390
           SVD  0.870696  0.656772 0.011133 0.012133 0.011500 0.003365 0.007294 0.013347 0.012771 0.013742 0.015039  6.771402
 Content-Based         -         - 0.001733 0.002167 0.002250 0.000820 0.002782 0.005308 0.001718 0.002493 0.003504 35.751691
Hybrid (α=0.5)         -         - 0.034267 0.026600 0.021117 0.016665 0.027098 0.040356 0.039655 0.037567 0.039721  5.727689

Results saved to D:/Courses/DL INTERNSHIP/THIRD PROJECT/models/validation_results.csv

---
## Phase 4 Summary

In [None]:
print("=" * 70)
print("PHASE 4 SUMMARY: MODEL TRAINING")
print("=" * 70)

print("\n" + "-" * 70)
print("MODELS TRAINED")
print("-" * 70)
print("  1. Global Mean Baseline")
print("  2. User-Item Bias Baseline")
print("  3. Popularity Baseline")
print(f"  4. Matrix Factorization (SVD, {N_FACTORS} factors)")
print("  5. Content-Based (Genre Similarity)")
print(f"  6. Hybrid (α={best_alpha})")

print("\n" + "-" * 70)
print("BEST MODEL PERFORMANCE (Validation)")
print("-" * 70)

# Find best model for each metric
numeric_results = results_df[results_df['RMSE'] != '-'].copy()
if len(numeric_results) > 0:
    numeric_results['RMSE'] = numeric_results['RMSE'].astype(float)
    best_rmse_model = numeric_results.loc[numeric_results['RMSE'].idxmin(), 'model']
    best_rmse_val = numeric_results['RMSE'].min()
    print(f"  Best RMSE: {best_rmse_val:.4f} ({best_rmse_model})")

best_ndcg_idx = results_df['NDCG@10'].astype(float).idxmax()
best_ndcg_model = results_df.loc[best_ndcg_idx, 'model']
best_ndcg_val = results_df.loc[best_ndcg_idx, 'NDCG@10']
print(f"  Best NDCG@10: {best_ndcg_val:.4f} ({best_ndcg_model})")

best_prec_idx = results_df['P@10'].astype(float).idxmax()
best_prec_model = results_df.loc[best_prec_idx, 'model']
best_prec_val = results_df.loc[best_prec_idx, 'P@10']
print(f"  Best P@10: {best_prec_val:.4f} ({best_prec_model})")

print("\n" + "-" * 70)
print("TEST SET PERFORMANCE")
print("-" * 70)
print(f"  SVD RMSE: {rmse_svd_test:.4f}")
print(f"  SVD NDCG@10: {svd_test_results['NDCG@10']:.4f}")
print(f"  Hybrid NDCG@10: {hybrid_test_results['NDCG@10']:.4f}")

print("\n" + "-" * 70)
print("SAVED ARTIFACTS")
print("-" * 70)
print(f"  Location: {MODELS_PATH}")
for f in os.listdir(MODELS_PATH):
    size = os.path.getsize(f'{MODELS_PATH}/{f}') / 1024**2
    print(f"    - {f}: {size:.2f} MB")



PHASE 4 SUMMARY: MODEL TRAINING

----------------------------------------------------------------------
MODELS TRAINED
----------------------------------------------------------------------
  1. Global Mean Baseline
  2. User-Item Bias Baseline
  3. Popularity Baseline
  4. Matrix Factorization (SVD, 50 factors)
  5. Content-Based (Genre Similarity)
  6. Hybrid (α=0.5)

----------------------------------------------------------------------
BEST MODEL PERFORMANCE (Validation)
----------------------------------------------------------------------
  Best RMSE: 0.8707 (SVD)
  Best NDCG@10: 0.1136 (Popularity)
  Best P@10: 0.0819 (Popularity)

----------------------------------------------------------------------
TEST SET PERFORMANCE
----------------------------------------------------------------------
  SVD RMSE: 0.8727
  SVD NDCG@10: 0.0152
  Hybrid NDCG@10: 0.0399

----------------------------------------------------------------------
SAVED ARTIFACTS
------------------------------------