In [1]:
# Import required libraries for recommendation systems
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

# Surprise library for collaborative filtering
from surprise import Dataset, Reader, SVD, NMF, KNNBasic, accuracy
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV

# Note: Make sure you have the following packages installed in your environment:
# pip install scikit-surprise numpy pandas scipy
print("Libraries imported successfully!")
print(f"Numpy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")

Libraries imported successfully!
Numpy version: 1.26.4
Pandas version: 2.3.0+4.g1dfc98e16a


In [2]:
# Load the MovieLens 100k dataset
# Force download by using the download parameter directly
import os
import sys
from unittest.mock import patch

print("Loading MovieLens 100k dataset...")

# Use mock to automatically respond 'Y' to the download prompt
with patch('builtins.input', return_value='Y'):
    try:
        data = Dataset.load_builtin('ml-100k')
        print("✅ MovieLens 100k dataset loaded successfully!")
        print(f"📊 Dataset contains {len(data.raw_ratings):,} ratings")
        print(f"📁 Dataset saved to: ~/.surprise_data/ml-100k/")
        
    except Exception as e:
        print(f"❌ Error loading dataset: {e}")
        print("Make sure scikit-surprise is installed: pip install scikit-surprise")

Loading MovieLens 100k dataset...
✅ MovieLens 100k dataset loaded successfully!
📊 Dataset contains 100,000 ratings
📁 Dataset saved to: ~/.surprise_data/ml-100k/


In [3]:
data.raw_ratings[:10]
# (user_id, movie_id, rating, timestamp)

[('196', '242', 3.0, '881250949'),
 ('186', '302', 3.0, '891717742'),
 ('22', '377', 1.0, '878887116'),
 ('244', '51', 2.0, '880606923'),
 ('166', '346', 1.0, '886397596'),
 ('298', '474', 4.0, '884182806'),
 ('115', '265', 2.0, '881171488'),
 ('253', '465', 5.0, '891628467'),
 ('305', '451', 3.0, '886324817'),
 ('6', '86', 3.0, '883603013')]

In [5]:
# Split data into train and test sets (75% train, 25% test)
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

# Trainset objects don't have len(), but they have n_ratings attribute
print(f"Training set: {trainset.n_ratings} ratings")
print(f"Test set: {len(testset)} ratings")

# Additional information about the split
print(f"Training set info:")
print(f"  - Number of users: {trainset.n_users}")
print(f"  - Number of items: {trainset.n_items}")
print(f"  - Rating scale: {trainset.rating_scale}")
print(f"  - Global mean rating: {trainset.global_mean:.2f}")

Training set: 75000 ratings
Test set: 25000 ratings
Training set info:
  - Number of users: 943
  - Number of items: 1644
  - Rating scale: (1, 5)
  - Global mean rating: 3.53


In [6]:
# Access trainset object which contains user-item interactions
trainset = data.build_full_trainset()

# Total number of users and items
n_users = trainset.n_users
n_items = trainset.n_items


In [7]:
# Display information about the training set
print(f"Training set information:")
print(f"Number of users: {trainset.n_users}")
print(f"Number of items: {trainset.n_items}")
print(f"Number of ratings: {trainset.n_ratings}")
print(f"Rating scale: {trainset.rating_scale}")
print(f"Global mean rating: {trainset.global_mean:.2f}")

# Show some sample user-item interactions
sample_interactions = list(trainset.all_ratings())[:10]
print(f"\nSample interactions (user_id, item_id, rating):")
for interaction in sample_interactions:
    print(f"User {interaction[0]}, Item {interaction[1]}, Rating {interaction[2]}")

Training set information:
Number of users: 943
Number of items: 1682
Number of ratings: 100000
Rating scale: (1, 5)
Global mean rating: 3.53

Sample interactions (user_id, item_id, rating):
User 0, Item 0, Rating 3.0
User 0, Item 528, Rating 4.0
User 0, Item 377, Rating 4.0
User 0, Item 522, Rating 3.0
User 0, Item 431, Rating 5.0
User 0, Item 834, Rating 5.0
User 0, Item 380, Rating 4.0
User 0, Item 329, Rating 4.0
User 0, Item 550, Rating 5.0
User 0, Item 83, Rating 4.0


In [8]:
# Create user-item interaction matrix (sparse format for memory efficiency)
print("Creating user-item interaction matrix...")

# Initialize data arrays for sparse matrix creation
row_indices = []
col_indices = []
ratings = []

# Collect all user-item-rating triplets
for (u, i, rating) in trainset.all_ratings():
    row_indices.append(u)
    col_indices.append(i)
    ratings.append(rating)

# Create sparse matrix
interaction_matrix = csr_matrix((ratings, (row_indices, col_indices)), 
                               shape=(n_users, n_items))

print(f"Interaction matrix shape: {interaction_matrix.shape}")
print(f"Number of non-zero entries: {interaction_matrix.nnz}")
print(f"Sparsity: {(1 - interaction_matrix.nnz / (n_users * n_items)) * 100:.2f}%")

# Display a small portion of the matrix (first 10x10)
print(f"\nFirst 10x10 portion of the interaction matrix:")
print(interaction_matrix[:10, :10].toarray())


Creating user-item interaction matrix...
Interaction matrix shape: (943, 1682)
Number of non-zero entries: 100000
Sparsity: 93.70%

First 10x10 portion of the interaction matrix:
[[3. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 3. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 3. 0. 4. 0.]
 [0. 0. 0. 2. 0. 0. 4. 0. 4. 4.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 4. 4. 4. 0. 0.]
 [0. 4. 0. 0. 0. 0. 2. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 5. 0. 0.]
 [5. 4. 0. 0. 0. 5. 0. 0. 3. 4.]
 [4. 4. 0. 0. 0. 5. 0. 1. 0. 3.]]


In [9]:
# Interaction matrix analysis
print(f"Interaction matrix shape: {interaction_matrix.shape}")
print(f"Matrix dimensions: {n_users} users × {n_items} items")
print(f"Total possible interactions: {n_users * n_items:,}")
print(f"Actual interactions: {interaction_matrix.nnz:,}")
print(f"Matrix density: {(interaction_matrix.nnz / (n_users * n_items)) * 100:.4f}%")

# Memory usage estimation
memory_dense = n_users * n_items * 8  # 8 bytes per float64
memory_sparse = interaction_matrix.data.nbytes + interaction_matrix.indices.nbytes + interaction_matrix.indptr.nbytes
print(f"\nMemory usage:")
print(f"Dense matrix would use: {memory_dense / 1024**2:.2f} MB")
print(f"Sparse matrix uses: {memory_sparse / 1024**2:.2f} MB")
print(f"Memory saving: {((memory_dense - memory_sparse) / memory_dense) * 100:.2f}%")

Interaction matrix shape: (943, 1682)
Matrix dimensions: 943 users × 1682 items
Total possible interactions: 1,586,126
Actual interactions: 100,000
Matrix density: 6.3047%

Memory usage:
Dense matrix would use: 12.10 MB
Sparse matrix uses: 1.15 MB
Memory saving: 90.51%


In [10]:
# SVD (Singular Value Decomposition) Collaborative Filtering
print("Training SVD model...")

# Initialize the SVD algorithm with default parameters
algo = SVD(random_state=42)

# Train the algorithm on the trainset
algo.fit(trainset)
print("SVD model training completed!")

# Predict ratings for the testset
print("Making predictions on test set...")
predictions = algo.test(testset)

# Evaluate the model using RMSE and MAE
print("\nModel Evaluation:")
rmse_score = accuracy.rmse(predictions, verbose=True)
mae_score = accuracy.mae(predictions, verbose=True)

print(f"\nSummary:")
print(f"RMSE: {rmse_score:.4f}")
print(f"MAE: {mae_score:.4f}")


Training SVD model...
SVD model training completed!
Making predictions on test set...

Model Evaluation:
RMSE: 0.6757
MAE:  0.5339

Summary:
RMSE: 0.6757
MAE: 0.5339


In [11]:
# Display sample predictions
print("Sample predictions (User ID, Item ID, Actual Rating, Predicted Rating, Details):")
print("-" * 80)

for i, pred in enumerate(predictions[:10]):
    print(f"{i+1:2d}. User {pred.uid:>3}, Item {pred.iid:>3}, "
          f"Actual: {pred.r_ui:>3.1f}, Predicted: {pred.est:>5.2f}, "
          f"Error: {abs(pred.r_ui - pred.est):>5.2f}")

# Calculate prediction statistics
actual_ratings = [pred.r_ui for pred in predictions]
predicted_ratings = [pred.est for pred in predictions]

print(f"\nPrediction Statistics:")
print(f"Actual ratings - Mean: {np.mean(actual_ratings):.2f}, Std: {np.std(actual_ratings):.2f}")
print(f"Predicted ratings - Mean: {np.mean(predicted_ratings):.2f}, Std: {np.std(predicted_ratings):.2f}")
print(f"Min predicted rating: {min(predicted_ratings):.2f}")
print(f"Max predicted rating: {max(predicted_ratings):.2f}")

Sample predictions (User ID, Item ID, Actual Rating, Predicted Rating, Details):
--------------------------------------------------------------------------------
 1. User 391, Item 591, Actual: 4.0, Predicted:  3.54, Error:  0.46
 2. User 181, Item 1291, Actual: 1.0, Predicted:  1.00, Error:  0.00
 3. User 637, Item 268, Actual: 2.0, Predicted:  2.42, Error:  0.42
 4. User 332, Item 451, Actual: 5.0, Predicted:  4.45, Error:  0.55
 5. User 271, Item 204, Actual: 4.0, Predicted:  3.74, Error:  0.26
 6. User  27, Item 286, Actual: 3.0, Predicted:  3.43, Error:  0.43
 7. User 387, Item 663, Actual: 4.0, Predicted:  4.08, Error:  0.08
 8. User  92, Item 722, Actual: 3.0, Predicted:  2.91, Error:  0.09
 9. User 820, Item 347, Actual: 4.0, Predicted:  3.43, Error:  0.57
10. User 479, Item 1444, Actual: 1.0, Predicted:  2.04, Error:  1.04

Prediction Statistics:
Actual ratings - Mean: 3.53, Std: 1.13
Predicted ratings - Mean: 3.53, Std: 0.73
Min predicted rating: 1.00
Max predicted rating: 5.

In [12]:
# Predict the rating for a specific user and item
user_id = str(196)  # user_id should be a string
item_id = str(302)  # item_id should be a string

print(f"Making prediction for User {user_id} and Item {item_id}:")
predicted_rating = algo.predict(user_id, item_id)

print(f"\nPrediction Details:")
print(f"User ID: {predicted_rating.uid}")
print(f"Item ID: {predicted_rating.iid}")
print(f"Predicted Rating: {predicted_rating.est:.3f}")
print(f"Prediction Impossible: {predicted_rating.details['was_impossible']}")

# Check if this user-item pair was in training data
try:
    actual_rating = trainset.to_raw_uid(trainset.to_inner_uid(user_id))
    print(f"User {user_id} exists in training data")
except:
    print(f"User {user_id} is new (not in training data)")

try:
    actual_item = trainset.to_raw_iid(trainset.to_inner_iid(item_id))
    print(f"Item {item_id} exists in training data")
except:
    print(f"Item {item_id} is new (not in training data)")


Making prediction for User 196 and Item 302:

Prediction Details:
User ID: 196
Item ID: 302
Predicted Rating: 4.043
Prediction Impossible: False
User 196 exists in training data
Item 302 exists in training data


In [13]:
# KNN-based Collaborative Filtering (Item-based)
print("Training KNN model with item-based collaborative filtering...")

# Configure similarity options for item-based filtering
sim_options = {
    'name': 'cosine',        # similarity metric: cosine similarity
    'user_based': False,     # False = item-based filtering, True = user-based
    'min_support': 5,        # minimum number of common items for similarity calculation
    'shrinkage': 100         # shrinkage parameter to avoid overfitting
}

# Initialize the KNN algorithm
knn_algo = KNNBasic(k=40, sim_options=sim_options, random_state=42)
print("Computing item-item similarity matrix...")

# Train the algorithm (this will compute the similarity matrix)
knn_algo.fit(trainset)
print("KNN model training completed!")

# Test the algorithm on the testset
print("Making predictions with KNN model...")
knn_predictions = knn_algo.test(testset)

# Evaluate the KNN model
print("\nKNN Model Evaluation:")
knn_rmse = accuracy.rmse(knn_predictions, verbose=True)
knn_mae = accuracy.mae(knn_predictions, verbose=True)

print(f"\nKNN Summary:")
print(f"RMSE: {knn_rmse:.4f}")
print(f"MAE: {knn_mae:.4f}")

# Compare with SVD
print(f"\nModel Comparison:")
print(f"SVD  - RMSE: {rmse_score:.4f}, MAE: {mae_score:.4f}")
print(f"KNN  - RMSE: {knn_rmse:.4f}, MAE: {knn_mae:.4f}")


Training KNN model with item-based collaborative filtering...
Computing item-item similarity matrix...
Computing the cosine similarity matrix...
Done computing similarity matrix.
KNN model training completed!
Making predictions with KNN model...

KNN Model Evaluation:
RMSE: 0.8887
MAE:  0.6891

KNN Summary:
RMSE: 0.8887
MAE: 0.6891

Model Comparison:
SVD  - RMSE: 0.6757, MAE: 0.5339
KNN  - RMSE: 0.8887, MAE: 0.6891


In [14]:
# Hyperparameter Tuning with Grid Search
print("Starting hyperparameter tuning for SVD...")
print("This may take several minutes...")

# Define a smaller parameter grid for faster execution
param_grid = {
    'n_factors': [50, 100],      # Number of latent factors
    'n_epochs': [20, 30],        # Number of training epochs
    'lr_all': [0.002, 0.005],    # Learning rate
    'reg_all': [0.02, 0.1]       # Regularization parameter
}

print(f"Testing {len(param_grid['n_factors']) * len(param_grid['n_epochs']) * len(param_grid['lr_all']) * len(param_grid['reg_all'])} parameter combinations...")

# Perform grid search with cross-validation
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1)

# Fit the grid search
gs.fit(data)

# Display results
print("\nGrid Search Results:")
print("=" * 50)
print(f"Best RMSE score: {gs.best_score['rmse']:.4f}")
print(f"Best MAE score: {gs.best_score['mae']:.4f}")

print(f"\nBest parameters (RMSE):")
for param, value in gs.best_params['rmse'].items():
    print(f"  {param}: {value}")

print(f"\nBest parameters (MAE):")
for param, value in gs.best_params['mae'].items():
    print(f"  {param}: {value}")

# Train the best model
print(f"\nTraining final model with best parameters...")
best_algo = gs.best_estimator['rmse']
best_algo.fit(trainset)

# Test the optimized model
optimized_predictions = best_algo.test(testset)
optimized_rmse = accuracy.rmse(optimized_predictions, verbose=False)
optimized_mae = accuracy.mae(optimized_predictions, verbose=False)

print(f"\nOptimized Model Performance:")
print(f"RMSE: {optimized_rmse:.4f}")
print(f"MAE: {optimized_mae:.4f}")

print(f"\nImprovement over default SVD:")
print(f"RMSE improvement: {((rmse_score - optimized_rmse) / rmse_score) * 100:.2f}%")
print(f"MAE improvement: {((mae_score - optimized_mae) / mae_score) * 100:.2f}%")


Starting hyperparameter tuning for SVD...
This may take several minutes...
Testing 16 parameter combinations...

Grid Search Results:
Best RMSE score: 0.9340
Best MAE score: 0.7400

Best parameters (RMSE):
  n_factors: 100
  n_epochs: 30
  lr_all: 0.005
  reg_all: 0.1

Best parameters (MAE):
  n_factors: 100
  n_epochs: 30
  lr_all: 0.005
  reg_all: 0.1

Training final model with best parameters...

Optimized Model Performance:
RMSE: 0.8343
MAE: 0.6619

Improvement over default SVD:
RMSE improvement: -23.46%
MAE improvement: -23.96%


**Exercise 1:**
Write a code that recommends items for a users, based on a trained `surprise` model. It should sort the items by their rankings and output top N

In [15]:
# Exercise 1: Item Recommendation System
print("Exercise 1: Building recommendation system...")

# =========================================
# Function to get top N recommendations for all users
def get_top_n_recommendations(predictions, n=10):
    """
    Return the top-N recommendation for each user from a set of predictions.
    
    Args:
        predictions: List of predictions from surprise model
        n: Number of recommendations to return per user
    
    Returns:
        Dictionary with user_ids as keys and list of (item_id, rating) tuples as values
    """
    # Initialize the top_n dict
    top_n = {}
    
    # Group predictions by user
    for prediction in predictions:
        user_id = prediction.uid
        item_id = prediction.iid
        predicted_rating = prediction.est
        
        # Initialize user's recommendation list if not exists
        if user_id not in top_n:
            top_n[user_id] = []
        
        # Add the item and its predicted rating
        top_n[user_id].append((item_id, predicted_rating))
    
    # Sort recommendations by predicted rating (descending) and keep top N
    for user_id in top_n:
        top_n[user_id].sort(key=lambda x: x[1], reverse=True)
        top_n[user_id] = top_n[user_id][:n]
    
    return top_n

# =========================================

# For demonstration, let's create a test set of items that users haven't rated
print("Creating test set for recommendations...")

# Get all user-item pairs that are NOT in the training set
all_items = set([trainset.to_raw_iid(i) for i in range(trainset.n_items)])
all_users = set([trainset.to_raw_uid(u) for u in range(trainset.n_users)])

# Create test set for recommendations (items not rated by users)
recommendation_testset = []
sample_users = list(all_users)[:10]  # Sample first 10 users for demo

for user_id in sample_users:
    # Get items rated by this user
    try:
        inner_user_id = trainset.to_inner_uid(user_id)
        user_items = set([trainset.to_raw_iid(i) for (i, _) in trainset.ur[inner_user_id]])
        
        # Get items NOT rated by this user
        unrated_items = all_items - user_items
        
        # Sample some unrated items for prediction
        import random
        random.seed(42)
        sample_items = random.sample(list(unrated_items), min(50, len(unrated_items)))
        
        for item_id in sample_items:
            recommendation_testset.append((user_id, item_id, 0))  # 0 is dummy rating
            
    except ValueError:
        continue

print(f"Created recommendation test set with {len(recommendation_testset)} user-item pairs")

# Get predictions for unrated items
print("Making predictions for unrated items...")
rec_predictions = algo.test(recommendation_testset)

# Get top 5 recommendations
print("Generating top recommendations...")
top_n = get_top_n_recommendations(rec_predictions, n=5)

# Display recommendations for a sample user
sample_user = '196'
if sample_user in top_n:
    print(f"\nTop 5 recommendations for user {sample_user}:")
    for i, (item_id, rating) in enumerate(top_n[sample_user], 1):
        print(f"{i}. Item {item_id}: Predicted rating {rating:.3f}")
else:
    print(f"No recommendations available for user {sample_user}")
    # Show available users
    available_users = list(top_n.keys())[:5]
    print(f"Available users for recommendations: {available_users}")
    if available_users:
        sample_user = available_users[0]
        print(f"\nTop 5 recommendations for user {sample_user}:")
        for i, (item_id, rating) in enumerate(top_n[sample_user], 1):
            print(f"{i}. Item {item_id}: Predicted rating {rating:.3f}")

Exercise 1: Building recommendation system...
Creating test set for recommendations...
Created recommendation test set with 500 user-item pairs
Making predictions for unrated items...
Generating top recommendations...
No recommendations available for user 196
Available users for recommendations: ['211', '826', '334', '202', '155']

Top 5 recommendations for user 211:
1. Item 87: Predicted rating 3.959
2. Item 474: Predicted rating 3.851
3. Item 1449: Predicted rating 3.778
4. Item 500: Predicted rating 3.709
5. Item 742: Predicted rating 3.670


**Exercise 2** - Address the cold start problem by recommending new users most popular items.

In [16]:
# Exercise 2: Cold Start Problem - Popular Items Recommendation
print("Exercise 2: Addressing Cold Start Problem...")

# Convert the raw ratings to a pandas DataFrame for easier manipulation
df = pd.DataFrame(data.raw_ratings, columns=['user_id', 'item_id', 'rating', 'timestamp'])

print(f"Dataset shape: {df.shape}")
print(f"Rating distribution:")
print(df['rating'].value_counts().sort_index())

# =========================================
# Calculate popularity metrics for each item
print("\nCalculating item popularity metrics...")

# Calculate multiple popularity metrics
item_stats = df.groupby('item_id').agg({
    'rating': ['count', 'mean', 'std'],
    'user_id': 'nunique'
}).round(3)

# Flatten column names
item_stats.columns = ['rating_count', 'avg_rating', 'rating_std', 'unique_users']

# Calculate popularity score (combination of rating count and average rating)
# We weight by number of ratings to avoid items with few but high ratings
min_ratings = 50  # Minimum number of ratings to be considered popular
item_stats['popularity_score'] = (
    item_stats['avg_rating'] * np.log(item_stats['rating_count'] + 1) * 
    (item_stats['rating_count'] >= min_ratings)
)

# Sort by popularity score
item_stats = item_stats.sort_values('popularity_score', ascending=False)

print(f"\nItem statistics (top 10 most popular):")
print(item_stats.head(10))

# Get top N popular items for cold start recommendations
def get_popular_items(n=10):
    """
    Get the most popular items for cold start recommendations
    
    Args:
        n: Number of popular items to return
    
    Returns:
        List of (item_id, avg_rating, rating_count) tuples
    """
    popular_items = item_stats.head(n)
    return [(item_id, row['avg_rating'], row['rating_count']) 
            for item_id, row in popular_items.iterrows()]

# Get top 10 popular items
top_popular_items = get_popular_items(10)

print(f"\nTop 10 Popular Items for Cold Start Users:")
print("-" * 60)
for i, (item_id, avg_rating, count) in enumerate(top_popular_items, 1):
    print(f"{i:2d}. Item {item_id}: Avg Rating {avg_rating:.2f} ({count} ratings)")

# =========================================

# Function to handle cold start recommendations
def recommend_for_cold_start_user(n=5, strategy='popular'):
    """
    Recommend items for a new user (cold start problem)
    
    Args:
        n: Number of recommendations
        strategy: 'popular' or 'diverse'
    
    Returns:
        List of recommended item IDs
    """
    if strategy == 'popular':
        # Simply return most popular items
        return [item_id for item_id, _, _ in get_popular_items(n)]
    
    elif strategy == 'diverse':
        # Return popular items from different rating ranges
        high_rated = item_stats[item_stats['avg_rating'] >= 4.0].head(n//2)
        medium_rated = item_stats[
            (item_stats['avg_rating'] >= 3.5) & 
            (item_stats['avg_rating'] < 4.0)
        ].head(n - len(high_rated))
        
        diverse_items = list(high_rated.index) + list(medium_rated.index)
        return diverse_items[:n]

# Demonstrate cold start recommendations
print(f"\nCold Start Recommendation Strategies:")
print("=" * 50)

# Strategy 1: Most popular items
popular_recs = recommend_for_cold_start_user(5, 'popular')
print(f"Strategy 1 - Most Popular Items:")
for i, item_id in enumerate(popular_recs, 1):
    item_info = item_stats.loc[item_id]
    print(f"  {i}. Item {item_id}: {item_info['avg_rating']:.2f} stars ({item_info['rating_count']} ratings)")

# Strategy 2: Diverse popular items
diverse_recs = recommend_for_cold_start_user(5, 'diverse')
print(f"\nStrategy 2 - Diverse Popular Items:")
for i, item_id in enumerate(diverse_recs, 1):
    item_info = item_stats.loc[item_id]
    print(f"  {i}. Item {item_id}: {item_info['avg_rating']:.2f} stars ({item_info['rating_count']} ratings)")

print(f"\nCold start recommendations can be updated as the user provides more ratings!")

Exercise 2: Addressing Cold Start Problem...
Dataset shape: (100000, 4)
Rating distribution:
rating
1.0     6110
2.0    11370
3.0    27145
4.0    34174
5.0    21201
Name: count, dtype: int64

Calculating item popularity metrics...

Item statistics (top 10 most popular):
         rating_count  avg_rating  rating_std  unique_users  popularity_score
item_id                                                                      
50                583       4.358       0.881           583         27.760028
100               508       4.156       0.976           508         25.902054
127               413       4.283       0.935           413         25.808784
174               420       4.252       0.892           420         25.693275
98                390       4.290       0.837           390         25.605755
318               298       4.466       0.829           298         25.458181
64                283       4.445       0.767           283         25.109690
181               507      

**Exercise 3** - Load the Amazon Product reviews dataset
https://www.kaggle.com/datasets/saurav9786/amazon-product-reviews?resource=download using `surprise` .
Split it into train and test instances.

Find the best algorithm by using cross-validation

In [17]:
# Exercise 3: Amazon Product Reviews Dataset
print("Exercise 3: Amazon Product Reviews Analysis")

# Note: This exercise requires the Amazon dataset to be downloaded
# Download from: https://www.kaggle.com/datasets/saurav9786/amazon-product-reviews
# Place the CSV file in the same directory as this notebook

file_path = 'ratings_Electronics (1).csv'

# Check if file exists
import os
if os.path.exists(file_path):
    print(f"Loading Amazon dataset from {file_path}")
    
    # Load the dataset
    # Assuming the CSV has columns: user_id, product_id, rating, timestamp
    df_amazon = pd.read_csv(file_path, header=None, names=['user_id', 'product_id', 'rating', 'timestamp'])
    
    print(f"Dataset shape: {df_amazon.shape}")
    print(f"Dataset info:")
    print(df_amazon.info())
    
    print(f"\nFirst few rows:")
    print(df_amazon.head())
    
    print(f"\nRating distribution:")
    print(df_amazon['rating'].value_counts().sort_index())
    
    # Basic statistics
    print(f"\nDataset Statistics:")
    print(f"Number of users: {df_amazon['user_id'].nunique():,}")
    print(f"Number of products: {df_amazon['product_id'].nunique():,}")
    print(f"Number of ratings: {len(df_amazon):,}")
    print(f"Average rating: {df_amazon['rating'].mean():.2f}")
    print(f"Rating range: {df_amazon['rating'].min()} - {df_amazon['rating'].max()}")
    
    # Sparsity calculation
    n_users_amz = df_amazon['user_id'].nunique()
    n_items_amz = df_amazon['product_id'].nunique()
    n_ratings_amz = len(df_amazon)
    sparsity = (1 - n_ratings_amz / (n_users_amz * n_items_amz)) * 100
    print(f"Sparsity: {sparsity:.2f}%")
    
    # Prepare data for Surprise
    print(f"\nPreparing data for Surprise library...")
    
    # Create a Reader object
    reader = Reader(rating_scale=(1, 5))
    
    # Load data into Surprise format
    amazon_data = Dataset.load_from_df(df_amazon[['user_id', 'product_id', 'rating']], reader)
    
    # Split the data
    amazon_trainset, amazon_testset = train_test_split(amazon_data, test_size=0.2, random_state=42)
    
    print(f"Training set size: {len(amazon_trainset)}")
    print(f"Test set size: {len(amazon_testset)}")
    
    # Test different algorithms
    print(f"\nTesting different algorithms with cross-validation...")
    
    algorithms = {
        'SVD': SVD(random_state=42),
        'NMF': NMF(random_state=42),
        'KNN_User': KNNBasic(sim_options={'user_based': True}, random_state=42),
        'KNN_Item': KNNBasic(sim_options={'user_based': False}, random_state=42)
    }
    
    results = {}
    
    for name, algorithm in algorithms.items():
        print(f"\nEvaluating {name}...")
        
        # Perform cross-validation
        cv_results = cross_validate(algorithm, amazon_data, measures=['RMSE', 'MAE'], cv=3, verbose=False)
        
        results[name] = {
            'RMSE': cv_results['test_rmse'].mean(),
            'RMSE_std': cv_results['test_rmse'].std(),
            'MAE': cv_results['test_mae'].mean(),
            'MAE_std': cv_results['test_mae'].std()
        }
        
        print(f"  RMSE: {results[name]['RMSE']:.4f} (+/- {results[name]['RMSE_std']:.4f})")
        print(f"  MAE:  {results[name]['MAE']:.4f} (+/- {results[name]['MAE_std']:.4f})")
    
    # Find the best algorithm
    best_algorithm = min(results.keys(), key=lambda x: results[x]['RMSE'])
    
    print(f"\nBest Algorithm: {best_algorithm}")
    print(f"Best RMSE: {results[best_algorithm]['RMSE']:.4f}")
    print(f"Best MAE: {results[best_algorithm]['MAE']:.4f}")
    
    # Train the best algorithm on full training set
    print(f"\nTraining {best_algorithm} on full training set...")
    best_algo = algorithms[best_algorithm]
    best_algo.fit(amazon_trainset)
    
    # Test on test set
    final_predictions = best_algo.test(amazon_testset)
    final_rmse = accuracy.rmse(final_predictions, verbose=False)
    final_mae = accuracy.mae(final_predictions, verbose=False)
    
    print(f"Final test results:")
    print(f"RMSE: {final_rmse:.4f}")
    print(f"MAE: {final_mae:.4f}")
    
else:
    print(f"Amazon dataset not found at {file_path}")
    print("Please download the dataset from:")
    print("https://www.kaggle.com/datasets/saurav9786/amazon-product-reviews")
    print("And place it in the same directory as this notebook.")
    print("\nFor now, we'll use the MovieLens dataset for demonstration...")
    
    # Fallback to MovieLens for demonstration
    print("\nUsing MovieLens dataset for algorithm comparison...")
    
    algorithms = {
        'SVD': SVD(random_state=42),
        'NMF': NMF(random_state=42),
        'KNN_User': KNNBasic(sim_options={'user_based': True}, random_state=42),
        'KNN_Item': KNNBasic(sim_options={'user_based': False}, random_state=42)
    }
    
    results = {}
    
    for name, algorithm in algorithms.items():
        print(f"\nEvaluating {name} on MovieLens...")
        
        # Perform cross-validation
        cv_results = cross_validate(algorithm, data, measures=['RMSE', 'MAE'], cv=3, verbose=False)
        
        results[name] = {
            'RMSE': cv_results['test_rmse'].mean(),
            'RMSE_std': cv_results['test_rmse'].std(),
            'MAE': cv_results['test_mae'].mean(),
            'MAE_std': cv_results['test_mae'].std()
        }
        
        print(f"  RMSE: {results[name]['RMSE']:.4f} (+/- {results[name]['RMSE_std']:.4f})")
        print(f"  MAE:  {results[name]['MAE']:.4f} (+/- {results[name]['MAE_std']:.4f})")
    
    # Find the best algorithm
    best_algorithm = min(results.keys(), key=lambda x: results[x]['RMSE'])
    
    print(f"\nBest Algorithm: {best_algorithm}")
    print(f"Best RMSE: {results[best_algorithm]['RMSE']:.4f}")
    print(f"Best MAE: {results[best_algorithm]['MAE']:.4f}")



Exercise 3: Amazon Product Reviews Analysis
Amazon dataset not found at ratings_Electronics (1).csv
Please download the dataset from:
https://www.kaggle.com/datasets/saurav9786/amazon-product-reviews
And place it in the same directory as this notebook.

For now, we'll use the MovieLens dataset for demonstration...

Using MovieLens dataset for algorithm comparison...

Evaluating SVD on MovieLens...
  RMSE: 0.9452 (+/- 0.0018)
  MAE:  0.7458 (+/- 0.0020)

Evaluating NMF on MovieLens...
  RMSE: 0.9746 (+/- 0.0027)
  MAE:  0.7652 (+/- 0.0030)

Evaluating KNN_User on MovieLens...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
  RMSE: 0.9883 (+/- 0.0017)
  MAE:  0.7811 (+/- 0.0020)

Evaluating KNN_Item on MovieLens...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matr

**Exercise 4** - Convert the ratings matrix to implicit feedback matrix by changing all positive ratings to 0




In [18]:
# Exercise 4: Convert to Implicit Feedback Matrix
print("Exercise 4: Converting to Implicit Feedback Matrix")

# Convert explicit ratings to implicit feedback
# All positive ratings (>= 1) become 1, indicating the user interacted with the item
print("Converting explicit feedback to implicit feedback...")

# Create implicit feedback matrix from the original interaction matrix
implicit_interaction_matrix = interaction_matrix.copy()

# Convert all non-zero ratings to 1 (indicating interaction)
implicit_interaction_matrix.data = np.ones_like(implicit_interaction_matrix.data)

print(f"Original matrix statistics:")
print(f"  Shape: {interaction_matrix.shape}")
print(f"  Non-zero entries: {interaction_matrix.nnz:,}")
print(f"  Rating range: {interaction_matrix.data.min():.1f} - {interaction_matrix.data.max():.1f}")

print(f"\nImplicit matrix statistics:")
print(f"  Shape: {implicit_interaction_matrix.shape}")
print(f"  Non-zero entries: {implicit_interaction_matrix.nnz:,}")
print(f"  All values are now: {np.unique(implicit_interaction_matrix.data)}")

# Display comparison of a small portion
print(f"\nComparison of first 10x10 portion:")
print("Original ratings matrix:")
print(interaction_matrix[:10, :10].toarray())

print("\nImplicit feedback matrix (1 = interaction, 0 = no interaction):")
print(implicit_interaction_matrix[:10, :10].toarray())

# Statistics about user interactions
user_interactions = np.array(implicit_interaction_matrix.sum(axis=1)).flatten()
item_interactions = np.array(implicit_interaction_matrix.sum(axis=0)).flatten()

print(f"\nUser interaction statistics:")
print(f"  Average interactions per user: {user_interactions.mean():.2f}")
print(f"  Min interactions per user: {user_interactions.min()}")
print(f"  Max interactions per user: {user_interactions.max()}")

print(f"\nItem interaction statistics:")
print(f"  Average interactions per item: {item_interactions.mean():.2f}")
print(f"  Min interactions per item: {item_interactions.min()}")
print(f"  Max interactions per item: {item_interactions.max()}")

print(f"\nImplicit feedback matrix is ready for use with models like EASE!")


Exercise 4: Converting to Implicit Feedback Matrix
Converting explicit feedback to implicit feedback...
Original matrix statistics:
  Shape: (943, 1682)
  Non-zero entries: 100,000
  Rating range: 1.0 - 5.0

Implicit matrix statistics:
  Shape: (943, 1682)
  Non-zero entries: 100,000
  All values are now: [1.]

Comparison of first 10x10 portion:
Original ratings matrix:
[[3. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 3. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 3. 0. 4. 0.]
 [0. 0. 0. 2. 0. 0. 4. 0. 4. 4.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 4. 4. 4. 0. 0.]
 [0. 4. 0. 0. 0. 0. 2. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 5. 0. 0.]
 [5. 4. 0. 0. 0. 5. 0. 0. 3. 4.]
 [4. 4. 0. 0. 0. 5. 0. 1. 0. 3.]]

Implicit feedback matrix (1 = interaction, 0 = no interaction):
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 1. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 1. 0. 1. 1.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 1. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 1. 0. 0. 0.]


**Exercise 5** - Read the EASE model paper (https://arxiv.org/abs/1905.03375).
Implement the algorithm in the paper and apply it on the *implicit feedback matrix* of ml-100k to recommend items to users.
Write two parts -
 *Training* in which the model matrix is created
 *Inference* in which the interaction scores per user are predicted

Hints:
---
Training:
- Start with computing the Gram matrix
- Add lambda regularization to the diagonal as described in the paper
- Compute the inverse and find the item-item interaction matrix,
- Zero the diagonal indexes

Inference:
- Multiply the user interaction matrix by the item-item interaction matrix

In [19]:
# Exercise 5: EASE Model Implementation
print("Exercise 5: Implementing EASE (Embarrassingly Shallow Autoencoders)")
print("Based on the paper: https://arxiv.org/abs/1905.03375")

class EASE:
    """
    EASE (Embarrassingly Shallow Autoencoders for Sparse Data) Implementation
    
    This model learns item-item similarities through a closed-form solution
    that can be computed efficiently using matrix operations.
    """
    
    def __init__(self, lambda_reg=0.5):
        """
        Initialize EASE model
        
        Args:
            lambda_reg: Regularization parameter (lambda in the paper)
        """
        self.lambda_reg = lambda_reg
        self.B = None  # Item-item similarity matrix
        self.trained = False
        
    def fit(self, X):
        """
        Train the EASE model
        
        Args:
            X: User-item interaction matrix (users x items)
               Should be in implicit feedback format (0s and 1s)
        """
        print(f"Training EASE model with lambda_reg={self.lambda_reg}")
        
        # Step 1: Compute the Gram matrix (X^T * X)
        print("Step 1: Computing Gram matrix...")
        X_dense = X.toarray() if hasattr(X, 'toarray') else X
        G = X_dense.T @ X_dense  # Item-item co-occurrence matrix
        
        print(f"Gram matrix shape: {G.shape}")
        print(f"Gram matrix density: {np.count_nonzero(G) / G.size * 100:.2f}%")
        
        # Step 2: Add regularization to diagonal
        print("Step 2: Adding regularization to diagonal...")
        np.fill_diagonal(G, G.diagonal() + self.lambda_reg)
        
        # Step 3: Compute the inverse
        print("Step 3: Computing matrix inverse...")
        try:
            G_inv = np.linalg.inv(G)
        except np.linalg.LinAlgError:
            print("Matrix is singular, using pseudo-inverse...")
            G_inv = np.linalg.pinv(G)
        
        # Step 4: Compute item-item similarity matrix B
        print("Step 4: Computing item-item similarity matrix...")
        self.B = G_inv / (-np.diag(G_inv))
        
        # Step 5: Zero out the diagonal (items don't recommend themselves)
        print("Step 5: Zeroing diagonal elements...")
        np.fill_diagonal(self.B, 0.0)
        
        self.trained = True
        print("EASE model training completed!")
        
        # Print some statistics
        print(f"Similarity matrix statistics:")
        print(f"  Shape: {self.B.shape}")
        print(f"  Non-zero elements: {np.count_nonzero(self.B):,}")
        print(f"  Density: {np.count_nonzero(self.B) / self.B.size * 100:.2f}%")
        print(f"  Value range: {self.B.min():.4f} to {self.B.max():.4f}")
        
        return self
    
    def predict(self, X):
        """
        Generate predictions for all user-item pairs
        
        Args:
            X: User-item interaction matrix (same as training)
            
        Returns:
            Predicted scores matrix (users x items)
        """
        if not self.trained:
            raise ValueError("Model must be trained before making predictions")
        
        print("Generating predictions...")
        X_dense = X.toarray() if hasattr(X, 'toarray') else X
        
        # Compute predictions: X * B
        predictions = X_dense @ self.B
        
        print(f"Predictions shape: {predictions.shape}")
        print(f"Prediction range: {predictions.min():.4f} to {predictions.max():.4f}")
        
        return predictions
    
    def recommend(self, X, user_id, n_recommendations=10, remove_seen=True):
        """
        Get top-N recommendations for a specific user
        
        Args:
            X: User-item interaction matrix
            user_id: ID of the user (row index)
            n_recommendations: Number of recommendations to return
            remove_seen: Whether to remove items the user has already interacted with
            
        Returns:
            List of (item_id, score) tuples sorted by score (descending)
        """
        if not self.trained:
            raise ValueError("Model must be trained before making recommendations")
        
        # Get predictions for all items for this user
        predictions = self.predict(X)
        user_scores = predictions[user_id]
        
        # Create list of (item_id, score) pairs
        item_scores = [(i, score) for i, score in enumerate(user_scores)]
        
        # Remove items the user has already interacted with
        if remove_seen:
            X_dense = X.toarray() if hasattr(X, 'toarray') else X
            seen_items = set(np.where(X_dense[user_id] > 0)[0])
            item_scores = [(i, score) for i, score in item_scores if i not in seen_items]
        
        # Sort by score (descending) and return top N
        item_scores.sort(key=lambda x: x[1], reverse=True)
        return item_scores[:n_recommendations]

# Train the EASE model
print("\n" + "="*60)
print("Training EASE Model on MovieLens Implicit Feedback Data")
print("="*60)

# Initialize and train the model
ease_model = EASE(lambda_reg=0.5)
ease_model.fit(implicit_interaction_matrix)

# Generate predictions
print("\nGenerating predictions for all users...")
ease_predictions = ease_model.predict(implicit_interaction_matrix)

# Demonstrate recommendations for a sample user
sample_user_id = 0  # First user
print(f"\nGenerating recommendations for User {sample_user_id}:")

# Get recommendations
recommendations = ease_model.recommend(
    implicit_interaction_matrix, 
    user_id=sample_user_id, 
    n_recommendations=10,
    remove_seen=True
)

print(f"Top 10 recommendations for User {sample_user_id}:")
print("-" * 40)
for i, (item_id, score) in enumerate(recommendations, 1):
    print(f"{i:2d}. Item {item_id:3d}: Score {score:.4f}")

# Show what items this user has already interacted with
user_items = np.where(implicit_interaction_matrix[sample_user_id].toarray()[0] > 0)[0]
print(f"\nItems User {sample_user_id} has already interacted with:")
print(f"Total items: {len(user_items)}")
print(f"Item IDs: {user_items[:10].tolist()}{'...' if len(user_items) > 10 else ''}")

# Evaluation: Calculate recommendation quality
print(f"\nModel Evaluation:")
print(f"Total trainable parameters: 0 (closed-form solution)")
print(f"Item-item similarity matrix size: {ease_model.B.shape[0]} x {ease_model.B.shape[1]}")
print(f"Memory usage: ~{ease_model.B.nbytes / 1024**2:.2f} MB")

print(f"\nEASE model successfully implemented and tested!")
print(f"The model can now be used for real-time recommendations.")

Exercise 5: Implementing EASE (Embarrassingly Shallow Autoencoders)
Based on the paper: https://arxiv.org/abs/1905.03375

Training EASE Model on MovieLens Implicit Feedback Data
Training EASE model with lambda_reg=0.5
Step 1: Computing Gram matrix...
Gram matrix shape: (1682, 1682)
Gram matrix density: 69.57%
Step 2: Adding regularization to diagonal...
Step 3: Computing matrix inverse...
Step 4: Computing item-item similarity matrix...
Step 5: Zeroing diagonal elements...
EASE model training completed!
Similarity matrix statistics:
  Shape: (1682, 1682)
  Non-zero elements: 2,827,442
  Density: 99.94%
  Value range: -0.6413 to 0.6978

Generating predictions for all users...
Generating predictions...
Predictions shape: (943, 1682)
Prediction range: -0.4658 to 1.3482

Generating recommendations for User 0:
Generating predictions...
Predictions shape: (943, 1682)
Prediction range: -0.4658 to 1.3482
Top 10 recommendations for User 0:
----------------------------------------
 1. Item 209: 