## Implement a Recommender System based on Singular Value Decomposition (SVD) and evaluate the performance using Hit Ratio @ 10

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
from benchmark.calculate_hit_ratio_svd import calculate_hit_ratio_svd

In [None]:
from utils.svd_utils import create_user_item_matrix, apply_svd, predict_ratings

#### Calculate Hit Ratio @ 10 for each fold, from (u1.base, u1.test) to (u5.base, u5.test), and for each number of latent factors from 5 to 50 in steps of 5

In [6]:
# Read the full ratings file (tab-separated; column names are supplied here)
# and enumerate every movie id from 1 up to the largest id present.
data = pd.read_csv(
    '../data/raw/u.data',
    sep='\t',
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
)
all_movies = np.arange(1, data['movie_id'].max() + 1)

In [7]:
# Grid of latent-factor counts to evaluate (5, 10, ..., 50) and, for each one,
# an initially empty list that will collect one Hit Ratio @ 10 value per fold.
latent_factors = list(range(5, 51, 5))
fold_hit_ratio_results = {}
for n_factors in latent_factors:
    # A fresh list per key: the lists must not be shared between entries.
    fold_hit_ratio_results[n_factors] = []

In [9]:
# Evaluate every latent-factor count on each of the five train/test folds and
# report the mean Hit Ratio @ 10 per latent-factor count.
for k in tqdm(latent_factors, desc='Latent Factors'):
    # Start from an empty list so that re-running this cell replaces the
    # scores for k instead of appending a second set of fold results.
    fold_hit_ratio_results[k] = []

    for fold in range(1, 6):
        # Build the training user-item matrix for this fold
        train_matrix_csr, train_matrix_df = create_user_item_matrix(f'../data/raw/u{fold}.base')
        # Load the held-out ratings for this fold
        test_data = pd.read_csv(f'../data/raw/u{fold}.test', sep='\t',
                                names=['user_id', 'movie_id', 'rating', 'timestamp'])
        # NOTE(review): the fold inputs do not depend on k; they could be loaded
        # once per fold if the helpers below do not mutate them — confirm.

        # Apply SVD with k latent factors
        u, sigma, vt = apply_svd(train_matrix_csr, k)

        # Predict ratings
        predicted_ratings_df = predict_ratings(u, sigma, vt, train_matrix_df, all_movies)

        # Calculate Hit Ratio @ 10
        hit_ratio = calculate_hit_ratio_svd(test_data, predicted_ratings_df, all_movies)
        fold_hit_ratio_results[k].append(hit_ratio)

    avg_hit_ratio = np.mean(fold_hit_ratio_results[k])
    print(f'Average Hit Ratio @ 10 for {k} latent factors: {avg_hit_ratio:.4f}')

Latent Factors:  10%|█         | 1/10 [00:03<00:31,  3.54s/it]

Average Hit Ratio @ 10 for 5 latent factors: 0.7726


Latent Factors:  20%|██        | 2/10 [00:07<00:28,  3.61s/it]

Average Hit Ratio @ 10 for 10 latent factors: 0.7974


Latent Factors:  30%|███       | 3/10 [00:10<00:25,  3.68s/it]

Average Hit Ratio @ 10 for 15 latent factors: 0.7928


Latent Factors:  40%|████      | 4/10 [00:14<00:22,  3.74s/it]

Average Hit Ratio @ 10 for 20 latent factors: 0.7818


Latent Factors:  50%|█████     | 5/10 [00:18<00:19,  3.81s/it]

Average Hit Ratio @ 10 for 25 latent factors: 0.7771


Latent Factors:  60%|██████    | 6/10 [00:22<00:15,  3.87s/it]

Average Hit Ratio @ 10 for 30 latent factors: 0.7660


Latent Factors:  70%|███████   | 7/10 [00:26<00:11,  3.94s/it]

Average Hit Ratio @ 10 for 35 latent factors: 0.7615


Latent Factors:  80%|████████  | 8/10 [00:30<00:07,  4.00s/it]

Average Hit Ratio @ 10 for 40 latent factors: 0.7439


Latent Factors:  90%|█████████ | 9/10 [00:35<00:04,  4.05s/it]

Average Hit Ratio @ 10 for 45 latent factors: 0.7362


Latent Factors: 100%|██████████| 10/10 [00:39<00:00,  3.93s/it]

Average Hit Ratio @ 10 for 50 latent factors: 0.7251





In [13]:
# Pick the latent-factor count with the highest *mean* Hit Ratio @ 10 across folds.
# The dict values are per-fold score lists; using them directly as the max() key
# would compare lists lexicographically (i.e. effectively by the first fold only),
# so reduce each list to its mean before ranking.
mean_hit_ratio_by_k = {
    k: np.mean(scores) for k, scores in fold_hit_ratio_results.items()
}
best_k = max(mean_hit_ratio_by_k, key=mean_hit_ratio_by_k.get)
print(f'Best number of latent factors: {best_k}')
print(f'Average Hit Ratio @ 10: {mean_hit_ratio_by_k[best_k]:.4f}')
# Highest single-fold score obtained with the selected latent-factor count
print(f'Best Hit Ratio @ 10: {max(fold_hit_ratio_results[best_k]):.4f}')

Best number of latent factors: 10
Average Hit Ratio @ 10: 0.7974
Best Hit Ratio @ 10: 0.8200
