**ADVANCED TASKS**

First task:

Create a train/test split on the user rating data to evaluate the predictions made by the built
recommender system. Use metrics such as Mean Absolute Error and Mean Squared Error (MAE/MSE) to assess
the quality of the predicted ratings. Select the subset of users who have at least a certain number of
ratings across books. Additionally, optimize the number of “features” on a held-out validation
set according to the quality of the approximation.


In [None]:
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
# Both libraries export `train_test_split`; the aliases make each call site unambiguous.
from sklearn.model_selection import train_test_split as sklearn_train_test_split
from surprise import SVD, Dataset, Reader
from surprise import accuracy
# NOTE: this import rebinds the bare name `train_test_split` to Surprise's version.
from surprise.model_selection import train_test_split
from surprise.model_selection import train_test_split as surprise_train_test_split


In [None]:
# Read the two CSV inputs (paths are relative to the notebook's working directory).
ratings, books = (pd.read_csv(csv_name) for csv_name in ("ratings.csv", "books.csv"))

In [None]:
# Minimum number of ratings a user must have to be kept for evaluation.
MIN_RATINGS_PER_USER = 10

# Count ratings per user, then keep only the rows belonging to users that
# reach the threshold.
user_rating_counts = ratings['user_id'].value_counts()
active_users = user_rating_counts[user_rating_counts >= MIN_RATINGS_PER_USER].index
filtered_ratings = ratings[ratings['user_id'].isin(active_users)]

Now we split the ratings into training and test sets.

In [None]:
# Use scikit-learn's splitter through its explicit alias: the bare name
# `train_test_split` is rebound by the later `surprise.model_selection` import,
# and that version expects a Surprise Dataset rather than a DataFrame.
train_ratings, test_ratings = sklearn_train_test_split(filtered_ratings, test_size=0.2, random_state=42)


In [None]:
# Build dense user x book rating matrices; unrated cells are filled with 0.
# `DataFrame.pivot` raises ValueError when a (user_id, book_id) pair occurs more
# than once, so duplicates are collapsed first (the last occurrence is kept).
pair_keys = ['user_id', 'book_id']
user_book_matrix_train = (
    train_ratings
    .drop_duplicates(subset=pair_keys, keep='last')
    .pivot(index='user_id', columns='book_id', values='rating')
    .fillna(0)
)
user_book_matrix_test = (
    test_ratings
    .drop_duplicates(subset=pair_keys, keep='last')
    .pivot(index='user_id', columns='book_id', values='rating')
    .fillna(0)
)


In [None]:
# Convert both dense rating matrices to CSR format.
sparse_matrix_train, sparse_matrix_test = (
    csr_matrix(dense_matrix.to_numpy())
    for dense_matrix in (user_book_matrix_train, user_book_matrix_test)
)


In [None]:
# Candidate numbers of latent features, plus one (initially empty) score list
# per metric; the evaluation loop fills these in the same order as k_values.
k_values = [10, 20, 50, 100]
mae_scores, mse_scores = [], []


Evaluation loop (to be integrated into a script as a function)

In [None]:
# Score a rank-k truncated SVD of the zero-filled training matrix on the
# held-out test ratings, for every candidate k.

# Start from empty lists so that re-running this cell does not append a second
# set of results (which would misalign the scores with `k_values`).
mae_scores = []
mse_scores = []

# Only (user, book) pairs that exist in the training matrix can be predicted.
# This filter and the positional lookup do not depend on k, so do them once.
train_users = user_book_matrix_train.index
train_books = user_book_matrix_train.columns
test_ratings_eval = test_ratings[
    test_ratings['user_id'].isin(train_users) & test_ratings['book_id'].isin(train_books)
]
user_pos = train_users.get_indexer(test_ratings_eval['user_id'])
book_pos = train_books.get_indexer(test_ratings_eval['book_id'])

for k in k_values:
    svd = TruncatedSVD(n_components=k, random_state=42)
    # fit_transform returns the projected data, i.e. U * Sigma, not U alone.
    U_train = svd.fit_transform(sparse_matrix_train)
    # Kept for inspection only: the singular values are already folded into
    # U_train, so multiplying by Sigma again would yield U * Sigma^2 * Vt.
    Sigma_train = np.diag(svd.singular_values_)
    Vt_train = svd.components_

    # Rank-k reconstruction of the training matrix.
    R_approx = U_train @ Vt_train

    predicted_ratings = pd.DataFrame(R_approx, index=train_users, columns=train_books)

    # Vectorised lookup of the predicted value for each test pair; `assign`
    # returns a new frame, so no write is made to a slice of `test_ratings`.
    test_ratings_filtered = test_ratings_eval.assign(
        predicted_rating=R_approx[user_pos, book_pos]
    ).dropna(subset=['rating', 'predicted_rating'])

    # Compute MAE and MSE
    mae = mean_absolute_error(test_ratings_filtered['rating'], test_ratings_filtered['predicted_rating'])
    mse = mean_squared_error(test_ratings_filtered['rating'], test_ratings_filtered['predicted_rating'])
    mae_scores.append(mae)
    mse_scores.append(mse)

In [None]:
# Draw both error curves against k on a single set of axes.
fig, ax = plt.subplots(figsize=(8, 5))
for scores, marker, label in ((mae_scores, 'o', 'MAE'), (mse_scores, 's', 'MSE')):
    ax.plot(k_values, scores, marker=marker, label=label)
ax.set_xlabel("Number of Latent Features (k)")
ax.set_ylabel("Error")
ax.set_title("SVD Optimization: MAE & MSE vs. Number of Features")
ax.legend()
ax.grid()
plt.show()

In [None]:
# Select the k with the lowest MAE (np.argmin returns the first minimum on ties).
best_k_position = np.argmin(mae_scores)
best_k = k_values[best_k_position]
print(f"Optimal number of latent features: {best_k}")

**SECOND ADVANCED TASK**

In [None]:
# Recompute the subset of ratings that belong to users with at least 10 ratings.
user_rating_counts = ratings['user_id'].value_counts()
has_enough_ratings = user_rating_counts >= 10
active_users = user_rating_counts.loc[has_enough_ratings].index
is_active_row = ratings['user_id'].isin(active_users)
filtered_ratings = ratings[is_active_row]

In [None]:
# Wrap the filtered ratings in a Surprise Dataset (columns must be given in
# user, item, rating order).
reader = Reader(rating_scale=(1, 5))  # Assuming ratings are 1-5 scale
data = Dataset.load_from_df(filtered_ratings[['user_id', 'book_id', 'rating']], reader)

# Call Surprise's splitter through its explicit alias so this cell does not
# depend on which library's `train_test_split` was imported last.
trainset, testset = surprise_train_test_split(data, test_size=0.2, random_state=42)

# Initialize and train SVD model
algo = SVD(n_factors=10, random_state=42)
algo.fit(trainset)

The cell above prepared the data for Surprise and trained the model; now evaluate it on the test set.

In [None]:
# Score the fitted model on the held-out test set.
predictions = algo.test(testset)
# verbose=False: the metrics are printed once below, instead of once by
# Surprise (including an "RMSE" line) and once more by this cell.
mae = accuracy.mae(predictions, verbose=False)
mse = accuracy.mse(predictions, verbose=False)
# Report the factor count of the model actually evaluated, so the message stays
# correct if `algo` was refitted by another cell before this one is re-run.
print(f"MAE with {algo.n_factors} factors: {mae:.4f}")
print(f"MSE with {algo.n_factors} factors: {mse:.4f}")

In [None]:

# Sweep over candidate latent-factor counts, fitting a fresh Surprise SVD model
# for each and recording its test-set MAE and MSE.
factor_values = [10, 20, 50, 100]
mae_scores, mse_scores = [], []

for n_factors in factor_values:
    # Fit on the training set, then predict the held-out ratings.
    algo = SVD(n_factors=n_factors, random_state=42)
    algo.fit(trainset)
    predictions = algo.test(testset)

    # MSE is obtained by squaring the RMSE reported by Surprise.
    mae = accuracy.mae(predictions, verbose=False)
    mse = accuracy.rmse(predictions, verbose=False) ** 2
    mae_scores.append(mae)
    mse_scores.append(mse)

    print(f"\nFactors: {n_factors}", f"MAE: {mae:.4f}", f"MSE: {mse:.4f}", sep="\n")

We now find the optimal number of factors based on MAE.

In [None]:
# Choose the factor count with the lowest MAE (first one wins on ties).
lowest_mae_position = np.argmin(mae_scores)
optimal_factors = factor_values[lowest_mae_position]
print(f"\nOptimal number of factors based on MAE: {optimal_factors}")

In [None]:
# Plot MAE and MSE of the Surprise SVD models against the number of factors.
# `plt` comes from the notebook's import cell, so it is not re-imported here.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(factor_values, mae_scores, marker='o', label='MAE')
ax.plot(factor_values, mse_scores, marker='s', label='MSE')
ax.set_xlabel("Number of Latent Factors")
ax.set_ylabel("Error")
ax.set_title("SVD Optimization (Surprise): MAE & MSE vs. Number of Factors")
ax.legend()
ax.grid()
plt.show()