In [31]:
# Matrix Factorization using SVD

# This notebook implements matrix factorization using Singular Value Decomposition (SVD) to learn latent user and item representations for scalable recommendation.


In [32]:
import pandas as pd
import numpy as np

# Read the ratings table and the movie table from the shared data folder.
ratings, movies = (
    pd.read_csv("../data/ratings.csv"),
    pd.read_csv("../data/movies.csv"),
)


In [33]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the ratings for evaluation; the fixed seed keeps the split
# reproducible across runs.
train, test = train_test_split(
    ratings,
    test_size=0.2,
    random_state=42
)

# Detach both partitions from `ratings`: prediction columns are added to `test`
# in later cells, and column assignment on a frame obtained by row selection
# can raise pandas' SettingWithCopyWarning.
train = train.copy()
test = test.copy()


In [34]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Baseline 1 -- global average: every test row is predicted as the mean
# rating observed in the training split.
global_mean = train['rating'].mean()
test['pred_global'] = global_mean
rmse_global = np.sqrt(mean_squared_error(test['rating'], test['pred_global']))

# Baseline 2 -- movie average: each test row is predicted as the training mean
# of its movie; movies absent from the training split fall back to the global
# mean.
movie_means = train.groupby('movieId')['rating'].mean()
test['pred_movie'] = test['movieId'].map(movie_means).fillna(global_mean)
rmse_movie = np.sqrt(mean_squared_error(test['rating'], test['pred_movie']))


In [35]:
# Reference scores copied from the previous (Collaborative Filtering) notebook;
# they are not recomputed here.
rmse_user_cf, rmse_item_cf = 0.9191, 0.9014


In [36]:
# Reshape the training ratings into a users x movies table: one row per userId,
# one column per movieId, rating as the cell value (NaN where no rating exists).
user_item_matrix = train.pivot_table(values='rating', index='userId', columns='movieId')

user_item_matrix.shape


(610, 8983)

In [37]:
# Per-user average rating, taken across the movie columns of each row.
user_means = user_item_matrix.mean(axis='columns')

In [38]:
# Subtract each user's mean from that user's row, then put 0 in the empty
# cells -- after centering, 0 corresponds to the user's own average.
user_item_centered = user_item_matrix.sub(user_means, axis='index').fillna(0)

In [39]:
from sklearn.decomposition import TruncatedSVD

# Rank-20 truncated SVD of the centered matrix; the seed is fixed for
# reproducibility.
svd = TruncatedSVD(n_components=20, random_state=42)

# One 20-dimensional latent vector per row (user) of the centered matrix.
latent_matrix = svd.fit_transform(user_item_centered)
latent_matrix.shape


(610, 20)

In [40]:
# Low-rank reconstruction of the centered ratings: user factors times the
# component matrix learned by the SVD.
reconstructed_matrix = latent_matrix.dot(svd.components_)

# Add each user's mean back (as a column vector, broadcast across the movie
# columns) to return to the original rating scale.
reconstructed_matrix = reconstructed_matrix + user_means.to_numpy()[:, np.newaxis]

# Re-attach the userId / movieId labels of the pivot table for label lookup.
reconstructed_df = pd.DataFrame(
    reconstructed_matrix,
    columns=user_item_matrix.columns,
    index=user_item_matrix.index,
)


In [41]:
def predict_svd(user_id, movie_id, predictions=None):
    """Look up the reconstructed rating for a single (user, movie) pair.

    Parameters
    ----------
    user_id
        Row label to look up.
    movie_id
        Column label to look up.
    predictions : pandas.DataFrame, optional
        Table of predicted ratings with users as the index and movies as the
        columns. When omitted, the notebook-level ``reconstructed_df`` is used,
        so existing two-argument calls behave exactly as before.

    Returns
    -------
    The value stored at ``(user_id, movie_id)``, or ``np.nan`` when the user is
    not in the index or the movie is not in the columns.
    """
    if predictions is None:
        predictions = reconstructed_df

    # Either label missing means there is nothing to look up for this pair.
    if user_id not in predictions.index or movie_id not in predictions.columns:
        return np.nan
    return predictions.loc[user_id, movie_id]


In [42]:
# Score the whole test split in one vectorized lookup instead of a row-by-row
# `apply`: translate ids to row/column positions of the reconstructed table
# (get_indexer returns -1 for labels it does not contain).
user_pos = reconstructed_df.index.get_indexer(test['userId'])
movie_pos = reconstructed_df.columns.get_indexer(test['movieId'])
known_pair = (user_pos >= 0) & (movie_pos >= 0)

# Pairs whose user or movie is missing from the reconstructed table stay NaN.
svd_scores = np.full(len(test), np.nan)
svd_scores[known_pair] = reconstructed_df.to_numpy()[
    user_pos[known_pair], movie_pos[known_pair]
]
test['pred_svd'] = svd_scores

# Those NaN pairs fall back to the training-set global mean rating.
global_mean = train['rating'].mean()
test['pred_svd'] = test['pred_svd'].fillna(global_mean)


In [43]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the SVD predictions on the held-out ratings.
rmse_svd = np.sqrt(mean_squared_error(y_true=test['rating'], y_pred=test['pred_svd']))

rmse_svd


np.float64(0.9303722272095117)

In [44]:
# Side-by-side RMSE of every model considered, in the order they were
# introduced.
model_rmse = {
    "Global Average": rmse_global,
    "Movie Average": rmse_movie,
    "User-Based CF": rmse_user_cf,
    "Item-Based CF": rmse_item_cf,
    "Matrix Factorization (SVD)": rmse_svd,
}

pd.DataFrame({
    "Model": list(model_rmse.keys()),
    "RMSE": list(model_rmse.values()),
})

## Matrix Factorization Insights

# - SVD learns compact latent representations of users and movies.
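# - With 20 components it beats both averaging baselines (RMSE 0.930 vs. 1.049 and 0.983), but in this experiment it does not outperform similarity-based collaborative filtering (0.919 user-based, 0.901 item-based).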
# - The model is scalable and suitable for large datasets.
# - Matrix factorization forms the foundation of modern recommender systems.


Unnamed: 0,Model,RMSE
0,Global Average,1.048841
1,Movie Average,0.982739
2,User-Based CF,0.9191
3,Item-Based CF,0.9014
4,Matrix Factorization (SVD),0.930372
