In [1]:
# Matrix Factorization using Surprise SVD

# This notebook uses the Surprise library's SVD implementation, which includes bias terms and regularization, similar to production recommender systems.


In [2]:
! pip install scikit-surprise




In [3]:
# Display the path of the Python interpreter backing this kernel.
# `sys` stays imported because the pip commands that follow interpolate
# `sys.executable`.
import sys
sys.executable


'/Users/admin/Desktop/recommender-research/venv/bin/python3.12'

In [4]:
# Reinstall NumPy constrained to <2.0, then reinstall scikit-surprise, running
# pip through the kernel's own interpreter (`sys.executable`) so the changes
# apply to this environment. Requires `sys` to be imported already.
# NOTE(review): presumably a workaround for scikit-surprise not working with
# NumPy 2.x — confirm, and consider pinning both versions.
!{sys.executable} -m pip uninstall -y numpy
!{sys.executable} -m pip install "numpy<2.0"
!{sys.executable} -m pip uninstall -y scikit-surprise
!{sys.executable} -m pip install scikit-surprise


Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Collecting numpy<2.0
  Using cached numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl (13.7 MB)
Installing collected packages: numpy
Successfully installed numpy-1.26.4
Found existing installation: scikit-surprise 1.1.4
Uninstalling scikit-surprise-1.1.4:
  Successfully uninstalled scikit-surprise-1.1.4
Collecting scikit-surprise
  Using cached scikit_surprise-1.1.4-cp312-cp312-macosx_15_0_arm64.whl
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4


In [5]:
# Sanity check: display the NumPy version the kernel actually imports.
import numpy as np
np.__version__


'1.26.4'

In [7]:
import pandas as pd
import numpy as np

from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy


In [8]:
# Read the ratings CSV into a DataFrame and preview its first rows.
ratings_csv = "../data/ratings.csv"
ratings = pd.read_csv(ratings_csv)
ratings.head()


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
# Declare the rating scale (0.5 to 5.0) that Surprise should assume.
reader = Reader(rating_scale=(0.5, 5.0))

# load_from_df expects exactly three columns in (user, item, rating) order.
rating_triplets = ratings[['userId', 'movieId', 'rating']]
data = Dataset.load_from_df(rating_triplets, reader)


In [10]:
# Hold out 20% of the ratings for evaluation; the fixed seed keeps the split
# identical across runs.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)


In [11]:
# SVD hyperparameters gathered in one mapping so they are easy to inspect and tune.
svd_params = dict(
    n_factors=50,     # number of latent factors
    n_epochs=20,      # number of SGD epochs
    lr_all=0.005,     # learning rate applied to all parameters
    reg_all=0.02,     # regularization term applied to all parameters
    random_state=42,  # seed for reproducible training
)
svd = SVD(**svd_params)

# Train on the training split; the call is the cell's last expression, so its
# return value is displayed.
svd.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x11c5690a0>

In [12]:
# Predict every held-out rating, then compute RMSE over those predictions.
# With verbose=True (the library default) accuracy.rmse also prints the score.
predictions = svd.test(testset)
rmse_surprise = accuracy.rmse(predictions, verbose=True)


RMSE: 0.8775


In [13]:
# Baseline RMSE values are hard-coded here; `rmse_surprise` comes from the
# evaluation cell of this notebook.
# NOTE(review): presumably these baselines were copied from earlier
# experiments — confirm they were measured on the same train/test split.
rmse_global = 1.048841
rmse_movie = 0.982739
rmse_user_cf = 0.919100
rmse_item_cf = 0.901400

# One (model name, RMSE) pair per row, in the order the table should show them.
model_scores = [
    ("Global Average", rmse_global),
    ("Movie Average", rmse_movie),
    ("User-Based CF", rmse_user_cf),
    ("Item-Based CF", rmse_item_cf),
    ("Surprise SVD", rmse_surprise),
]

pd.DataFrame(model_scores, columns=["Model", "RMSE"])

## Why Surprise SVD Performs Best

# - Explicit modeling of user and item biases
# - Regularization prevents overfitting
# - Latent factor learning captures hidden preferences
# - Industry-standard implementation used in research and production


Unnamed: 0,Model,RMSE
0,Global Average,1.048841
1,Movie Average,0.982739
2,User-Based CF,0.9191
3,Item-Based CF,0.9014
4,Surprise SVD,0.877468
