In [None]:
# Attach Google Drive to the Colab runtime. The trained model is written
# under this mount point by the last cell of the notebook.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!pip install scikit-surprise
!pip install numpy==1.23.5 --upgrade

!pip install cython --upgrade
!pip install scikit-surprise --upgrade

Collecting numpy==1.23.5
  Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m114.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.23.5 which is incompatible.
albumentations 2.0.8 requires numpy>=1.24.4, but you have numpy 1.23.5 which is incompatible.
xarray 2025.3.1 requires numpy>=1.24, but you have numpy 1.23.5 which is incompatible.
jax 0.5.2 requires nu

Collecting cython
  Using cached cython-3.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.9 kB)
Using cached cython-3.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[31mERROR: Operation cancelled by user[0m[31m
[0m^C


In [None]:

import requests
import zipfile
import io

print("Library installed and modules imported.")

# Download dataset
print("Downloading MovieLens dataset...")
# HTTPS protects the archive while it is in transit.
url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
# The timeout stops the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=60)
# Fail right here on an HTTP error (404, 5xx, ...) instead of handing an
# error page to ZipFile and getting an unrelated BadZipFile exception.
r.raise_for_status()

print("Unzipping dataset...")
# The archive is unpacked into the current working directory; the context
# manager closes the ZipFile as soon as extraction is done.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall()
print("Dataset is ready in the 'ml-latest-small' folder.")

Library installed and modules imported.
Downloading MovieLens dataset...
Unzipping dataset...
Dataset is ready in the 'ml-latest-small' folder.


In [None]:
#Imports and Data Loading
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import zipfile
import io

# Load the ratings and the movie metadata from the unzipped dataset folder.
# If the CSV files are not on disk yet, download and unpack the archive first.
print("Loading data...")
try:
    ratings_df = pd.read_csv('ml-latest-small/ratings.csv')
    movies_df = pd.read_csv('ml-latest-small/movies.csv')
    print("Data found locally.")
except FileNotFoundError:
    print("Data not found. Downloading and unzipping...")
    # HTTPS protects the archive while it is in transit.
    url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
    # Bounded wait + explicit HTTP status check, so a failed download raises
    # a clear requests exception instead of a BadZipFile further down.
    r = requests.get(url, timeout=60)
    r.raise_for_status()
    # The context manager closes the archive once extraction is done.
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extractall()
    ratings_df = pd.read_csv('ml-latest-small/ratings.csv')
    movies_df = pd.read_csv('ml-latest-small/movies.csv')
    print("Dataset is ready.")

# Attach title/genres to every rating. The join type is spelled out, and
# validate='many_to_one' makes pandas raise if a movieId appears more than
# once in movies_df, which would otherwise silently duplicate rating rows.
df = pd.merge(ratings_df, movies_df, on='movieId', how='inner', validate='many_to_one')
print(df.head())

Loading data...
Data found locally.
   userId  movieId  rating  timestamp                        title  \
0       1        1     4.0  964982703             Toy Story (1995)   
1       1        3     4.0  964981247      Grumpier Old Men (1995)   
2       1        6     4.0  964982224                  Heat (1995)   
3       1       47     5.0  964983815  Seven (a.k.a. Se7en) (1995)   
4       1       50     5.0  964982931   Usual Suspects, The (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                               Comedy|Romance  
2                        Action|Crime|Thriller  
3                             Mystery|Thriller  
4                       Crime|Mystery|Thriller  


In [None]:
#Prepare Data for Surprise and Run Baseline Model

from surprise import Reader, Dataset, BaselineOnly
from surprise.model_selection import KFold, cross_validate

# Seed for the cross-validation split. With a fixed seed the five folds are
# the same on every run; using the same seed value wherever this notebook
# cross-validates puts every model on identical folds.
RANDOM_STATE = 42

# The Reader tells Surprise the range of the rating values.
reader = Reader(rating_scale=(0.5, 5.0))

# load_from_df expects exactly three columns in the order user, item, rating.
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

# Model 1: BaselineOnly
print("Evaluating BaselineOnly Model...")
algo_baseline = BaselineOnly()

# 5-fold cross-validation on a seeded, shuffled split.
results_baseline = cross_validate(
    algo_baseline,
    data,
    measures=['RMSE', 'MAE'],
    cv=KFold(n_splits=5, random_state=RANDOM_STATE, shuffle=True),
    verbose=True,
)

# Storing the mean RMSE for later comparison
rmse_baseline = results_baseline['test_rmse'].mean()
print(f"\nAverage RMSE for BaselineOnly: {rmse_baseline:.4f}")

Evaluating BaselineOnly Model...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8745  0.8774  0.8683  0.8809  0.8613  0.8725  0.0069  
MAE (testset)     0.6740  0.6749  0.6737  0.6769  0.6651  0.6729  0.0040  
Fit time          0.86    0.41    0.32    0.74    0.37    0.54    0.22    
Test time         0.12    0.13    0.17    0.42    0.09    0.19    0.12    

Average RMSE for BaselineOnly: 0.8725


In [None]:
from surprise import SVD
from surprise.model_selection import KFold

# Seed shared by the fold split and by SVD's random factor initialisation,
# so that re-running this cell reproduces the same scores.
RANDOM_STATE = 42

# Model 2: SVD
print("\nEvaluating SVD Model...")
algo_svd = SVD(random_state=RANDOM_STATE)

# 5-fold cross-validation on a seeded, shuffled split.
results_svd = cross_validate(
    algo_svd,
    data,
    measures=['RMSE', 'MAE'],
    cv=KFold(n_splits=5, random_state=RANDOM_STATE, shuffle=True),
    verbose=True,
)

# Storing the mean RMSE for comparison
rmse_svd = results_svd['test_rmse'].mean()
print(f"\nAverage RMSE for SVD: {rmse_svd:.4f}")


Evaluating SVD Model...
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8658  0.8660  0.8777  0.8824  0.8772  0.8738  0.0067  
MAE (testset)     0.6638  0.6678  0.6733  0.6771  0.6720  0.6708  0.0046  
Fit time          1.23    1.10    1.09    1.12    1.09    1.13    0.06    
Test time         0.22    0.09    0.20    0.10    0.27    0.17    0.07    

Average RMSE for SVD: 0.8738


In [None]:
from surprise import SVDpp
from surprise.model_selection import KFold

# Seed shared by the fold split and by SVD++'s random factor initialisation,
# so that re-running this (slow) cell reproduces the same scores.
RANDOM_STATE = 42

# Model 3: SVD++
print("\nEvaluating SVD++ Model...")
algo_svdpp = SVDpp(random_state=RANDOM_STATE)

# 5-fold cross-validation on a seeded, shuffled split.
results_svdpp = cross_validate(
    algo_svdpp,
    data,
    measures=['RMSE', 'MAE'],
    cv=KFold(n_splits=5, random_state=RANDOM_STATE, shuffle=True),
    verbose=True,
)

# Storing the mean RMSE for comparison
rmse_svdpp = results_svdpp['test_rmse'].mean()
print(f"\nAverage RMSE for SVD++: {rmse_svdpp:.4f}")


Evaluating SVD++ Model...
Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8591  0.8626  0.8617  0.8611  0.8627  0.8614  0.0013  
MAE (testset)     0.6613  0.6638  0.6612  0.6589  0.6580  0.6606  0.0020  
Fit time          76.40   76.65   76.30   75.77   75.98   76.22   0.31    
Test time         7.88    8.22    8.00    8.08    8.06    8.05    0.11    

Average RMSE for SVD++: 0.8614


In [None]:
# Step 1: Comparing Models Performance
# Collect the mean cross-validated RMSE of each model, keyed by display name
# (insertion order is kept, so the rows come out in this order).
rmse_by_model = {
    'BaselineOnly': rmse_baseline,
    'SVD': rmse_svd,
    'SVD++': rmse_svdpp,
}
model_comparison = pd.DataFrame({
    'Model': list(rmse_by_model.keys()),
    'RMSE': list(rmse_by_model.values()),
})

# Lowest RMSE (best model) is printed first.
print("--- Model Performance Comparison ---")
ranked_models = model_comparison.sort_values(by='RMSE')
print(ranked_models)


# Step 2: Training the final SVD++ model on the whole dataset
# build_full_trainset() uses every rating (no held-out fold) for the final fit.
trainset = data.build_full_trainset()
# A fixed random_state makes the factor initialisation, and therefore the
# trained model and its recommendations, reproducible across runs.
final_model = SVDpp(random_state=42)
print("\nTraining the final SVD++ model on the entire dataset...")
final_model.fit(trainset)
print("Training complete.")


# Step 3: Creating a function to get top-N recommendations
def get_top_n_recommendations(user_id, n=10, model=None, ratings=None, movies=None):
    """Return the n unrated movies with the highest predicted rating for a user.

    Candidate movies are every movieId present in ``ratings`` that the user
    has no rating for. Each candidate is scored with ``model.predict`` and the
    n highest estimates are returned together with the movie metadata.

    Parameters
    ----------
    user_id : raw user id, as it appears in the ``userId`` column.
    n : int, default 10
        Number of recommendations to return. Must not be negative.
    model : optional
        Object exposing ``predict(uid=..., iid=...)`` whose result has ``iid``
        and ``est`` attributes. Defaults to the notebook-level ``final_model``.
    ratings : pandas.DataFrame, optional
        Frame with ``userId`` and ``movieId`` columns. Defaults to the
        notebook-level ``df``.
    movies : pandas.DataFrame, optional
        Frame with ``movieId``, ``title`` and ``genres`` columns. Defaults to
        the notebook-level ``movies_df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId``, ``title``, ``genres``, ``predicted_rating``,
        sorted by ``predicted_rating`` in descending order.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    # A negative n would make the slice below drop items from the END of the
    # ranking and return almost every candidate, so reject it explicitly.
    if n is not None and n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    # Fall back to the notebook-level objects when no explicit inputs are given.
    if model is None:
        model = final_model
    if ratings is None:
        ratings = df
    if movies is None:
        movies = movies_df

    # Candidates: every movie seen in the ratings minus the ones this user rated.
    all_movie_ids = ratings['movieId'].unique()
    rated_movie_ids = ratings.loc[ratings['userId'] == user_id, 'movieId'].unique()
    candidate_ids = np.setdiff1d(all_movie_ids, rated_movie_ids)

    # Score every candidate and keep the n highest estimates.
    predictions = [model.predict(uid=user_id, iid=movie_id) for movie_id in candidate_ids]
    top_n_preds = sorted(predictions, key=lambda pred: pred.est, reverse=True)[:n]

    # Look up the metadata of the selected movies; .copy() so the column
    # assignment below does not write into a view of `movies`.
    predicted_ratings = {pred.iid: pred.est for pred in top_n_preds}
    top_n_movies = movies[movies['movieId'].isin(list(predicted_ratings))].copy()
    top_n_movies['predicted_rating'] = top_n_movies['movieId'].map(predicted_ratings)

    # isin() returns rows in `movies` order, so re-sort by the estimate.
    result_columns = ['movieId', 'title', 'genres', 'predicted_rating']
    return top_n_movies[result_columns].sort_values(by='predicted_rating', ascending=False)


# Step 4: Get and display recommendations for a sample user
# Changing this user ID will give different recommendations
user_id_to_recommend = 196
# Single source of truth for the list length, so the printed header can
# never disagree with the number of recommendations actually requested.
n_recommendations = 10
top_movies = get_top_n_recommendations(user_id=user_id_to_recommend, n=n_recommendations)

print(f"\n--- Top {n_recommendations} Movie Recommendations for User {user_id_to_recommend} ---")
print(top_movies)

--- Model Performance Comparison ---
          Model      RMSE
2         SVD++  0.861429
0  BaselineOnly  0.872485
1           SVD  0.873817

Training the final SVD++ model on the entire dataset...
Training complete.

--- Top 10 Movie Recommendations for User 196 ---
      movieId                                           title  \
277       318                Shawshank Redemption, The (1994)   
841      1104                Streetcar Named Desire, A (1951)   
461       527                         Schindler's List (1993)   
1730     2324      Life Is Beautiful (La Vita è bella) (1997)   
224       260       Star Wars: Episode IV - A New Hope (1977)   
2582     3451             Guess Who's Coming to Dinner (1967)   
413       475                In the Name of the Father (1993)   
982      1283                                High Noon (1952)   
914      1213                               Goodfellas (1990)   
4025     5690  Grave of the Fireflies (Hotaru no haka) (1988)   

                

In [None]:
# Mount Google Drive so the trained model can be saved under /content/drive.
# NOTE(review): the first cell of this notebook already mounts the same path,
# so this second mount presumably only matters if the runtime was restarted
# in between. Confirm, and drop this cell if it is redundant.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from surprise import dump
import os

# Destination of the serialized model on the mounted Google Drive.
model_path = "/content/drive/My Drive/Colab Notebooks/models/svdpp_model.dump"

# Create the target folder (and any missing parents) before writing;
# exist_ok=True makes this a no-op when the folder is already there.
model_dir = os.path.dirname(model_path)
os.makedirs(model_dir, exist_ok=True)

# Persist the fitted algorithm with Surprise's dump helper.
dump.dump(model_path, algo=final_model)
print(f"Model saved to Google Drive at: {model_path}")

Model saved to Google Drive at: /content/drive/My Drive/Colab Notebooks/models/svdpp_model.dump
