In [36]:
from surprise import SVD
from surprise import Dataset, NormalPredictor, Reader
from surprise.model_selection import cross_validate
import pandas as pd
import numpy as np
from IPython.display import display
# Render floats in pandas output with four decimal places.
pd.options.display.float_format = "{:.4f}".format

# Seed NumPy's global RNG so that the random fold assignment and SVD's random
# initialisation can be repeated after Restart Kernel -> Run All.
# NOTE(review): this relies on Surprise drawing from NumPy's global RNG when
# no explicit random_state is passed — confirm for the installed version.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

# Load the movielens-100k dataset (download it if needed).
data = Dataset.load_builtin("ml-100k")

# Use the famous SVD algorithm with its default hyper-parameters.
algo = SVD()

# Run 3-fold cross-validation (cv=3) and print per-fold RMSE / MSE / MAE.
# The returned dict of per-fold scores and timings is the cell's output.
cross_validate(algo, data, measures=["RMSE", "MSE", "MAE"], cv=3, verbose=True)


Evaluating RMSE, MSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9471  0.9494  0.9415  0.9460  0.0033  
MSE (testset)     0.8970  0.9013  0.8865  0.8949  0.0062  
MAE (testset)     0.7502  0.7480  0.7433  0.7472  0.0029  
Fit time          0.55    0.55    0.55    0.55    0.00    
Test time         0.12    0.18    0.17    0.16    0.02    


{'test_rmse': array([0.94710856, 0.94936938, 0.94152899]),
 'test_mse': array([0.89701463, 0.90130222, 0.88647683]),
 'test_mae': array([0.75023223, 0.7480054 , 0.7432546 ]),
 'fit_time': (0.5517251491546631, 0.5546228885650635, 0.5545761585235596),
 'test_time': (0.12381267547607422, 0.1782059669494629, 0.16785907745361328)}

In [37]:
# Column dtypes for the prepared ratings file: both ids are read as strings,
# the rating as a 64-bit float.
data_types = dict(user_id=str, item_id=str, rating=np.float64)

# Read the gzip-compressed CSV of (user_id, item_id, rating) rows.
user_item_ratings_df = pd.read_csv(
    "./data/usable_user_item_ratings_prepared.csv.gz",
    dtype=data_types,
    compression="gzip",
)
print(user_item_ratings_df.shape)

# Observed rating bounds; kept as module-level names for later cells.
min_rating = user_item_ratings_df["rating"].min()
max_rating = user_item_ratings_df["rating"].max()

# Preview the first three rows.
display(user_item_ratings_df.head(3))

(1522154, 3)


Unnamed: 0,user_id,item_id,rating
0,U000003,I00037925,0.61
1,U000003,I00189384,0.61
2,U000003,I00256366,0.61


In [38]:
# Loading from a DataFrame still goes through a Reader; here it is given only
# the rating scale, taken from the observed min/max ratings.
reader = Reader(rating_scale=(min_rating, max_rating))

# load_from_df expects the columns in this order: user id, item id, rating.
ratings_frame = user_item_ratings_df.loc[:, ["user_id", "item_id", "rating"]]
data = Dataset.load_from_df(ratings_frame, reader)

# Evaluate the previously created `algo` on this dataset with 3-fold
# cross-validation; the returned dict of scores is the cell's output.
cross_validate(algo, data, measures=["RMSE", "MSE", "MAE"], cv=3, verbose=True)


Evaluating RMSE, MSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.0974  0.0974  0.0976  0.0974  0.0001  
MSE (testset)     0.0095  0.0095  0.0095  0.0095  0.0000  
MAE (testset)     0.0685  0.0684  0.0686  0.0685  0.0001  
Fit time          19.24   19.39   19.58   19.40   0.14    
Test time         3.35    2.76    3.25    3.12    0.26    


{'test_rmse': array([0.09741097, 0.09735497, 0.09756615]),
 'test_mse': array([0.0094889 , 0.00947799, 0.00951915]),
 'test_mae': array([0.06846661, 0.06842848, 0.06862416]),
 'fit_time': (19.236568927764893, 19.39401602745056, 19.579569101333618),
 'test_time': (3.345428943634033, 2.762321710586548, 3.2537903785705566)}