In [39]:
from surprise import SVD
from surprise import Dataset, NormalPredictor, Reader
from surprise.model_selection import cross_validate
import pandas as pd
import numpy as np
from IPython.display import display
# Render floats in pandas output with four decimal places.
pd.options.display.float_format = "{:.4f}".format

# Load the movielens-100k dataset (download it if needed).
data = Dataset.load_builtin("ml-100k")

# Seed NumPy's global RNG (the approach the Surprise FAQ recommends for
# reproducible experiments) so the shuffled folds and SVD's random
# initialisation are the same on every run.
np.random.seed(0)

# Run 3-fold cross-validation and print results.
cross_validate(SVD(), data, measures=["RMSE", "MSE", "MAE"], cv=3, verbose=True)


Evaluating RMSE, MSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9428  0.9454  0.9438  0.9440  0.0011  
MSE (testset)     0.8888  0.8939  0.8907  0.8911  0.0021  
MAE (testset)     0.7439  0.7455  0.7453  0.7449  0.0007  
Fit time          0.50    0.49    0.47    0.48    0.01    
Test time         0.12    0.12    0.12    0.12    0.00    


{'test_rmse': array([0.94278482, 0.94544952, 0.94377048]),
 'test_mse': array([0.88884321, 0.8938748 , 0.89070272]),
 'test_mae': array([0.74385613, 0.74548957, 0.74525875]),
 'fit_time': (0.5007278919219971, 0.48601698875427246, 0.4671149253845215),
 'test_time': (0.11835694313049316, 0.11984610557556152, 0.12009882926940918)}

In [40]:
# Read the prepared (user, item, rating) triples: ids are parsed as strings
# and ratings as 64-bit floats.
data_types = dict(user_id=str, item_id=str, rating=np.float64)
user_item_ratings_df = pd.read_csv(
    "./data/usable_user_item_ratings_prepared.csv.gz",
    dtype=data_types,
    compression="gzip",
)
print(user_item_ratings_df.shape)

# Smallest and largest rating present in the loaded frame.
min_rating, max_rating = (
    user_item_ratings_df["rating"].min(),
    user_item_ratings_df["rating"].max(),
)
display(user_item_ratings_df.head(n=3))

(1522154, 3)


Unnamed: 0,user_id,item_id,rating
0,U000003,I00037925,0.61
1,U000003,I00189384,0.61
2,U000003,I00256366,0.61


In [41]:
# A reader is still needed but only the rating_scale param is required.
# The scale is taken from the rating bounds observed in the loaded frame.
reader = Reader(rating_scale=(min_rating, max_rating))

# The columns must correspond to user id, item id and ratings (in that order).
# NOTE: this rebinds `data`, replacing the ml-100k dataset loaded earlier.
data = Dataset.load_from_df(
    user_item_ratings_df[["user_id", "item_id", "rating"]], reader
)

# Seed NumPy's global RNG (the approach the Surprise FAQ recommends for
# reproducible experiments) so the shuffled folds and SVD's random
# initialisation are the same on every run.
np.random.seed(0)

# We can now use this dataset as we please, e.g. calling cross_validate
# (3-fold here, reporting RMSE, MSE and MAE).
cross_validate(SVD(), data, measures=["RMSE", "MSE", "MAE"], cv=3, verbose=True)


Evaluating RMSE, MSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.0974  0.0973  0.0976  0.0974  0.0001  
MSE (testset)     0.0095  0.0095  0.0095  0.0095  0.0000  
MAE (testset)     0.0685  0.0684  0.0686  0.0685  0.0001  
Fit time          18.47   18.16   18.66   18.43   0.20    
Test time         3.67    3.11    3.51    3.43    0.24    


{'test_rmse': array([0.09740225, 0.09729641, 0.09757146]),
 'test_mse': array([0.0094872 , 0.00946659, 0.00952019]),
 'test_mae': array([0.06846195, 0.06843257, 0.0686222 ]),
 'fit_time': (18.469423055648804, 18.16381001472473, 18.66159415245056),
 'test_time': (3.6705398559570312, 3.1078100204467773, 3.513416051864624)}