In [45]:
import numpy as np
import pandas as pd
from IPython.display import display
from surprise import SVD
from surprise import Dataset, NormalPredictor, Reader
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate
# Render every float in DataFrame output with four decimal places.
pd.set_option("display.float_format", "{:.4f}".format)

In [46]:
# Column dtypes for the ratings file: both id columns are read as strings,
# the rating column as 64-bit floats.
data_types = dict(user_id=str, item_id=str, rating=np.float64)

user_item_ratings = pd.read_csv(
    "./data/usable_user_item_ratings_prepared.csv.gz", compression="gzip", dtype=data_types
)
print(user_item_ratings.shape)

# Observed rating bounds; a later cell passes them to the Surprise Reader as
# its rating scale.
min_rating, max_rating = (
    user_item_ratings["rating"].min(),
    user_item_ratings["rating"].max(),
)

# Preview the first three rows as the cell output.
user_item_ratings.head(3)

(1522154, 3)


Unnamed: 0,user_id,item_id,rating
0,U000003,I00037925,0.61
1,U000003,I00189384,0.61
2,U000003,I00256366,0.61


In [47]:
MAX_INTERACTIONS_PER_USER = 500
SAMPLE_SIZE = 50000

# Two-step truncation (NOT a random sample):
#   1. keep at most MAX_INTERACTIONS_PER_USER rows per user, in file order;
#   2. keep the first SAMPLE_SIZE of the remaining rows.
# The positional slice does not depend on index labels, so a single
# reset_index at the end yields the same frame as resetting after each step.
user_item_ratings_sample = (
    user_item_ratings.groupby("user_id")
    .head(MAX_INTERACTIONS_PER_USER)
    .iloc[:SAMPLE_SIZE]
    .reset_index(drop=True)
)


In [49]:
# Seed shared by the fold splitter and the SVD initialisation so that the
# reported metrics are reproducible on Restart & Run All.
RANDOM_SEED = 42

# A reader is still needed, but only the rating_scale param is required when
# loading from a DataFrame; use the bounds observed in the full ratings table.
reader = Reader(rating_scale=(min_rating, max_rating))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(
    user_item_ratings_sample[["user_id", "item_id", "rating"]], reader
)

# 3-fold cross-validation of a default SVD model. An explicit, seeded KFold
# replaces `cv=3` so the train/test partition is identical on every run, and
# SVD gets the same seed for its random factor initialisation.
svd_results = cross_validate(
    SVD(random_state=RANDOM_SEED),
    data,
    measures=["RMSE", "MSE", "MAE"],
    cv=KFold(n_splits=3, random_state=RANDOM_SEED, shuffle=True),
    verbose=True,
)


Evaluating RMSE, MSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.0988  0.0999  0.1001  0.0996  0.0006  
MSE (testset)     0.0098  0.0100  0.0100  0.0099  0.0001  
MAE (testset)     0.0762  0.0763  0.0761  0.0762  0.0001  
Fit time          0.45    0.47    0.47    0.46    0.01    
Test time         0.04    0.05    0.05    0.05    0.00    


In [None]:
from surprise import AlgoBase
from pipeliner.recommendations.transformer import (
    UserItemMatrixTransformerNP,
    SimilarityTransformerNP,
)
from pipeliner.recommendations.recommender import (
    SimilarityRecommenderNP,
)

class SurpriseSimilarityRecommender(AlgoBase):
    """Surprise algorithm that predicts one constant for every (user, item) pair.

    In its current form the estimate is simply the mean of all ratings in the
    training set; no similarity information is used yet.
    """

    def __init__(self):
        # The base-class initialiser must run before anything else.
        super().__init__()

    def fit(self, trainset):
        """Store the mean training rating in ``self.the_mean`` and return ``self``."""
        # The base-class fit must run first; ``self.trainset`` is read right after it.
        super().fit(trainset)

        # Each entry from all_ratings() is a (user, item, rating) triple; only
        # the rating is needed. trainset.global_mean would presumably give the
        # same number.
        observed_ratings = [triple[2] for triple in self.trainset.all_ratings()]
        self.the_mean = np.mean(observed_ratings)

        return self

    def estimate(self, u, i):
        """Return the stored mean rating, ignoring both ``u`` and ``i``."""
        return self.the_mean