# Collaborative Filtering Notebook with `surprise` (demo)

## Model Training

These lines loads the ratings and convert it to `surprise.Dataset` object.

In [5]:
from surprise import Dataset, NormalPredictor, Reader
import pandas as pd
import numpy as np
ratings_df = pd.read_csv(
    filepath_or_buffer="../Data/ratings.csv",
    dtype={
        "user_id": "Int32",
        "book_id": "Int32",
        "rating": "Int8"
    }
)

Subset the data

In [8]:
# Get unique user IDs
unique_user_ids = ratings_df['user_id'].unique()

# Randomly select 500 user IDs
random_user_ids = np.random.choice(unique_user_ids, size=500, replace=False)


# Create a new DataFrame containing only the rows with the randomly selected user IDs
subset_ratings_df = ratings_df[ratings_df['user_id'].isin(random_user_ids)]

# Print the first few rows of the subset DataFrame
print(subset_ratings_df.head())

# Optional: Print the shape of the subset DataFrame to verify the number of rows
print(subset_ratings_df.shape)

ratings_sdata = Dataset.load_from_df(
    df=subset_ratings_df,
    reader=Reader(rating_scale=(1, 5))
)

      user_id  book_id  rating
1829      123        5       4
1830      123       26       5
1831      123        4       3
1832      123       58       3
1833      123       32       4
(55943, 3)


## Suprise Alt Approach


In [27]:
book_pivot = subset_ratings_df.pivot_table(columns='user_id',index='book_id',values='rating')
book_pivot.shape
book_pivot.fillna(0,inplace=True)

from scipy.sparse import csr_matrix
book_sparse=csr_matrix(book_pivot.astype(float))
type(book_sparse)

scipy.sparse._csr.csr_matrix

In [28]:
book_pivot

user_id,123,216,281,283,567,821,823,905,1011,1041,...,52701,52758,52822,53002,53022,53060,53063,53364,53392,53413
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,5.0,5.0,4.0,0.0,5.0,0.0,0.0,4.0,...,0.0,5.0,0.0,4.0,4.0,4.0,0.0,0.0,0.0,0.0
2,0.0,4.0,0.0,4.0,0.0,3.0,5.0,5.0,4.0,0.0,...,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
3,5.0,0.0,0.0,3.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,4.0,0.0,3.0,0.0,5.0,0.0,5.0,0.0,0.0,...,0.0,5.0,0.0,5.0,4.0,2.0,0.0,4.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,5.0,4.0,3.0,0.0,0.0,...,0.0,3.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


sklearn nearestneighbors

In [23]:
from sklearn.neighbors import NearestNeighbors
model=NearestNeighbors(algorithm='brute') ## model

model.fit(book_sparse)

In [24]:
distances,suggestions = model.kneighbors(book_pivot.iloc[1,:].values.reshape(1,-1))

In [25]:
distances

array([[ 0.        , 46.08687449, 47.2546294 , 47.6235236 , 48.14561247]])

In [26]:
suggestions

array([[ 1, 17, 22, 23, 20]])

In [29]:
for i in range(len(suggestions)):
    print(book_pivot.index[suggestions[i]])

Index([2, 18, 23, 24, 21], dtype='Int32', name='book_id')


We can choose a similarity metric for CF, given by the `surprise.similarities` module.

In [14]:
from surprise import KNNBasic
sim_options = {
    "name": "cosine", # options: cosine, msd, pearson, pearson_baseline
    "user_based": True,  # False=CF on item; True=CF on user
    "shrinkage": 0, # takes effect if "name" set to pearson_baseline, can prevent overfit
    "min_support": 1, # if num of common ratings is less than this, truncates to 0, reduces user-item matrix density
}
algo = KNNBasic(sim_options=sim_options) # other algo options: https://surprise.readthedocs.io/en/stable/prediction_algorithms_package.html

## Top-N Recommendations

In [11]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


Get the top-n recommendations

In [15]:
# First train an KNNBasic algorithm on the movielens dataset.
data = ratings_sdata
trainset = data.build_full_trainset()
algo.fit(trainset)

# Than predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

Computing the cosine similarity matrix...
Done computing similarity matrix.
123 [6285, 7603, 6277, 7872, 7377, 9626, 8880, 4228, 6089, 8239]
283 [7603, 6277, 7872, 7377, 9626, 8880, 4228, 6089, 8239, 5865]
823 [6285, 7603, 6277, 7872, 4228, 6089, 8239, 5865, 9995, 5804]
905 [6285, 7603, 7377, 9626, 8880, 6089, 8239, 5865, 9995, 4727]
1011 [6277, 7872, 7377, 9626, 8880, 4228, 8239, 5865, 9995, 5804]
1041 [6285, 7603, 6277, 7872, 7377, 9626, 8880, 4228, 8239, 5865]
1421 [6285, 7603, 7377, 9626, 8880, 4228, 8239, 5865, 2235, 4728]
1274 [6285, 6277, 2727, 7377, 9626, 8880, 1886, 4228, 1969, 8239]
1981 [7603, 6277, 7872, 6089, 8239, 5865, 9995, 5804, 4727, 8390]
2007 [7603, 6277, 7872, 7377, 9626, 8880, 4228, 6089, 8239, 5865]
2294 [6285, 7603, 7662, 6277, 4120, 2727, 6701, 7377, 9626, 8880]
2498 [6285, 7603, 6277, 7872, 7377, 9626, 8880, 6089, 9995, 5804]
1876 [6285, 6277, 7872, 7377, 9626, 8880, 4228, 1969, 6089, 8239]
2764 [6285, 7603, 6277, 7872, 7377, 9626, 8880, 4228, 6089, 8239]
3014

To train, we need to convert a `surprise.Dataset` object to a `surprise.Trainset` object.\
Suppose we want to train on the whole dataset:

In [10]:
ratings_strain = ratings_sdata.build_full_trainset()

Combining `algo` and `ratings_strain`:

In [60]:
import time
start_time = time.time()
algo.fit(ratings_strain)
print(f"Ellapsed time: {time.time() - start_time} seconds")

Computing the cosine similarity matrix...
Done computing similarity matrix.
Ellapsed time: 43.53758215904236 seconds


## Compute Predictions, Training Loss

One can test the train loss over itself. Build first a trainset from the testset, make prediction, then plug in accuracy metrics\
The prediction step will take a big while if predicting over the entire training set (6 million ratings available).

In [80]:
ratings_stest = ratings_strain.build_testset()
start_time = time.time()
predictions = algo.test(ratings_stest)
print(f"Ellapsed time: {time.time() - start_time} seconds")

Ellapsed time: 1157.4575111865997 seconds


A peak to `predictions`:

In [100]:
print(len(predictions))
for x in predictions[12345:12348]:
    print(x)

5976479
user: 219        item: 4338       r_ui = 4.00   est = 4.15   {'actual_k': 40, 'was_impossible': False}
user: 219        item: 190        r_ui = 3.00   est = 4.05   {'actual_k': 40, 'was_impossible': False}
user: 219        item: 4629       r_ui = 3.00   est = 4.03   {'actual_k': 40, 'was_impossible': False}


where `r_ui` is true rating and `est` is estimated rating.

Training losses can be computed by `surprise.accuracy`'s functions; here the all four options are listed.

In [101]:
from surprise.accuracy import rmse, mse, mae, fcp
start_time = time.time()
for n, f in [
    ("Root mean square error", rmse),
    ("Mean squared error", mse),
    ("Mean absolute error", mae),
    ("Fraction of concordant paris", fcp)
]:
    print(f"{n}: {f(predictions, verbose=False)}")
print(f"Ellapsed time evaluating losses: {time.time() - start_time} seconds")

Root mean square error: 0.7923006406161057
Mean squared error: 0.6277403051206915
Mean absolute error: 0.6057735404799971
Fraction of concordant paris: 0.8216831080054721
Ellapsed time evaluating losses: 63.667232513427734 seconds


## Query KNN (for items)

We trained the model with `user_based=False`, so we can try to query similar book items (may compare this to the corresponding section of the `bert` notebook). Let us look at the most similar books for the most popular ones:

In [116]:
books_top_3_ids_df = ratings_df\
                        .groupby("book_id", as_index=False)\
                        .agg(count=("user_id", "count"))\
                        .head(3)
books_top_3_ids_df

Unnamed: 0,book_id,count
0,1,22806
1,2,21850
2,3,16931


Book infos retrieved and joined from other dataset:

In [132]:
books_df = pd.read_csv(
    filepath_or_buffer="../Data/Raw/books_enriched.csv"
)[["book_id", "authors", "title"]]
books_df.merge(
    right=books_top_3_ids_df,
    left_on=["book_id"],
    right_on=["book_id"]
)

Unnamed: 0,book_id,authors,title,count
0,1,['Suzanne Collins'],"The Hunger Games (The Hunger Games, #1)",22806
1,2,"['J.K. Rowling', 'Mary GrandPré']",Harry Potter and the Sorcerer's Stone (Harry P...,21850
2,3,['Stephenie Meyer'],"Twilight (Twilight, #1)",16931


then we can call `algo.get_neighbors()`.

In [145]:
books_top_3_ids = books_top_3_ids_df["book_id"].to_list()
reccomend_top_3_for_top_3 = [algo.get_neighbors(book_id, k=3) for book_id in books_top_3_ids]
reccomend_top_3_for_top_3

[[10, 57, 60], [11, 157, 246], [10, 11, 29]]

Inferring reccomendations book titles:

In [150]:
books_id_name_dict = dict(zip(books_df.book_id, books_df.title))
for book_id, rec_ids in zip(books_top_3_ids, reccomend_top_3_for_top_3):
    print(f"Top three titles for {books_id_name_dict[book_id]}:")
    print(f"{[books_id_name_dict[rec_id] for rec_id in rec_ids]}")

Top three titles for The Hunger Games (The Hunger Games, #1):
['Pride and Prejudice', 'The Secret Life of Bees', 'The Curious Incident of the Dog in the Night-Time']
Top three titles for Harry Potter and the Sorcerer's Stone (Harry Potter, #1):
['The Kite Runner', 'Green Eggs and Ham', 'Marked (House of Night, #1)']
Top three titles for Twilight (Twilight, #1):
['Pride and Prejudice', 'The Kite Runner', 'Romeo and Juliet']


Seems to be a bit irrelevant.

## Dump Results

`surprise` provides a `dump` module for dumping and loading models.\
For our model size, dumping would take up a long time.

In [84]:
surprise.dump.dump(
    "../Data/Dump/cf_knnbasic_all.dump",
    predictions=predictions,
    algo=algo,
    verbose=1
)

The dump has been saved as file ../Data/Dump/cf_knnbasic_all.dump


Dump files can be retrieved by `dump.load(filename)`.