# Item-based collaborative filtering

Here, I'm going to compute item similarities using only user interactions.

In effect, similarity is based on unknown, latent features of each user and item. These features represent qualities of users and items that make users likely or unlikely to interact with items.

## Memory-based approach

This method does not fit a model or reduce dimensionality, so it does not scale well to large datasets. Similarity is computed using the scikit-learn `cosine_similarity` function.

In [12]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
# When True, later cells load the precomputed matrices from
# data/03-collaborative-filtering/ instead of recomputing them.
# Set to False to rebuild the matrices and write them back to disk.
USE_SAVED_DATA = True

In [14]:
from pathlib import Path

import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.preprocessing import LabelEncoder

from pipeliner.recommendations.recommender import SimilarityRecommenderNP
from pipeliner.recommendations.transformer import (
    SimilarityTransformerNP,
    UserItemMatrixTransformerNP,
)

# Display floats in DataFrame output with thousands separators and two decimals.
pd.options.display.float_format = "{:,.2f}".format

In [15]:
# Load the prepared (user, item, rating) table; ids are read as strings and
# ratings as 64-bit floats.
data_types = {
    "user_id": str,
    "item_id": str,
    "rating": np.float64,
}
user_item_ratings = pd.read_csv(
    "./data/usable_user_item_ratings_prepared.csv.gz", compression="gzip", dtype=data_types
)

# Sanity check: no (user, item) pair may appear more than once.
pair_counts = user_item_ratings.groupby(["user_id", "item_id"]).size()
assert pair_counts.max() == 1

print(user_item_ratings.shape)
user_item_ratings.head(3)

(1522154, 3)


Unnamed: 0,user_id,item_id,rating
0,U000003,I00037925,0.61
1,U000003,I00189384,0.61
2,U000003,I00256366,0.61


In [16]:
# Build a smaller dataset for the memory-based recommender.
# First cap each user at MAX_INTERACTIONS_PER_USER rows, then keep only the
# first SAMPLE_SIZE rows of the capped table (a head slice, not a random
# sample, so the last user included may be cut off part-way).
MAX_INTERACTIONS_PER_USER = 500
SAMPLE_SIZE = 50_000

user_item_ratings_sample = (
    user_item_ratings.groupby("user_id")
    .head(MAX_INTERACTIONS_PER_USER)
    .reset_index(drop=True)
    .iloc[:SAMPLE_SIZE]
    .reset_index(drop=True)
)

print(user_item_ratings_sample.shape)
user_item_ratings_sample.head(3)

(50000, 3)


Unnamed: 0,user_id,item_id,rating
0,U000003,I00037925,0.61
1,U000003,I00189384,0.61
2,U000003,I00256366,0.61


In [17]:
# Replace the string user/item ids with integer codes, one encoder per column.
# NOTE: this overwrites the id columns of user_item_ratings_sample in place,
# so run it exactly once after the sampling cell; re-running it would re-fit
# the encoders on the already-encoded integers.
user_sample_encoder = LabelEncoder()
user_item_ratings_sample["user_id"] = user_sample_encoder.fit_transform(
    user_item_ratings_sample["user_id"]
)
# position i in this Series holds the original id that was encoded as i
unique_sample_users = pd.Series(user_sample_encoder.classes_)

item_sample_encoder = LabelEncoder()
user_item_ratings_sample["item_id"] = item_sample_encoder.fit_transform(
    user_item_ratings_sample["item_id"]
)
unique_sample_items = pd.Series(item_sample_encoder.classes_)

print(len(unique_sample_users), len(unique_sample_items))
user_item_ratings_sample.head(3)

1334 45021


Unnamed: 0,user_id,item_id,rating
0,0,2546,0.61
1,0,10854,0.61
2,0,13742,0.61


In [18]:
# Build the user/item matrix from the encoded sample, or load the saved copy.
if not USE_SAVED_DATA:
    # create the user/item matrix from the (user, item, rating) rows
    user_item_matrix_transformer = UserItemMatrixTransformerNP()
    user_item_matrix_sample = user_item_matrix_transformer.transform(
        user_item_ratings_sample.to_numpy(),
    )
else:
    user_item_matrix_sample = sp.load_npz("data/03-collaborative-filtering/user_item_matrix_sample.npz")

# Check that every rating in the sample is stored at [user, item] in the matrix.
# This also guards against a saved matrix that no longer matches the sample.
users = user_item_ratings_sample["user_id"].to_numpy().astype(int)
items = user_item_ratings_sample["item_id"].to_numpy().astype(int)
ratings = user_item_ratings_sample["rating"].to_numpy().astype(np.float32)

# One vectorised lookup instead of one scalar sparse lookup per rating.
# Depending on the container type the result is a sparse object, an
# np.matrix of shape (1, n) or a plain ndarray; normalise all of them to 1-D.
stored_ratings = user_item_matrix_sample[users, items]
if sp.issparse(stored_ratings):
    stored_ratings = stored_ratings.toarray()
stored_ratings = np.asarray(stored_ratings).ravel()

mismatches = np.flatnonzero(stored_ratings != ratings)
assert mismatches.size == 0, (
    f"{mismatches.size} ratings differ between the matrix and the sample; "
    f"first mismatch at user={users[mismatches[0]]}, item={items[mismatches[0]]}"
)

print(user_item_matrix_sample.shape)

(1334, 45021)


In [19]:
# Compute the user/user and item/item similarity matrices, or load saved copies.
if not USE_SAVED_DATA:
    similarity_matrix_transformer = SimilarityTransformerNP()
    # rows of the user/item matrix are users -> user/user similarities
    user_similarity_matrix_sample = similarity_matrix_transformer.transform(
        user_item_matrix_sample
    )
    # transposing puts items on the rows -> item/item similarities
    item_similarity_matrix_sample = similarity_matrix_transformer.transform(
        user_item_matrix_sample.T
    )
else:
    user_similarity_matrix_sample = sp.load_npz("data/03-collaborative-filtering/user_similarity_matrix_sample.npz")
    item_similarity_matrix_sample = sp.load_npz("data/03-collaborative-filtering/item_similarity_matrix_sample.npz")

user_similarity_matrix_sample.shape, item_similarity_matrix_sample.shape

((1334, 1334), (45021, 45021))

In [20]:
# Persist freshly computed matrices so later runs can set USE_SAVED_DATA = True.
if not USE_SAVED_DATA:
    output_dir = Path("data/03-collaborative-filtering")
    # save_npz does not create missing directories, so make sure the target exists
    output_dir.mkdir(parents=True, exist_ok=True)

    matrices_to_save = {
        "user_item_matrix_sample.npz": user_item_matrix_sample,
        "user_similarity_matrix_sample.npz": user_similarity_matrix_sample,
        "item_similarity_matrix_sample.npz": item_similarity_matrix_sample,
    }
    for file_name, matrix in matrices_to_save.items():
        sp.save_npz(str(output_dir / file_name), matrix, compressed=True)

Generate item recommendations

In [22]:
# Recommend items similar to one known item, using the item/item similarity matrix.
QUERY_ITEM_ID = "I00037925"

# the recommender works on encoded ids, so translate the raw id first
item_idx = item_sample_encoder.transform([QUERY_ITEM_ID])

# NOTE(review): 10 is presumably the number of items to recommend (the cell
# returns ten results) — confirm against SimilarityRecommenderNP.
similarity_recommender = SimilarityRecommenderNP(10)
similarity_recommender.fit(item_similarity_matrix_sample)
recommendations = similarity_recommender.predict(item_idx)

# only one item was queried, so decode just the first row back to raw item ids
results = item_sample_encoder.inverse_transform(recommendations[0]).tolist()

# dense score row for the queried item
probabilities = similarity_recommender.predict_proba(item_idx).toarray()[0]

results, probabilities

(['I00189384',
  'I00267268',
  'I00298191',
  'I00316072',
  'I00318138',
  'I00590125',
  'I00758300',
  'I00846427',
  'I00891970',
  'I00944931'],
 array([0., 0., 0., ..., 0., 0., 0.], shape=(45021,)))