# Collaborative Filtering from Scratch Abbreviated - Item Based Version

In [1]:
import os
import pickle
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.model_selection import KFold, train_test_split
from sklearn.neighbors import NearestNeighbors

## 1. Utilities for Shifting Ratings

In [2]:
def dilate(X: sp.sparse.csc_matrix, amount: float) -> None:
    """
    Shift every stored entry of X down by `amount`.

    Only the stored values are touched (X.data is rebound to the shifted
    array); the sparsity pattern of X is left as it is. Typical use: pass a
    baseline rating as `amount` to re-centre ratings around it.
    """
    offset = np.float64(amount)
    shifted_values = X.data - offset
    X.data = shifted_values

def value_remap(X: sp.sparse.csc_matrix, remapper: np.ndarray) -> None:
    """
    Replace each stored value of X through a lookup table.

    A stored value v is truncated to an integer index i and replaced by
    remapper[i]; the result is stored back as float64 in X.data.
    `remapper` must be a numpy array (a plain list cannot be indexed by an
    index array).
    """
    lookup_positions = X.data.astype(int)
    remapped_values = remapper[lookup_positions]
    X.data = remapped_values.astype(np.float64)

def make_mean_0(X: sp.sparse.csc_matrix) -> None:
    """
    Centre every column of X on zero, in place.

    The mean of each column is taken by get_true_mean and subtracted from
    the stored entries of that column; X.data is updated in place, so it
    must already hold a floating-point dtype.
    """
    per_entry_column_mean = get_true_mean(X)
    X.data -= per_entry_column_mean

def get_true_mean(X: sp.sparse.csc_matrix) -> np.ndarray:
    """
    Return, for every stored entry of X, the mean of that entry's column.

    The column mean is taken over the nonzero entries of the column only
    (not over the full column length). The result holds ONE VALUE PER
    STORED ENTRY, ordered like `X.tocoo().col`; it is not a per-column
    vector. make_mean_0 subtracts it from X.data directly.

    X itself is not mutated. Columns without any nonzero entry get mean 0.
    """
    coo = X.tocoo()
    indexer = coo.col
    col_sums = np.array(X.sum(axis=0)).flatten()
    # Count the true nonzeros per column. Clipping values with minimum(1)
    # and summing only counts correctly when every stored value is >= 1;
    # it breaks on negative or fractional data (e.g. already shifted ratings).
    col_counts = np.bincount(
        indexer,
        weights=(coo.data != 0).astype(np.float64),
        minlength=X.shape[1],
    )
    # Guard against division by zero for columns with no nonzero entry.
    return col_sums[indexer] / np.maximum(col_counts[indexer], 1.0)

def test_mean_is_0(X: sp.sparse.csc_matrix, tol=10 ** (-10)) -> np.bool_:
    """
    Check that every column of X sums to (numerically) zero.

    Returns True when the largest absolute column sum is below `tol`.
    Expected to hold for any matrix that went through make_mean_0.
    """
    column_sums = np.array(X.sum(axis=0))
    largest_deviation = np.abs(column_sums).max()
    return largest_deviation < tol

## 2. Utilities for Top Similarity Scores and Neighbors

In [3]:
def get_top_neigh_dist_ind(
    ratings_train_csc_modified,
    ratings_test_csc_modified,
    n_neighbors=5
):
    """
    Find, for each test column, its closest training columns under the
    cosine metric.

    Both matrices are transposed before being handed to NearestNeighbors,
    so the columns of the inputs act as the samples. Returns a pair
    (similarities, indices): the similarities are `1 - cosine distance`
    (despite the "dist" in the function name), and the indices refer to
    columns of the training matrix.
    """
    train_samples = ratings_train_csc_modified.transpose()
    query_samples = ratings_test_csc_modified.transpose()
    model = NearestNeighbors(metric="cosine").fit(train_samples)
    cosine_dist, neighbor_idx = model.kneighbors(
        X=query_samples,
        n_neighbors=n_neighbors
    )
    similarities = 1 - cosine_dist
    return similarities, neighbor_idx

## 3. Utilities for Ratings Prediction

In [4]:
def update_numer_denom(
    numer_test_csc_ratings,
    denom_test_csc_ratings,
    ratings_test_csc,
    ratings_train_csc,
    ratings_train_mean,
    n_items,
    n_users_test,
    pos,  # position of which neighbor to update
    neigh_ind=None,
    sim_scores=None
):
    """
    Add the contribution of the `pos`-th nearest neighbor to the running
    numerator / denominator of the weighted-average prediction.

    For each test column i, with j = neigh_ind[i, pos] and
    s = sim_scores[i, pos], every stored entry (row r) of test column i
    receives:
        numerator   += w * (train[r, j] - ratings_train_mean[j])
        denominator += w
    where w = s * min(1, train[r, j]), i.e. the neighbor only counts on
    rows where it has a rating.

    This function mutates `numer_test_csc_ratings.data` and
    `denom_test_csc_ratings.data` in place. Both are assumed to share the
    sparsity layout of `ratings_test_csc` (the visible caller builds them
    as copies of it).

    Parameters
    ----------
    ratings_train_mean : indexed by training column, so it must hold one
        mean per training column.
    n_items : unused; kept so existing call sites keep working.
    n_users_test : number of leading test columns to process.
    pos : which neighbor (column of neigh_ind / sim_scores) to apply.
    neigh_ind, sim_scores : neighbor indices and similarity scores, as
        returned by get_top_neigh_dist_ind. When omitted, the module-level
        variables of the same names are used, which is what this function
        historically relied on (a missing global now raises KeyError).
    """
    # Legacy fallback: older call sites do not pass these and expect the
    # notebook-level globals to be picked up.
    if neigh_ind is None:
        neigh_ind = globals()["neigh_ind"]
    if sim_scores is None:
        sim_scores = globals()["sim_scores"]

    indptr = ratings_test_csc.indptr
    for i in range(n_users_test):
        # Stored entries of test column i live in data[lo:hi].
        lo, hi = indptr[i], indptr[i + 1]
        baseline_items = ratings_test_csc.indices[lo:hi]
        j = neigh_ind[i, pos]
        sim_score = sim_scores[i, pos]
        # Ratings of neighbor j on the rows rated in test column i
        # (0 where the neighbor has no rating).
        r_v = ratings_train_csc[:, j].toarray().ravel()[baseline_items]
        mu_v = ratings_train_mean[j]
        # min(1, r_v) zeroes the weight where the neighbor has no rating.
        w = sim_score * np.minimum(1, r_v)
        numer_test_csc_ratings.data[lo:hi] += w * (r_v - mu_v)
        denom_test_csc_ratings.data[lo:hi] += w

In [5]:
def update_prediction(
    ratings_test_csc_predicted,
    ratings_test_mean,
    numer_test_csc_ratings,
    denom_test_csc_ratings
):
    """
    Recompute the predicted ratings from the running numerator and
    denominator.

    Each stored entry becomes
        ratings_test_mean[column of the entry] + numerator / denominator,
    with the denominator floored at 1e-30 to avoid dividing by zero.
    `ratings_test_mean` is indexed by column, so it must hold one value
    per test column. Overwrites `ratings_test_csc_predicted.data`.
    """
    entry_columns = ratings_test_csc_predicted.tocoo().col
    safe_denominator = np.maximum(
        denom_test_csc_ratings.data,
        10 ** (-30)
    )
    adjustment = numer_test_csc_ratings.data / safe_denominator
    ratings_test_csc_predicted.data = ratings_test_mean[entry_columns] + adjustment

## 4. Utilities for RMSE, MAE

In [6]:
def eval_error(ratings_diffs: np.ndarray, sense:str="RMSE"):
    """
    Score an array of prediction errors (predicted minus actual).

    `sense` selects the measure: "RMSE" (root mean squared error) or
    "MAE" (mean absolute error). Any other value raises
    NotImplementedError.
    """
    norm_order_by_sense = {"RMSE": 2, "MAE": 1}
    try:
        order = norm_order_by_sense[sense]
    except (KeyError, TypeError):
        raise NotImplementedError from None
    n_samples = ratings_diffs.shape[0]
    # ||d||_p / n^(1/p): p=2 gives RMSE, p=1 gives MAE.
    total = np.linalg.norm(ratings_diffs, order)
    return total / n_samples ** (1 / order)

We can probably do better than this.

## 5. Collaborative Filtering - Putting It All Together

Let us be clear about which model configurations we want to try:
1. Number of folds for CV - we fix this to be 10 for our task.
2. Metrics to use - we shift ratings by 0 (ordinary cosine), 2.5, 2.75, 2.9, or 3; use adjusted cosine (each column is centred on its own mean); map ratings 1-2 to -1 and 3-5 to 1; or map ratings 1, 2, 3, 4, 5 to -1, -0.5, 0.25, 0.5, 1.
3. Number of neighbors to use - we use 1-200 (the execution time grows at least linearly with the maximum, here 200).

The evaluation pipeline is as follows.

In [8]:
%%time
results = []


def _column_means(X):
    """Return one mean per column of csc matrix X, over its stored entries."""
    col_sums = np.asarray(X.sum(axis=0)).ravel()
    col_counts = np.diff(X.indptr)
    # Columns without any stored entry get mean 0 instead of a 0/0.
    return col_sums / np.maximum(col_counts, 1)


# specify metadata
metric_names = [
    "cosine", *[f"deduct {amount}" for amount in [2.5, 2.75, 2.9, 3]],
    "adjusted cosine",
    "remap 12->-1, 345->1",
    "remap 1,5->-+1, 24->-+0.5, 3->0.25"
]
# each entry mutates a csc matrix in place; order matches metric_names
metric_funcs = [
    lambda x: dilate(x, 0),
    lambda x: dilate(x, 2.5),
    lambda x: dilate(x, 2.75),
    lambda x: dilate(x, 2.9),
    lambda x: dilate(x, 3),
    make_mean_0,
    lambda x: value_remap(x, np.array([0, -1, -1, 1, 1, 1])),
    lambda x: value_remap(x, np.array([0, -1, -0.5, 0.25, 0.5, 1]))
]
n_neighbors = 200

# transform to csc matrix
csv_dir = "/home/zebalgebra/School/DVA/The-Last-Book-Bender/Data/Raw/"
ratings_all_df = pd.read_csv(
    os.path.join(csv_dir, "ratings.csv")
)
# since we do item-based, book_id and user_id are switched (books are columns)
ratings_all_csc = sp.sparse.csc_matrix(
    (
        ratings_all_df["rating"],
        (
            ratings_all_df["user_id"],
            ratings_all_df["book_id"]
        )
    )
)

# start k-fold
kf = KFold(n_splits=10, shuffle=True, random_state=6242)
# NOTE: despite the "user" naming below, the folds split the COLUMNS of the
# matrix, which are books in this item-based version. The column count is
# taken from the matrix instead of being hard-coded.
user_ids = np.arange(ratings_all_csc.shape[1])
for (fold, (user_ids_train, user_ids_test)) in enumerate(kf.split(user_ids)):
    print(f"fold={fold} started.")
    fold_time_start = time.time()
    ratings_train_csc = ratings_all_csc[:, user_ids_train]
    ratings_test_csc = ratings_all_csc[:, user_ids_test]
    # update_numer_denom / update_prediction index these means BY COLUMN,
    # so they need one value per column. get_true_mean returns one value
    # per stored entry, which would silently pick the wrong means here.
    ratings_train_mean = _column_means(ratings_train_csc)
    ratings_test_mean = _column_means(ratings_test_csc)
    n_items = ratings_test_csc.shape[0]
    n_users_test = len(user_ids_test)
    for metric_name, metric_func in zip(metric_names, metric_funcs):
        print(f" - fold={fold}, metric='{metric_name}' started.")
        # modify train, test csc matrix
        ratings_train_csc_modified = ratings_train_csc.copy().astype(np.float64)
        ratings_test_csc_modified = ratings_test_csc.copy().astype(np.float64)
        metric_func(ratings_train_csc_modified)
        metric_func(ratings_test_csc_modified)
        # get top neighbors and scores
        # (update_numer_denom reads sim_scores / neigh_ind from this scope)
        sim_scores, neigh_ind = get_top_neigh_dist_ind(
            ratings_train_csc_modified,
            ratings_test_csc_modified,
            n_neighbors=n_neighbors
        )
        # generate ratings
        ratings_test_csc_predicted = ratings_test_csc.copy()
        ratings_test_csc_predicted.data = np.zeros(len(ratings_test_csc.data))
        numer_test_csc_ratings = ratings_test_csc_predicted.copy()
        denom_test_csc_ratings = ratings_test_csc_predicted.copy()
        # add neighbors one at a time so that the error for every
        # neighborhood size 1..n_neighbors comes out of a single pass
        for pos in range(n_neighbors):
            update_numer_denom(
                numer_test_csc_ratings,
                denom_test_csc_ratings,
                ratings_test_csc,
                ratings_train_csc,
                ratings_train_mean,
                n_items,
                n_users_test,
                pos
            )
            update_prediction(
                ratings_test_csc_predicted,
                ratings_test_mean,
                numer_test_csc_ratings,
                denom_test_csc_ratings
            )
            ratings_diffs = ratings_test_csc_predicted.data - ratings_test_csc.data
            # evaluate errors
            rmse = eval_error(ratings_diffs, "RMSE")
            mae = eval_error(ratings_diffs, "MAE")
            # append to result
            results.append(
                {
                    "fold": fold,
                    "metric_name": metric_name,
                    "n_neighbors": pos + 1,
                    "rmse": rmse,
                    "mae": mae
                }
            )
    print(f"fold={fold} ended. used {time.time()-fold_time_start} seconds.")
    # checkpoint: `results` is cumulative, so each file holds folds 0..fold
    with open(f'checkpoint_fold_0-{fold}_item_based.pkl', 'wb') as f:
        pickle.dump(results, f)

fold=0 started.
 - fold=0, metric='cosine' started.
 - fold=0, metric='deduct 2.5' started.
 - fold=0, metric='deduct 2.75' started.
 - fold=0, metric='deduct 2.9' started.
 - fold=0, metric='deduct 3' started.
 - fold=0, metric='adjusted cosine' started.
 - fold=0, metric='remap 12->-1, 345->1' started.
 - fold=0, metric='remap 1,5->-+1, 24->-+0.5, 3->0.25' started.
fold=0 ended. used 848.5853264331818 seconds.
fold=1 started.
 - fold=1, metric='cosine' started.
 - fold=1, metric='deduct 2.5' started.
 - fold=1, metric='deduct 2.75' started.
 - fold=1, metric='deduct 2.9' started.
 - fold=1, metric='deduct 3' started.
 - fold=1, metric='adjusted cosine' started.
 - fold=1, metric='remap 12->-1, 345->1' started.
 - fold=1, metric='remap 1,5->-+1, 24->-+0.5, 3->0.25' started.
fold=1 ended. used 888.3474657535553 seconds.
fold=2 started.
 - fold=2, metric='cosine' started.
 - fold=2, metric='deduct 2.5' started.
 - fold=2, metric='deduct 2.75' started.
 - fold=2, metric='deduct 2.9' star

The testing results are saved in `checkpoint_fold_0-0_item_based.pkl` through `checkpoint_fold_0-9_item_based.pkl` (one cumulative checkpoint per fold; the 10 folds are numbered 0-9).

The plots for RMSE and MAE are in `CF-CV-Results_item_based.ipynb`.