Collaborative Filtering
* Item-Item 
* Centered Cosine Similarity
* Top-k

https://www.ethanrosenthal.com/2015/11/02/intro-to-collaborative-filtering/

## Import Data 

In [1]:
%matplotlib inline

import pandas as pd

# Load the raw ratings table; later cells read its `user_id` and `book_id` columns.
r = pd.read_csv( 'ratings.csv' )

In [2]:
# Count the distinct users and books present in the ratings table.
n_users = len(r.user_id.unique())
n_books = len(r.book_id.unique())
print(f'{n_users} users')
print(f'{n_books} books')

53424 users
10000 books


## Construct Matrix

In [3]:
import numpy as np

# Ids in the file are 1-based while array positions are 0-based, so every id
# is shifted down by one when it is used as a matrix coordinate.
ratings = np.zeros((n_books, n_users))
for entry in r.itertuples(index=False):
    # NOTE(review): assumes the first three columns are user id, book id and
    # rating, in that order — confirm against ratings.csv.
    user_pos = entry[0] - 1
    book_pos = entry[1] - 1
    ratings[book_pos, user_pos] = entry[2]

ratings

array([[0., 0., 0., ..., 4., 4., 4.],
       [0., 5., 0., ..., 5., 5., 5.],
       [0., 0., 0., ..., 0., 0., 4.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [4]:
# Percentage of matrix cells that hold a rating (i.e. are non-zero).
sparsity = float(np.count_nonzero(ratings)) / ratings.size * 100

print(f'Sparsity: {sparsity}%')

Sparsity: 1.118688042827194%


## Remove Items with Fewer Than 100 Ratings

In [5]:
# Row positions (in the unfiltered matrix) of items with at least 100 non-zero ratings.
origin_index = np.nonzero((ratings != 0).sum(axis=1) >= 100)
origin_index

(array([   0,    1,    2, ..., 9997, 9998, 9999]),)

In [6]:
# Keep only the selected rows; `ratings` now refers to the filtered copy.
ratings = ratings[origin_index[0], :]
ratings

array([[0., 0., 0., ..., 4., 4., 4.],
       [0., 5., 0., ..., 5., 5., 5.],
       [0., 0., 0., ..., 0., 0., 4.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [7]:
# Percentage of matrix cells that hold a rating (i.e. are non-zero), after filtering.
sparsity = float(np.count_nonzero(ratings)) / ratings.size * 100

print(f'Sparsity: {sparsity}%')

Sparsity: 1.1682326329542163%


## Get Train and Test Set

Hold out 10 randomly chosen ratings per item: they are removed from the training set and placed in the test set.

In [8]:
def train_test_split(ratings, n_test=10, seed=None):
    """Hold out a fixed number of ratings per row for testing.

    For every row of ``ratings``, ``n_test`` of its non-zero entries are drawn
    at random without replacement. Those entries are zeroed in the training
    copy and copied into an otherwise all-zero test matrix. ``ratings`` itself
    is not modified.

    Parameters
    ----------
    ratings : numpy.ndarray
        2-D rating matrix in which 0 marks a missing rating.
    n_test : int, default 10
        Number of ratings to move to the test set for each row.
    seed : int, numpy.random.Generator or None, default None
        Passed to ``numpy.random.default_rng``. Give a value to make the
        split reproducible across kernel restarts; ``None`` draws fresh
        entropy on every call.

    Returns
    -------
    (train, test) : tuple of numpy.ndarray
        Two arrays shaped like ``ratings`` whose non-zero entries are
        disjoint and together reproduce ``ratings``.

    Raises
    ------
    ValueError
        If some row has fewer than ``n_test`` non-zero ratings.
    """
    rng = np.random.default_rng(seed)
    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for book in range(ratings.shape[0]):
        rated = ratings[book, :].nonzero()[0]
        if rated.size < n_test:
            # Fail with the offending row instead of numpy's generic
            # "larger sample than population" message.
            raise ValueError(
                f'row {book} has only {rated.size} ratings; '
                f'cannot hold out {n_test}'
            )
        held_out = rng.choice(rated, size=n_test, replace=False)
        train[book, held_out] = 0
        test[book, held_out] = ratings[book, held_out]

    # Test and training are truly disjoint
    assert np.all((train * test) == 0)
    return train, test

# Split once up front: `train` feeds the centering and prediction cells, `test` is held back for scoring.
train, test = train_test_split(ratings)

In [9]:
# Sanity check of the layout: (items kept after filtering, users).
train.shape

(9511, 53424)

## Center Matrix

In [10]:
# Center a copy so that `train` itself keeps the raw ratings.
train_centered = train.copy()

def center_row(row):
    """Subtract the mean of the rated entries from ``row``, in place.

    Only the non-zero entries take part: their mean is computed and removed
    from those same positions, while zero entries (treated as missing
    ratings) are left at 0.

    Parameters
    ----------
    row : numpy.ndarray
        1-D array of ratings; it is modified in place.

    Returns
    -------
    numpy.ndarray
        The same ``row`` object, so the function can also be used where a
        return value is expected (e.g. ``np.apply_along_axis``).
    """
    index = row.nonzero()
    if index[0].size == 0:
        # Nothing is rated: skip the mean of an empty selection, which would
        # only emit a RuntimeWarning and produce NaN.
        return row
    avg = row[index].mean()
    row[index] = row[index] - avg
    return row
    
# Center every row in place. Iterating over a 2-D array yields views of its
# rows, so the writes made by center_row land directly in `train_centered`.
# (np.apply_along_axis is meant for collecting return values, not for
# in-place side effects.)
for item_row in train_centered:
    center_row(item_row)

train_centered

array([[ 0.        ,  0.        ,  0.        , ..., -0.27982979,
        -0.27982979, -0.27982979],
       [ 0.        ,  0.64876374,  0.        , ...,  0.64876374,
         0.64876374,  0.64876374],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.78582826],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

## Calculate Similarity Matrix

In [11]:
def similarity(ratings, kind='item'):
    """Cosine similarity between the rows or the columns of ``ratings``.

    The matrix of dot products is divided by the norm of each vector along
    both axes. The norms are read off the diagonal of that matrix, since a
    vector's dot product with itself is its squared norm.

    Parameters
    ----------
    ratings : numpy.ndarray
        2-D array. In this notebook rows are items and columns are users.
    kind : {'item', 'user'}, default 'item'
        ``'item'`` compares rows with each other, ``'user'`` compares columns.

    Returns
    -------
    numpy.ndarray
        Square, symmetric matrix of pairwise similarities. A vector that is
        all zeros has norm 0, so its similarities come out as NaN.

    Raises
    ------
    ValueError
        If ``kind`` is neither ``'item'`` nor ``'user'``.
    """
    if kind == 'item':
        sim = ratings.dot(ratings.T)
    elif kind == 'user':
        sim = ratings.T.dot(ratings)
    else:
        raise ValueError(f"kind must be 'item' or 'user', got {kind!r}")
    # Shape (1, n) so that dividing by `norms` scales columns and dividing by
    # `norms.T` scales rows.
    norms = np.sqrt(np.diagonal(sim))[np.newaxis, :]
    return sim / norms / norms.T

# Build the item-item similarity matrix from the centered ratings and peek at its top-left corner.
item_similarity = similarity(train_centered, kind='item')
print(item_similarity[:4, :4])

[[1.         0.15714238 0.18577988 0.03230397]
 [0.15714238 1.         0.08314043 0.05775278]
 [0.18577988 0.08314043 1.         0.01496041]
 [0.03230397 0.05775278 0.01496041 1.        ]]


## Calculate Mean Squared Error (MSE)

In [12]:
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    """Mean squared error over the positions that have an actual rating.

    Parameters
    ----------
    pred : numpy.ndarray
        Predicted ratings, same shape as ``actual``.
    actual : numpy.ndarray
        Reference ratings; positions equal to 0 are excluded from the score.

    Returns
    -------
    float
        Mean squared error between ``pred`` and ``actual`` at the non-zero
        positions of ``actual``.
    """
    # Keep only the non-zero entries of `actual`; the zero entries are ignored.
    rated = actual.nonzero()
    # Indexing with the nonzero() tuple already yields 1-D arrays.
    # Arguments follow sklearn's (y_true, y_pred) order.
    return mean_squared_error(actual[rated], pred[rated])

## Predict Result

In [13]:
def predict_topk(ratings, similarity, kind='item', k=10):
    """Predict ratings as a similarity-weighted average over the top-k neighbours.

    For each item (``kind='item'``) or user (``kind='user'``) the ``k`` entries
    with the highest similarity are selected. The prediction is the sum of the
    neighbours' ratings weighted by similarity, divided by the sum of those
    similarities.

    Parameters
    ----------
    ratings : numpy.ndarray
        2-D array with items on rows and users on columns.
    similarity : numpy.ndarray
        Square similarity matrix: item-item for ``kind='item'``, user-user for
        ``kind='user'``.
    kind : {'item', 'user'}, default 'item'
        Which axis the neighbours are taken from.
    k : int, default 10
        Number of neighbours. If ``k`` exceeds the number of candidates, all
        of them are used.

    Returns
    -------
    numpy.ndarray
        Predicted ratings, same shape as ``ratings``.

    Raises
    ------
    ValueError
        If ``kind`` is neither ``'item'`` nor ``'user'``.
    """
    if kind not in ('item', 'user'):
        raise ValueError(f"kind must be 'item' or 'user', got {kind!r}")

    pred = np.zeros(ratings.shape)
    if kind == 'item':
        for i in range(ratings.shape[0]):
            # argsort is ascending, so the reversed tail slice gives the k
            # highest similarities, most similar first. A plain index array is
            # used here; wrapping it in a list changes how numpy interprets it.
            top_k = np.argsort(similarity[:, i])[:-k-1:-1]
            weights = similarity[i, top_k]
            # One vector-matrix product fills the whole row, instead of one
            # Python-level dot product per user.
            pred[i, :] = weights.dot(ratings[top_k, :]) / np.sum(weights)
    else:
        for u in range(ratings.shape[1]):
            top_k = np.argsort(similarity[:, u])[:-k-1:-1]
            weights = similarity[u, top_k]
            pred[:, u] = ratings[:, top_k].dot(weights) / np.sum(weights)
    return pred

In [None]:
# Predict from the raw (uncentered) training ratings and score against the held-out test ratings.
pred = predict_topk(train, item_similarity, kind='item', k=10)
print('Top-k Item-based CF MSE: ' + str(get_mse(pred, test)))

  
  


# NOTE(review): the lines below look like unfinished scratch work — nothing
# visible in this notebook uses `top_k_items` or `ratings_top_k` afterwards.
# argsort is ascending, so the most similar items sit at the END of each row.
top_k_items = np.argsort(item_similarity, axis=1)
top_k_items

ratings_top_k = train.copy()