Collaborative Filtering
* Item-Item 
* Centered Cosine Similarity
* Top-k

https://www.ethanrosenthal.com/2015/11/02/intro-to-collaborative-filtering/

## Import Data 

In [2]:
%matplotlib inline

import pandas as pd

r = pd.read_csv( 'ratings.csv' )

In [2]:
n_users = r.user_id.unique().shape[0]
n_books = r.book_id.unique().shape[0]
print(str(n_users) + ' users')
print(str(n_books) + ' books')

53424 users
10000 books


## Construct Matrix

In [3]:
import numpy as np

# id starts from 1, but python starts at 0
ratings = np.zeros((n_books, n_users))
for row in r.itertuples():
    ratings[row[2]-1, row[1]-1] = row[3]
    
ratings

array([[0., 0., 0., ..., 4., 4., 4.],
       [0., 5., 0., ..., 5., 5., 5.],
       [0., 0., 0., ..., 0., 0., 4.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [4]:
sparsity = float(len(ratings.nonzero()[0]))
sparsity /= (ratings.shape[0] * ratings.shape[1])
sparsity *= 100

print(f'Sparsity: {sparsity}%')

Sparsity: 1.118688042827194%


## Remove Item with Less Than 100 Reviews

In [1]:
book_index = np.where(np.count_nonzero(ratings, axis=1) >= 100)
print(len(book_index[0]))
book_index

NameError: name 'np' is not defined

In [10]:
ratings = ratings[book_index]
ratings

array([[0., 0., 0., ..., 4., 4., 4.],
       [0., 5., 0., ..., 5., 5., 5.],
       [0., 0., 0., ..., 0., 0., 4.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [11]:
sparsity = float(len(ratings.nonzero()[0]))
sparsity /= (ratings.shape[0] * ratings.shape[1])
sparsity *= 100

print(f'Sparsity: {sparsity}%')

Sparsity: 1.1682326329542163%


## Remove User with Less Than 100 Reviews

## Get Train and Test Set

Removing 10 ratings per item from the training set and placing them in the test set.

In [12]:
def train_test_split(ratings):
    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for book in range(ratings.shape[0]):
        test_ratings = np.random.choice(ratings[book, :].nonzero()[0], 
                                        size=10, 
                                        replace=False)
        train[book, test_ratings] = 0
        test[book, test_ratings] = ratings[book, test_ratings]
        
    # Test and training are truly disjoint
    assert(np.all((train * test) == 0)) 
    return train, test

train, test = train_test_split(ratings)

In [13]:
train.shape

(9511, 53424)

## Center Matrix

In [14]:
train_centered = train.copy()

def center_row(row):
    index = row.nonzero()
    avg = row[index].mean()
    row[index] = row[index] - avg
    
np.apply_along_axis(center_row, 1, train_centered)

train_centered

array([[ 0.        ,  0.        ,  0.        , ..., -0.27961046,
        -0.27961046, -0.27961046],
       [ 0.        ,  0.6485348 ,  0.        , ...,  0.6485348 ,
         0.6485348 ,  0.6485348 ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.78576916],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

## Calculate Similarity Matrix

In [15]:
def similarity(ratings, kind='item'):
    if kind == 'item':
        sim = ratings.dot(ratings.T)
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

item_similarity = similarity(train_centered, kind='item')
print(item_similarity[:4, :4])

[[1.         0.15735119 0.18613949 0.03205175]
 [0.15735119 1.         0.0825957  0.05801356]
 [0.18613949 0.0825957  1.         0.01468526]
 [0.03205175 0.05801356 0.01468526 1.        ]]


## Calculate MSE Error

In [16]:
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    # Ignore nonzero terms
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

## Predict Result

### Get Order Array

In [24]:
item_similarity[0][:5]

array([1.        , 0.15735119, 0.18613949, 0.03205175, 0.03537167])

In [23]:
order = (-item_similarity).argsort(axis=1).argsort(axis=1)
order[0][:5]

array([  0,   5,   3, 229, 174])

### Get Boolean Array where False for order < K and True for order >= K

In [50]:
K = 40
order = order >= K
order[0][:5]

array([False, False, False, False, False])

### Set similarity term to 0 if order >= K

In [51]:
print(f'order has shape {order.shape} item_similarity has shape{item_similarity.shape}')

order has shape (9511, 9511) item_similarity has shape(9511, 9511)


In [52]:
TopK_item_similarity = item_similarity.copy()
TopK_item_similarity[order] = 0
TopK_item_similarity[0][:5]

array([1.        , 0.15735119, 0.18613949, 0.03205175, 0.03537167])

### Divide each row by sum of similarities

In [53]:
TopK_item_similarity = TopK_item_similarity / TopK_item_similarity.sum(axis=1, keepdims=True)
TopK_item_similarity[0][:5]

array([0.01265834, 0.0019918 , 0.00235622, 0.00040572, 0.00044775])

In [54]:
TopK_item_similarity.sum(axis=1)

array([1., 1., 1., ..., 1., 1., 1.])

### Multiply TopK_item_similarity and train

In [55]:
TopK_item_similarity.shape

(9511, 9511)

In [56]:
train.shape

(9511, 53424)

In [57]:
pred = np.matmul(TopK_item_similarity, train)
pred

array([[0.13392613, 0.10934208, 0.02818714, ..., 0.30714447, 0.18320987,
        0.33258676],
       [0.13328426, 0.26875125, 0.03381595, ..., 0.38512052, 0.30174229,
        0.41186553],
       [0.11280134, 0.09083479, 0.02379928, ..., 0.19150582, 0.10588441,
        0.33460785],
       ...,
       [0.07708182, 0.07702455, 0.02854496, ..., 0.09202084, 0.09461585,
        0.13009529],
       [0.08780229, 0.12356559, 0.02316259, ..., 0.10651537, 0.06732333,
        0.11883065],
       [0.09372853, 0.0907222 , 0.02844619, ..., 0.09867859, 0.04558335,
        0.13145354]])

In [58]:
print('Top-k Item-based CF MSE: ' + str(get_mse(pred, test)))

Top-k Item-based CF MSE: 13.505511551189256


In [62]:
test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])