Collaborative Filtering
* Item-Item 
* Centered Cosine Similarity
* Top-k

https://www.ethanrosenthal.com/2015/11/02/intro-to-collaborative-filtering/

## Import Data 

In [1]:
%matplotlib inline

import pandas as pd

r = pd.read_csv( 'ratings.csv' )

In [2]:
n_users = r.user_id.unique().shape[0]
n_books = r.book_id.unique().shape[0]
print(str(n_users) + ' users')
print(str(n_books) + ' books')

53424 users
10000 books


## Construct Matrix

In [3]:
import numpy as np

# id starts from 1, but python starts at 0
ratings = np.zeros((n_books, n_users))
for row in r.itertuples():
    ratings[row[2]-1, row[1]-1] = row[3]
    
ratings

array([[0., 0., 0., ..., 4., 4., 4.],
       [0., 5., 0., ..., 5., 5., 5.],
       [0., 0., 0., ..., 0., 0., 4.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [4]:
def get_sparsity(ratings):
    sparsity = float(len(ratings.nonzero()[0]))
    sparsity /= (ratings.shape[0] * ratings.shape[1])
    sparsity *= 100
    return sparsity

In [5]:
print(f'Sparsity: {get_sparsity(ratings)}%')

Sparsity: 1.118688042827194%


## Only keep top 1000 books

In [6]:
N = 1000
book_index = np.argpartition(np.count_nonzero(ratings, axis=1), -N)[-N:]
print(len(book_index))

1000


In [7]:
ratings = ratings[book_index]

In [8]:
book_stat = np.count_nonzero(ratings, axis=1)
print(f'min:  {book_stat.min()}\nmean: {book_stat.mean()}\nmax:  {book_stat.max()}')

min:  1183
mean: 3179.65
max:  22806


In [9]:
print(f'Sparsity: {get_sparsity(ratings)}%')

Sparsity: 5.951725816112609%


## Only keep top 10000 users

In [10]:
N = 10000
user_index = np.argpartition(np.count_nonzero(ratings, axis=0), -N)[-N:]
print(len(user_index))

10000


In [11]:
ratings = ratings[:, user_index]
ratings.shape

(1000, 10000)

In [12]:
user_stat = np.count_nonzero(ratings, axis=0)
print(f'min:  {user_stat.min()}\nmean: {user_stat.mean()}\nmax:  {user_stat.max()}')

min:  82
mean: 98.3166
max:  177


In [13]:
print(f'Sparsity: {get_sparsity(ratings)}%')

Sparsity: 9.831660000000001%


## Get Train and Test Set

Removing 10 ratings per item from the training set and placing them in the test set.

In [14]:
ratings.nonzero()

(array([  0,   0,   0, ..., 999, 999, 999], dtype=int64),
 array([  78,  124,  192, ..., 9996, 9997, 9999], dtype=int64))

In [15]:
def train_test_split(ratings):
    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for book in range(ratings.shape[0]):
        test_ratings = np.random.choice(ratings[book, :].nonzero()[0], 
                                        size=10, 
                                        replace=False)
        train[book, test_ratings] = 0
        test[book, test_ratings] = ratings[book, test_ratings]
        
    # Test and training are truly disjoint
    assert(np.all((train * test) == 0)) 
    return train, test

train, test = train_test_split(ratings)

In [16]:
train.shape

(1000, 10000)

## Center Matrix

In [17]:
train_centered = train.copy()

def center_row(row):
    index = row.nonzero()
    avg = row[index].mean()
    row[index] = row[index] - avg
    
np.apply_along_axis(center_row, 1, train_centered)

train_centered

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.38411371,  0.61588629, -0.38411371, ...,  0.        ,
        -0.38411371,  0.61588629],
       [-0.00765957, -0.00765957, -1.00765957, ...,  0.99234043,
         0.        ,  0.99234043],
       [ 0.7206637 ,  0.7206637 , -1.2793363 , ..., -0.2793363 ,
         0.        , -0.2793363 ]])

## Calculate Similarity Matrix

In [18]:
def similarity(ratings, kind='item'):
    if kind == 'item':
        sim = ratings.dot(ratings.T)
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

item_similarity = similarity(train_centered, kind='item')
print(item_similarity[:4, :4])

[[ 1.00000000e+00  1.65940674e-05 -6.09128257e-03  2.88873422e-04]
 [ 1.65940674e-05  1.00000000e+00  4.83012144e-02  0.00000000e+00]
 [-6.09128257e-03  4.83012144e-02  1.00000000e+00  0.00000000e+00]
 [ 2.88873422e-04  0.00000000e+00  0.00000000e+00  1.00000000e+00]]


## Calculate MSE Error

In [19]:
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    # Ignore nonzero terms
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

## Predict Result (Slow because of for loop)

In [28]:
np.where(train[:, 0] != 0)[0]

array([ 74,  79,  98, 123, 156, 171, 263, 375, 409, 422, 437, 491, 507,
       516, 531, 536, 551, 569, 573, 594, 673, 676, 681, 704, 706, 728,
       733, 734, 754, 762, 766, 781, 797, 798, 803, 808, 828, 848, 849,
       850, 852, 854, 855, 856, 866, 874, 876, 889, 893, 910, 914, 921,
       932, 933, 939, 943, 953, 954, 955, 960, 962, 965, 966, 967, 969,
       970, 975, 976, 978, 980, 983, 984, 988, 989, 991, 992, 994, 995,
       996, 997, 998, 999], dtype=int64)

In [55]:
def predict(item_similarity, train, K=20):
    pred = np.zeros(train.shape)
    for i in range(train.shape[0]):
        for j in range(train.shape[1]):
            # Already rated by user
            if (train[i][j] != 0):
                pred[i][j] = train[i][j]
                continue
            # Note rated by user
            user_nonzero = np.where(train[:, j] != 0)[0]
            similarity = item_similarity[i][user_nonzero]
            topK = np.argpartition(similarity, -K)[-K:]
            pred[i][j] = np.dot(similarity[topK], train[user_nonzero[topK], j]) / similarity[topK].sum()
    return pred

In [56]:
result = predict(item_similarity, train)

In [57]:
result[0]

array([3.65770144, 5.        , 3.54236848, ..., 3.46014591, 3.89518618,
       4.65031711])

## Calculate MSE

In [59]:
print('Top-k Item-based CF MSE: ' + str(get_mse(result, test)))

Top-k Item-based CF MSE: 0.6989419335900491
