Collaborative Filtering
* Item-Item 
* Centered Cosine Similarity
* Top-k

https://www.ethanrosenthal.com/2015/11/02/intro-to-collaborative-filtering/

## Import Data 

In [1]:
%matplotlib inline

import pandas as pd

# Load the raw ratings table. Later cells read its `user_id` and `book_id`
# columns by name and take the rating value from the third column by position.
r = pd.read_csv( 'ratings.csv' )

In [2]:
# Number of distinct ids in each column; these become the matrix dimensions.
n_users = len(r['user_id'].unique())
n_books = len(r['book_id'].unique())
print(f'{n_users} users')
print(f'{n_books} books')

53424 users
10000 books


## Construct Matrix

In [3]:
import numpy as np

# Build a dense (book x user) rating matrix in which 0 means "not rated".
# Ids in the file start at 1, so subtract 1 to obtain 0-based positions.
book_pos = r['book_id'].values - 1
user_pos = r['user_id'].values - 1

ratings = np.zeros((n_books, n_users))
# One vectorised scatter instead of a Python-level loop over every CSV row.
# The rating value is taken from the third column by position, exactly as the
# row-by-row version did; if a (book, user) pair repeats, the last row wins.
ratings[book_pos, user_pos] = r.iloc[:, 2].values

ratings

array([[0., 0., 0., ..., 4., 4., 4.],
       [0., 5., 0., ..., 5., 5., 5.],
       [0., 0., 0., ..., 0., 0., 4.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [4]:
def get_sparsity(ratings):
    """Return the percentage of entries in ``ratings`` that are non-zero.

    Despite its name, the value is the fill rate (density) of the array:
    100.0 means every entry is non-zero and 0.0 means every entry is zero.

    Parameters
    ----------
    ratings : numpy.ndarray
        Array of any dimensionality in which 0 marks a missing rating.

    Returns
    -------
    float
        ``100 * (number of non-zero entries) / (total number of entries)``;
        0.0 for an array with no entries at all.
    """
    total = ratings.size
    if total == 0:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    return np.count_nonzero(ratings) / total * 100

In [5]:
# get_sparsity returns the percentage of non-zero entries, so the printed
# number is the fill rate of the full matrix (higher means denser).
print(f'Sparsity: {get_sparsity(ratings)}%')

Sparsity: 1.118688042827194%


## Only keep the 1000 most-rated books

In [6]:
# Number of books to keep.
N = 1000
# Count the non-zero entries in each row (one row per book) and take the
# indices of the N rows with the highest counts. argpartition only guarantees
# that the last N positions hold the N largest counts; they are not sorted.
book_index = np.argpartition(np.count_nonzero(ratings, axis=1), -N)[-N:]
print(len(book_index))

1000


In [7]:
# Keep only the selected book rows. NOTE: this overwrites `ratings`, so the cell
# is not idempotent -- re-running it alone would apply indices computed on the
# full matrix to the already-reduced one.
ratings = ratings[book_index]

In [8]:
# Number of non-zero ratings in each remaining book row.
book_stat = np.count_nonzero(ratings, axis=1)
print(
    f'min:  {book_stat.min()}',
    f'mean: {book_stat.mean()}',
    f'max:  {book_stat.max()}',
    sep='\n',
)

min:  1183
mean: 3179.65
max:  22806


In [9]:
# Fill rate (percentage of non-zero entries) after restricting to the kept books.
print(f'Sparsity: {get_sparsity(ratings)}%')

Sparsity: 5.951725816112609%


## Only keep the 10000 users with the most ratings

In [10]:
# Number of users to keep (reuses the name N from the book-selection cell).
N = 10000
# Count the non-zero entries in each column (one column per user) of the
# current `ratings` matrix and take the indices of the N columns with the
# highest counts. The indices come back unordered from argpartition.
user_index = np.argpartition(np.count_nonzero(ratings, axis=0), -N)[-N:]
print(len(user_index))

10000


In [11]:
# Keep only the selected user columns. NOTE: this overwrites `ratings`, so
# re-running the cell alone would apply `user_index` to the reduced matrix.
ratings = ratings[:, user_index]
ratings.shape

(1000, 10000)

In [12]:
# Number of non-zero ratings in each remaining user column.
user_stat = np.count_nonzero(ratings, axis=0)
print(
    f'min:  {user_stat.min()}',
    f'mean: {user_stat.mean()}',
    f'max:  {user_stat.max()}',
    sep='\n',
)

min:  82
mean: 98.3166
max:  177


In [13]:
# Fill rate (percentage of non-zero entries) after restricting to the kept users.
print(f'Sparsity: {get_sparsity(ratings)}%')

Sparsity: 9.831660000000001%


## Get Train and Test Set

Removing 10 ratings per item from the training set and placing them in the test set.

In [14]:
# Inspect the (row indices, column indices) of every non-zero rating.
ratings.nonzero()

(array([  0,   0,   0, ..., 999, 999, 999]),
 array([  78,  124,  192, ..., 9996, 9997, 9999]))

In [15]:
def train_test_split(ratings, n_test=10, seed=None):
    """Hold out ``n_test`` non-zero ratings from every row as a test set.

    Parameters
    ----------
    ratings : numpy.ndarray
        2-D rating matrix in which 0 marks a missing rating. Ratings are held
        out per row.
    n_test : int, optional
        Number of ratings moved from each row into the test matrix
        (default 10).
    seed : int or None, optional
        Seed for a private ``numpy.random.RandomState`` so the split can be
        reproduced. With ``None`` (the default) numpy's global random state is
        used, so ``np.random.seed(...)`` set by the caller is still honoured.

    Returns
    -------
    (train, test) : tuple of numpy.ndarray
        Two arrays shaped like ``ratings``. ``test`` contains only the held-out
        ratings; ``train`` is a copy of ``ratings`` with those entries zeroed.

    Raises
    ------
    ValueError
        If a row has fewer than ``n_test`` non-zero ratings.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for book in range(ratings.shape[0]):
        rated = ratings[book, :].nonzero()[0]
        if rated.size < n_test:
            # Fail with an actionable message instead of numpy's generic
            # "cannot take a larger sample than population" error.
            raise ValueError(
                f'row {book} has only {rated.size} ratings; '
                f'cannot hold out {n_test}'
            )
        held_out = rng.choice(rated, size=n_test, replace=False)
        train[book, held_out] = 0
        test[book, held_out] = ratings[book, held_out]

    # Train and test must not share any rated entry.
    assert np.all((train * test) == 0)
    return train, test

# The split draws random entries and no seed is set in the visible cells, so
# `train` and `test` (and every number derived from them) differ between runs.
train, test = train_test_split(ratings)

In [16]:
# Sanity check: display the dimensions of the training matrix.
train.shape

(1000, 10000)

## Center Matrix

In [17]:
# Work on a copy so that `train` keeps the uncentred ratings.
train_centered = train.copy()

def center_row(row):
    """Subtract the mean of the non-zero entries from those entries, in place.

    Zero entries are left untouched. A row with no non-zero entries is
    returned unchanged, which avoids taking the mean of an empty slice
    (numpy would emit a RuntimeWarning and produce NaN).

    Parameters
    ----------
    row : numpy.ndarray
        1-D array, modified in place.

    Returns
    -------
    numpy.ndarray
        The same ``row`` object, for convenience.
    """
    index = row.nonzero()
    if index[0].size == 0:
        return row
    avg = row[index].mean()
    row[index] = row[index] - avg
    return row
    
# Centre each row in place. Iterating over a 2-D array yields views of its
# rows, so center_row's in-place update writes straight into train_centered.
# (np.apply_along_axis is meant for collecting return values; it was being used
# here only for that side effect and built a throwaway result array.)
for book_row in train_centered:
    center_row(book_row)

train_centered

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.38344482,  0.61655518, -0.38344482, ...,  0.        ,
        -0.38344482,  0.61655518],
       [-0.00787234, -0.00787234, -1.00787234, ...,  0.99212766,
         0.        ,  0.99212766],
       [ 0.71912419,  0.71912419, -1.28087581, ..., -0.28087581,
         0.        , -0.28087581]])

## Calculate Similarity Matrix

In [18]:
def similarity(ratings, kind='item'):
    """Cosine similarity between the rows or the columns of ``ratings``.

    Parameters
    ----------
    ratings : numpy.ndarray
        2-D matrix. In this notebook rows are items and columns are users.
    kind : {'item', 'user'}, optional
        ``'item'`` (default) compares rows with each other; ``'user'``
        compares columns with each other.

    Returns
    -------
    numpy.ndarray
        Square matrix whose ``[a, b]`` entry is the cosine of the angle
        between vectors ``a`` and ``b``. An all-zero vector has similarity 0
        with everything (including itself) rather than NaN.

    Raises
    ------
    ValueError
        If ``kind`` is neither ``'item'`` nor ``'user'``.
    """
    if kind == 'item':
        sim = ratings.dot(ratings.T)
    elif kind == 'user':
        sim = ratings.T.dot(ratings)
    else:
        raise ValueError(f"kind must be 'item' or 'user', got {kind!r}")

    # The diagonal of the Gram matrix holds the squared vector norms.
    norms = np.sqrt(np.diagonal(sim))
    # A zero-norm vector has an all-zero row and column in `sim`; dividing by 1
    # keeps those entries at 0 instead of producing 0/0 = NaN.
    norms = np.where(norms > 0, norms, 1.0)
    return sim / norms[np.newaxis, :] / norms[:, np.newaxis]

# Row-by-row (item-item) cosine similarity of the centred training matrix;
# print the top-left 4x4 corner as a quick look.
item_similarity = similarity(train_centered, kind='item')
print(item_similarity[:4, :4])

[[ 1.00000000e+00  7.00436084e-06 -6.17233148e-03 -1.48636761e-04]
 [ 7.00436084e-06  1.00000000e+00  5.96565906e-02  0.00000000e+00]
 [-6.17233148e-03  5.96565906e-02  1.00000000e+00  0.00000000e+00]
 [-1.48636761e-04  0.00000000e+00  0.00000000e+00  1.00000000e+00]]


## Calculate MSE Error

In [19]:
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    """Mean squared error over the entries where ``actual`` is non-zero.

    Parameters
    ----------
    pred : numpy.ndarray
        Predicted ratings, same shape as ``actual``.
    actual : numpy.ndarray
        Reference ratings in which 0 marks "no rating"; those positions are
        excluded from the error.

    Returns
    -------
    float
        Value returned by ``sklearn.metrics.mean_squared_error``.
    """
    # Score only the positions that carry a real rating (non-zero in `actual`).
    # Indexing with the nonzero() tuple already yields 1-D arrays.
    rated = actual.nonzero()
    # scikit-learn's signature is (y_true, y_pred); MSE is symmetric, so the
    # value is the same either way, but the conventional order reads correctly.
    return mean_squared_error(actual[rated], pred[rated])

## Predict Result (Slow because of for loop)

In [25]:
# Indices of the four largest entries in the first row of `train` (unordered).
np.argpartition(train[0], -4)[-4:]

array([4351, 2682, 9394, 6757])

In [26]:
# Look up one of the indices shown by the previous cell. NOTE: 4351 is
# hard-coded from that output and depends on the random train/test split.
train[0][4351]

5.0

In [20]:
def predict(item_similarity, k, train, ratings_bool):
    """Top-k item-based prediction for every (item, user) pair.

    For user ``j`` only the items that user has rated (``ratings_bool[:, j]``)
    are candidate neighbours. For each target item the ``k`` candidates with
    the highest similarity are kept, and the prediction is the
    similarity-weighted average of the user's ratings on those neighbours.

    NOTE(review): the original cell did not parse -- it had an empty parameter
    slot in second position, no colon, and an unindented body. The missing
    parameter is assumed to be the neighbourhood size ``k``; confirm.

    Parameters
    ----------
    item_similarity : numpy.ndarray
        Square (n_items, n_items) similarity matrix.
    k : int
        Maximum number of neighbours per prediction; must be at least 1.
    train : numpy.ndarray
        (n_items, n_users) rating matrix.
    ratings_bool : numpy.ndarray
        Boolean (n_items, n_users) mask, True where ``train`` holds a rating.

    Returns
    -------
    numpy.ndarray
        (n_items, n_users) predictions. An entry is 0 when the user has no
        rated items or when all selected similarities are 0.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f'k must be at least 1, got {k}')

    n_items, n_users = train.shape
    item_rows = np.arange(n_items)[:, np.newaxis]
    pred = np.zeros((n_items, n_users))

    for j in range(n_users):
        rated = np.flatnonzero(ratings_bool[:, j])
        if rated.size == 0:
            continue

        # Similarity of every item to each item this user rated, with the
        # user's ratings broadcast alongside: both are (n_items, n_rated).
        sims = item_similarity[:, rated]
        user_ratings = np.broadcast_to(train[rated, j], sims.shape)

        if rated.size > k:
            # Per target item, keep only the k most similar rated items.
            top = np.argpartition(sims, -k, axis=1)[:, -k:]
            sims = sims[item_rows, top]
            user_ratings = user_ratings[item_rows, top]

        # Normalise by the absolute weights so negative similarities cannot
        # cancel the denominator.
        weight_sum = np.abs(sims).sum(axis=1)
        weighted = (sims * user_ratings).sum(axis=1)
        pred[:, j] = np.divide(weighted, weight_sum,
                               out=np.zeros(n_items),
                               where=weight_sum > 0)
    return pred

(1000, 10000)

## Predict Result (Wrong because unrated entries are treated as ratings)

### Get Order Array

### Get Boolean Array where False for order < K and True for order >= K

### Set similarity term to 0 if order >= K

### Divide each row by sum of similarities

### Multiply TopK_item_similarity and train

## Calculate MSE

In [None]:
# NOTE(review): `pred` is not assigned in any visible cell, so this cell raises
# NameError on a fresh kernel until the prediction step is filled in.
print('Top-k Item-based CF MSE: ' + str(get_mse(pred, test)))

In [None]:
# Mean of the non-zero predictions in the first row.
# NOTE(review): `pred` is not assigned in any visible cell.
pred[0].sum()/np.count_nonzero(pred[0])

In [None]:
# Mean of the non-zero training ratings in the first row -- presumably shown
# for comparison with the prediction mean in the previous cell.
train[0].sum()/np.count_nonzero(train[0])