Collaborative Filtering
* Item-Item 
* Centered Cosine Similarity
* Top-k

https://www.ethanrosenthal.com/2015/11/02/intro-to-collaborative-filtering/

## Get Train and Test Set

For each row, randomly hold out 20% of its nonzero ratings as the test set.

In [1]:
import numpy as np

In [2]:
def train_test_split(ratings):
    """Split a ratings matrix into train and test matrices of the same shape.

    For every row, 20% (rounded down) of the nonzero entries are drawn at
    random without replacement. The drawn entries are zeroed in the train
    copy and copied into the otherwise all-zero test matrix, so the two
    outputs never share a nonzero position.

    Args:
        ratings: 2-D NumPy array in which 0 marks a missing rating.

    Returns:
        A ``(train, test)`` tuple; ``ratings`` itself is left untouched.
    """
    train = ratings.copy()
    test = np.zeros(ratings.shape)
    n_rows = ratings.shape[0]

    for row_idx in range(n_rows):
        rated_cols = ratings[row_idx, :].nonzero()[0]
        n_held_out = int(len(rated_cols) * 0.2)
        held_out = np.random.choice(rated_cols, size=n_held_out, replace=False)

        # Move the sampled ratings from the train copy into the test matrix.
        test[row_idx, held_out] = ratings[row_idx, held_out]
        train[row_idx, held_out] = 0

    return train, test

## Center Matrix

In [3]:
def center_row(row):
    """Subtract the mean of a row's nonzero entries from those entries.

    Zeros are treated as "no rating": they are excluded from the mean and
    stay at zero. A rated entry that equals the mean also becomes zero.

    Args:
        row: 1-D NumPy array. It is modified in place. An integer-dtype
            row truncates the centered values on assignment, so pass a
            float array.

    Returns:
        The same ``row`` object, after centering.
    """
    index = row.nonzero()[0]
    if index.size == 0:
        # Nothing is rated: skip the mean of an empty slice, which would
        # emit a RuntimeWarning and produce NaN.
        return row

    row[index] = row[index] - row[index].mean()
    return row

## Calculate Similarity Matrix

In [4]:
def similarity(ratings, kind='item', center=True):
    """Compute the (optionally centered) cosine similarity between rows.

    Args:
        ratings: 2-D NumPy array whose rows are the entities being compared;
            0 marks a missing rating.
        kind: Only ``'item'`` is implemented; anything else raises.
        center: When true, each row's mean over its nonzero entries is
            subtracted from those entries before the cosine is taken.
            Missing entries stay at zero. ``ratings`` is not modified.

    Returns:
        A square float array ``sim`` where ``sim[a, b]`` is the cosine of
        rows ``a`` and ``b``. A row whose (centered) vector is all zeros has
        similarity 0 with every row, itself included, instead of NaN.

    Raises:
        ValueError: If ``kind`` is not ``'item'``.
    """
    if kind != 'item':
        raise ValueError(f"unsupported kind {kind!r}; only 'item' is implemented")

    ratings = np.asarray(ratings)

    if center:
        rated = ratings != 0
        counts = rated.sum(axis=1, keepdims=True)
        totals = np.where(rated, ratings, 0).sum(axis=1, keepdims=True)
        # max(count, 1) keeps rows without ratings from dividing by zero;
        # their total is 0, so their mean is 0 as well.
        means = totals / np.maximum(counts, 1)
        ratings_centered = np.where(rated, ratings - means, 0.0)
    else:
        ratings_centered = ratings

    # The centered matrix must be the one multiplied; using the raw matrix
    # here would silently ignore ``center``.
    sim = ratings_centered.dot(ratings_centered.T)

    norms = np.array([np.sqrt(np.diagonal(sim))])
    # A zero-norm row has an all-zero row/column in ``sim``; dividing by 1
    # leaves those entries at 0 instead of producing 0/0 = NaN.
    safe_norms = np.where(norms > 0, norms, 1.0)

    return sim / safe_norms / safe_norms.T

## Calculate RMSE (Root Mean Squared Error)

In [5]:
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    """Return the root mean squared error over the observed ratings.

    Only positions where ``actual`` is nonzero are scored; zeros in
    ``actual`` mean "no rating" and are ignored. Despite the name, the
    result is the *root* mean squared error.

    The value is computed with NumPy directly, so it does not depend on
    the ``squared`` keyword of ``sklearn.metrics.mean_squared_error``,
    which newer scikit-learn releases deprecated and then removed.

    Args:
        pred: Array of predicted ratings, same shape as ``actual``.
        actual: Array of true ratings, 0 where unknown.

    Returns:
        The RMSE as a NumPy float.

    Raises:
        ValueError: If ``actual`` has no nonzero entries.
    """
    pred = np.asarray(pred, dtype=float)
    actual = np.asarray(actual, dtype=float)

    # Ignore zero (unrated) terms.
    observed = actual.nonzero()
    if observed[0].size == 0:
        raise ValueError("actual contains no nonzero entries to evaluate")

    errors = pred[observed] - actual[observed]
    return np.sqrt(np.mean(errors ** 2))

## Calculate Recall, Precision and Coverage

In [6]:
def get_recall_precision(prediction, actual, threshold=4):
    """Compute mean per-row recall and precision of thresholded predictions.

    Only positions where ``actual`` is nonzero are evaluated. Within a row:

    * an entry is *recommended* when its prediction is ``>= threshold``;
    * an entry is *relevant* when its actual rating is ``>= threshold``;
    * a *hit* is a position that is both.

    Hits are matched by position, so two hits with the same predicted value
    count as two.

    Args:
        prediction: 2-D array of predicted ratings.
        actual: 2-D array of true ratings, same shape, 0 where unknown.
        threshold: Minimum rating counted as recommended / relevant.

    Returns:
        ``(recall, precision)``. Recall is ``hits / relevant`` averaged over
        rows with at least one relevant entry. Precision is
        ``hits / recommended`` averaged over rows with at least one
        recommended entry. Either value is 0.0 when no row qualifies.
        Neither input is modified.
    """
    prediction = np.asarray(prediction)
    actual = np.asarray(actual)

    rated = actual != 0
    recommended = (prediction >= threshold) & rated
    relevant = (actual >= threshold) & rated

    hits = (recommended & relevant).sum(axis=1)
    n_recommended = recommended.sum(axis=1)
    n_relevant = relevant.sum(axis=1)

    # Rows with an empty denominator are left out of the respective average.
    has_relevant = n_relevant > 0
    has_recommended = n_recommended > 0

    recall = 0.0
    if has_relevant.any():
        recall = float(np.mean(hits[has_relevant] / n_relevant[has_relevant]))

    precision = 0.0
    if has_recommended.any():
        precision = float(
            np.mean(hits[has_recommended] / n_recommended[has_recommended])
        )

    return recall, precision

## Predict Result

In [7]:
def predict(item_similarity, train, K=40):
    """Fill in missing ratings with a top-K item-based weighted average.

    ``train[i, j]`` is the rating of item ``i`` by user ``j`` (0 = missing).
    Existing ratings are copied through unchanged. A missing rating is
    predicted from the items user ``j`` has rated: the ``K`` with the
    highest similarity to item ``i`` are kept, and their ratings are
    averaged with the similarities as weights.

    The weighted sum is normalized by the sum of *absolute* similarities.
    Negative weights, which centered cosine similarity produces, therefore
    cannot cancel the denominator to zero or flip its sign.

    Args:
        item_similarity: Square array of item-item similarities.
        train: 2-D ratings array, items along rows and users along columns.
        K: Maximum number of neighbours to use. If the user rated fewer
            than ``K`` items, all of them are used.

    Returns:
        A float array shaped like ``train``. Cells that cannot be predicted
        are left at 0: the user has no ratings, ``K`` is not positive, or
        the neighbour weights sum to zero or are NaN.
    """
    n_items, n_users = train.shape
    pred = np.zeros(train.shape)

    # The set of items a user has rated does not depend on the target item,
    # so look it up once per user instead of once per (item, user) cell.
    rated_by_user = [np.where(train[:, j] != 0)[0] for j in range(n_users)]

    for i in range(n_items):
        print(f'progress {i} {i / train.shape[0] * 100}%', end='\r')
        for j in range(n_users):
            # Already rated by user
            if train[i, j] != 0:
                pred[i, j] = train[i, j]
                continue

            # Not rated by user
            rated_items = rated_by_user[j]
            # argpartition raises if asked for more elements than exist.
            k = min(K, rated_items.size)
            if k <= 0:
                continue

            sims = item_similarity[i, rated_items]
            top = np.argpartition(sims, -k)[-k:]
            weights = sims[top]

            denom = np.abs(weights).sum()
            if denom > 0:
                pred[i, j] = np.dot(weights, train[rated_items[top], j]) / denom

    return pred