In [None]:
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from mf import MatrixFactorization
import scipy.sparse
%matplotlib inline

In [None]:
def recall_at_k(k: int, topk: np.ndarray, actual: scipy.sparse.csc_matrix):
    """Fraction of a user's non-zero entries that appear among the recommendations.

    Args:
        k: Not used by the computation; present so that all metric functions
            can be called with the same arguments.
        topk: Column indices of the recommended items.
        actual: Sparse row whose non-zero entries mark the relevant items.

    Returns:
        Number of recommended columns that are non-zero in ``actual`` divided
        by the total number of non-zero entries in ``actual``. Returns 0.0
        when ``actual`` has no non-zero entries, since there is nothing to
        recall and the ratio would otherwise be 0 / 0.
    """
    n_relevant = actual.count_nonzero()
    if n_relevant == 0:
        return 0.0
    hits = actual[:, topk].count_nonzero()
    return hits / n_relevant

def precision_at_k(k: int, topk: np.ndarray, actual: scipy.sparse.csc_matrix):
    """Share of the k recommendation slots filled by a relevant item.

    Args:
        k: Number of recommendation slots; used as the denominator.
        topk: Column indices of the recommended items.
        actual: Sparse row whose non-zero entries mark the relevant items.

    Returns:
        Count of recommended columns that are non-zero in ``actual``,
        divided by ``k``.
    """
    hits = actual[:, topk].count_nonzero()
    return hits / k

def dcg(rel: np.ndarray):
    """Discounted cumulative gain of a ranked list of relevance values.

    Args:
        rel: Relevance values in ranked order (position 1 first).

    Returns:
        Sum over positions i = 1..n of (2**rel_i - 1) / log2(i + 1),
        or 0 for an empty list.
    """
    n = len(rel)
    if n < 1:
        return 0
    # Position i (1-based) is discounted by log2(i + 1).
    discounts = np.log2(np.arange(1, n + 1) + 1)
    gains = np.power(2, rel) - 1
    return (gains / discounts).sum()
    
def ndcg_at_k(k: int, topk: np.ndarray, actual: scipy.sparse.csc_matrix):
    """Normalised discounted cumulative gain of a ranked recommendation list.

    Args:
        k: Cut-off rank; at most the first ``k`` entries of ``topk`` are scored.
        topk: Recommended column indices, best first.
        actual: Sparse matrix holding one user's relevance values. Only its
            first row is read for the recommendations, so it is expected to
            be a single row.

    Returns:
        DCG of the recommendations divided by the ideal DCG at ``k``, or 0
        when the ideal DCG is not positive.
    """
    # Relevance of each recommended item in ranked order. Columns with no
    # stored entry come back as 0 from the sparse lookup.
    rel = actual[:, topk].toarray()[0][:k]
    # Pad short lists up to k; a pad width of zero is a no-op.
    rel = np.pad(rel, (0, max(0, k - len(rel))), 'constant')

    # The ideal ranking places the user's k largest relevance values first.
    # It is drawn from all of the user's entries, not only the retrieved
    # ones; otherwise a list with a single hit at rank 1 would score a
    # perfect 1.0 regardless of how many relevant items were missed.
    # Assumes `actual` keeps its stored values in `.data` (true for CSC/CSR).
    ideal = np.sort(actual.data)[::-1][:k]
    _idcg = dcg(ideal)
    if _idcg > 0:
        return dcg(rel) / _idcg
    return 0

In [None]:
# Scratch check of negative-step slicing: a stop index that lies beyond the
# start of the array is clamped, so the slice is the whole array reversed.
x = np.arange(10)
x[:-122:-1]

In [None]:
# Scratch check: type of the dense result when selecting the non-zero columns
# of row 1.
# NOTE(review): `_y` is only assigned in a later cell, so this raises
# NameError when the notebook is run top to bottom on a fresh kernel.
nonzero_cols = _y[1, :].nonzero()[1]
type(_y[1, nonzero_cols].toarray())

In [None]:
# Load the train matrix into `x` and the test matrix into `y` from their
# SciPy .npz sparse files.
x, y = (
    scipy.sparse.load_npz(path)
    for path in ('./data/train.npz', './data/test.npz')
)

In [None]:
# Iterations stop at 200 because test error was seen to increase after
# roughly 200-250 iterations.
M = MatrixFactorization(
    K=100,
    iterations=200,
)

In [None]:
# Train on `x` (passing `y` as well) and plot the two per-iteration curves
# that `M.train` returns.
e_train, e_test = M.train(x, y)

fig, ax = plt.subplots()
ax.plot(e_train, label="train")
ax.plot(e_test, label="test")
ax.set_xlabel("iteration")
ax.set_ylabel("MSE")
ax.legend()
plt.show()

# Write the model out through `M.save` once the plot has been shown.
M.save("checkpoint.model")

In [None]:
# Convert the test matrix to CSC format. The evaluation cells take one row at
# a time (`_y[i]`) and the metric functions then select columns from that row.
_y = y.tocsc()

In [None]:
# Score the top-10 lists from `M.recommend_sim` against each row of the test
# matrix. All three accumulators are initialised in this cell so that it does
# not depend on lists created by a later cell.
recall_scores = []
precision_scores = []
ndcg_scores = []
for i in tqdm(range(y.shape[0])):
    topk = M.recommend_sim(k=10, user=i)
    actual = _y[i]  # row i of the CSC test matrix
    recall_scores.append(recall_at_k(k=10, topk=topk, actual=actual))
    precision_scores.append(precision_at_k(k=10, topk=topk, actual=actual))
    ndcg_scores.append(ndcg_at_k(k=10, topk=topk, actual=actual))

In [None]:
# Per-user recall@10 from the `M.recommend_sim` evaluation, sorted ascending
# so the curve shows how recall is distributed across users.
plt.figure(figsize=(8, 4.5))
plt.plot(sorted(recall_scores), label="recall")
plt.xlabel("user (sorted by recall)")
plt.ylabel("recall@10")
plt.title("recommend_sim")
plt.legend()
plt.show()

In [None]:
# Mean of each metric over all evaluated users, one metric per line.
print(
    f'average recall: {np.mean(recall_scores)}',
    f'average precision: {np.mean(precision_scores)}',
    f'average ndcg: {np.mean(ndcg_scores)}',
    sep='\n',
)

In [None]:
# Score the top-10 lists from `M.recommend` against each row of the test
# matrix, starting from fresh accumulators.
recall_scores, precision_scores, ndcg_scores = [], [], []
for user in tqdm(range(y.shape[0])):
    recommended = M.recommend(k=10, user=user)
    test_row = _y[user]
    recall_scores.append(recall_at_k(k=10, topk=recommended, actual=test_row))
    precision_scores.append(precision_at_k(k=10, topk=recommended, actual=test_row))
    ndcg_scores.append(ndcg_at_k(k=10, topk=recommended, actual=test_row))

In [None]:
# Per-user recall@10 from the `M.recommend` evaluation. Sorted ascending, like
# the `recommend_sim` recall plot, so the two curves can be compared directly.
plt.figure(figsize=(8, 4.5))
plt.plot(sorted(recall_scores), label="recall")
plt.xlabel("user (sorted by recall)")
plt.ylabel("recall@10")
plt.title("recommend")
plt.legend()
plt.show()

In [None]:
# Mean of each metric over all evaluated users, one metric per line.
print(
    f'average recall: {np.mean(recall_scores)}',
    f'average precision: {np.mean(precision_scores)}',
    f'average ndcg: {np.mean(ndcg_scores)}',
    sep='\n',
)