In [2]:
import numpy as np
from scipy.sparse.linalg import lsmr

m, n = 10, 5
R_test = np.array([
    [5.0, 4.0, 4.0, np.nan, 5.0],
    [np.nan, 3.0, 5.0, 3.0, 4.0],
    [5.0, 2.0, np.nan, 2.0, 3.0],
    [np.nan, 2.0, 3.0, 1.0, 2.0],
    [4.0, np.nan, 5.0, 4.0, 5.0],
    [5.0, 3.0, np.nan, 3.0, 5.0],
    [3.0, 2.0, 3.0, 2.0, np.nan],
    [5.0, 3.0, 4.0, np.nan, 5.0],
    [4.0, 2.0, 5.0, 4.0, np.nan],
    [5.0, np.nan, 5.0, 3.0, 4.0]])


R_train = np.array([
    [5.0, 4.0, 4.0, np.nan, np.nan],
    [np.nan, 3.0, 5.0, np.nan, 4.0],
    [5.0, 2.0, np.nan, np.nan, 3.0],
    [np.nan, np.nan, 3.0, 1.0, 2.0],
    [4.0, np.nan, np.nan, 4.0, 5.0],
    [np.nan, 3.0, np.nan, 3.0, 5.0],
    [3.0, np.nan, 3.0, 2.0, np.nan],
    [5.0, np.nan, 4.0, np.nan, 5.0],
    [np.nan, 2.0, 5.0, 4.0, np.nan],
    [np.nan, np.nan, 5.0, 3.0, 4.0]])

mean = np.nanmean(R_test)
print(mean)

3.65


In [3]:
c = []
for j in range(n):
    for i in range(m):
        if not np.isnan(R_train[i][j]):
            c.append(R_train[i][j] - mean)
c = np.array(c)

In [4]:
A = np.zeros((len(c), m + n))
count = 0
for j in range(n):
    for i in range(m):
        if not np.isnan(R_train[i][j]):
            A[count][i] = 1
            A[count][m + j] = 1
            count += 1

# Baseline predictor (with regularization)

In [5]:
bu = lsmr(A, c, damp=1)[0][0:m]
bi = lsmr(A, c, damp=1)[0][m:]
print(bu)
print(bi)

[ 0.46061099  0.28519745 -0.2376656  -1.2960214   0.43111554  0.30016353
 -0.87060785  0.41614946  0.24844013  0.20397859]
[ 0.59173307 -0.88445778  0.50028151 -0.55958139  0.29338544]


In [6]:
# R_hat
R_hat = np.zeros((m, n))
for i in range(m):
    for j in range(n):
        R_hat[i][j] = mean + bu[i] + bi[j]
        if R_hat[i][j] >= 5:
            R_hat[i][j] = 5
        if R_hat[i][j] <= 1:
            R_hat[i][j] = 1
R_hat

array([[ 4.70234406,  3.22615321,  4.61089251,  3.5510296 ,  4.40399643],
       [ 4.52693052,  3.05073967,  4.43547896,  3.37561606,  4.22858289],
       [ 4.00406747,  2.52787662,  3.91261591,  2.85275301,  3.70571984],
       [ 2.94571167,  1.46952082,  2.85426012,  1.79439721,  2.64736404],
       [ 4.67284861,  3.19665776,  4.58139705,  3.52153415,  4.37450098],
       [ 4.5418966 ,  3.06570575,  4.45044505,  3.39058214,  4.24354897],
       [ 3.37112522,  1.89493437,  3.27967366,  2.21981076,  3.07277759],
       [ 4.65788253,  3.18169167,  4.56643097,  3.50656806,  4.35953489],
       [ 4.4901732 ,  3.01398235,  4.39872165,  3.33885874,  4.19182557],
       [ 4.44571166,  2.96952081,  4.35426011,  3.2943972 ,  4.14736403]])

In [7]:
# Training error
error = []
for i in range(m):
    for j in range(n):
        if not np.isnan(R_train[i][j]):
            error.append((R_train[i][j] - R_hat[i][j]) ** 2)
np.sqrt(np.mean(error))

0.56393304764012153

In [8]:
# Testing error
error = []
for i in range(m):
    for j in range(n):
        if not np.isnan(R_test[i][j]) and np.isnan(R_train[i][j]):
            error.append((R_test[i][j] - R_hat[i][j]) ** 2)
np.sqrt(np.mean(error))

0.49815124768594571

# Neighborhood model (with regularization)

In [10]:
from sklearn.metrics.pairwise import cosine_similarity as cs
def cosine_similarity(a, b):
    if a.shape != b.shape:
        return
    a_ = []
    b_ = []
    for i in range(len(a)):
        if not np.isnan(a[i]) and not np.isnan(b[i]):
            a_.append(a[i])
            b_.append(b[i])
    m = sum([a_[i] * b_[i] for i in range(len(a_))])
    n = (sum([i ** 2 for i in a_]) * sum([i ** 2 for i in b_])) ** 0.5
    return m / n

In [22]:
def find_neighbor(sims, i, threshould):
    return [(sims[i][j], j) for j in range(len(sims[i])) if abs(sims[i][j]) >= threshould]

In [12]:
R_tilda = np.zeros((m, n))
for i in range(m):
    for j in range(n):
        if not np.isnan(R_train[i][j]):
            R_tilda[i][j] = R_train[i][j] - mean - bu[i] - bi[j]
R_tilda[R_tilda==0] = np.nan
R_tilda

array([[ 0.29765594,  0.77384679, -0.61089251,         nan,         nan],
       [        nan, -0.05073967,  0.56452104,         nan, -0.22858289],
       [ 0.99593253, -0.52787662,         nan,         nan, -0.70571984],
       [        nan,         nan,  0.14573988, -0.79439721, -0.64736404],
       [-0.67284861,         nan,         nan,  0.47846585,  0.62549902],
       [        nan, -0.06570575,         nan, -0.39058214,  0.75645103],
       [-0.37112522,         nan, -0.27967366, -0.21981076,         nan],
       [ 0.34211747,         nan, -0.56643097,         nan,  0.64046511],
       [        nan, -1.01398235,  0.60127835,  0.66114126,         nan],
       [        nan,         nan,  0.64573989, -0.2943972 , -0.14736403]])

In [13]:
movie_similarities = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        movie_similarities[i][j] = cosine_similarity(R_tilda[:, i], R_tilda[:, j])
movie_similarities

array([[ 1.        , -0.30336445, -0.52787214, -0.59406   , -0.63500842],
       [-0.30336445,  1.        , -0.84801683, -0.82628642,  0.59070433],
       [-0.52787214, -0.84801683,  1.        ,  0.14898616, -0.69054729],
       [-0.59406   , -0.82628642,  0.14898616,  1.        ,  0.45191931],
       [-0.63500842,  0.59070433, -0.69054729,  0.45191931,  1.        ]])

In [44]:
user_similarities = np.zeros((m, m))
for i in range(m):
    for j in range(m):
        user_similarities[i][j] = cosine_similarity(R_tilda[i, :], R_tilda[j, :])
user_similarities

array([[ 1.        , -0.68739652, -0.11989561, -1.        , -1.        ,
        -1.        ,  0.19121174,  0.99595697, -0.99116703, -1.        ],
       [-0.68739652,  1.        ,  0.91153905,  0.56972604, -1.        ,
        -0.95382402, -1.        , -0.89519492,  0.58500561,  0.98716847],
       [-0.11989561,  0.91153905,  1.        ,  1.        , -0.99123876,
        -0.74593327, -1.        , -0.12553501,  1.        ,  1.        ],
       [-1.        ,  0.56972604,  1.        ,  1.        , -0.972738  ,
        -0.20565976,  0.46592141, -0.87628689, -0.60625246,  0.56431635],
       [-1.        , -1.        , -0.99123876, -0.972738  ,  1.        ,
         0.42700255,  0.40587059,  0.25547255,  1.        , -0.89882792],
       [-1.        , -0.95382402, -0.74593327, -0.20565976,  0.42700255,
         1.        ,  1.        ,  1.        , -0.39964758,  0.01253267],
       [ 0.19121174, -1.        , -1.        ,  0.46592141,  0.40587059,
         1.        ,  1.        ,  0.10226455

In [67]:
# R_hat using movie similarities
R_hat = np.zeros((m, n))
L = 0.95
for i in range(m):
    for j in range(n):
        R_hat[i][j] += mean + bu[i] + bi[j]
        neighbors = find_neighbor(movie_similarities, j, L)
        a = 0
        b = 0
        for k in range(len(neighbors)):
            if not np.isnan(R_tilda[i][neighbors[k][1]]):
                a += neighbors[k][0] * R_tilda[i][neighbors[k][1]]
                b += abs(neighbors[k][0])
        if a != 0 and b != 0:
            R_hat[i][j] += a / b
        if R_hat[i][j] <= 1:
            R_hat[i][j] = 1 
        if R_hat[i][j] >= 5:
                R_hat[i][j] = 5
R_hat

array([[ 5.        ,  4.        ,  4.        ,  3.5510296 ,  4.40399643],
       [ 4.52693052,  3.        ,  5.        ,  3.37561606,  4.        ],
       [ 5.        ,  2.        ,  3.91261591,  2.85275301,  3.        ],
       [ 2.94571167,  1.46952082,  3.        ,  1.        ,  2.        ],
       [ 4.        ,  3.19665776,  4.58139705,  4.        ,  5.        ],
       [ 4.5418966 ,  3.        ,  4.45044505,  3.        ,  5.        ],
       [ 3.        ,  1.89493437,  3.        ,  2.        ,  3.07277759],
       [ 5.        ,  3.18169167,  4.        ,  3.50656806,  5.        ],
       [ 4.4901732 ,  2.        ,  5.        ,  4.        ,  4.19182557],
       [ 4.44571166,  2.96952081,  5.        ,  3.        ,  4.        ]])

In [68]:
# Training error
error = []
for i in range(m):
    for j in range(n):
        if not np.isnan(R_train[i][j]):
            error.append((R_train[i][j] - R_hat[i][j]) ** 2)
np.sqrt(np.mean(error))

3.1401849173675503e-16

In [69]:
# Testing error
error = []
for i in range(m):
    for j in range(n):
        if not np.isnan(R_test[i][j]) and np.isnan(R_train[i][j]):
            error.append((R_test[i][j] - R_hat[i][j]) ** 2)
np.sqrt(np.mean(error))

0.49815124768594571

In [96]:
# R_hat using user similarities
R_hat = np.zeros((m, n))
L = 0.95
for i in range(m):
    for j in range(n):
        R_hat[i][j] += mean + bu[i] + bi[j]
        neighbors = find_neighbor(user_similarities, j, L)
        a = 0
        b = 0
        for k in range(len(neighbors)):
            if not np.isnan(R_tilda[neighbors[k][1]][j]):
                a += neighbors[k][0] * R_tilda[neighbors[k][1]][j]
                b += abs(neighbors[k][0])
        if a != 0 and b != 0:
            R_hat[i][j] += a / b
        if R_hat[i][j] <= 1:
            R_hat[i][j] = 1 
        if R_hat[i][j] >= 5:
                R_hat[i][j] = 5
R_hat

array([[ 5.        ,  3.23226024,  5.        ,  2.91241509,  4.95479037],
       [ 4.96459997,  3.0568467 ,  4.85358691,  2.73700154,  4.77937683],
       [ 4.44173692,  2.53398365,  4.33072386,  2.21413849,  4.25651378],
       [ 3.38338112,  1.47562785,  3.27236806,  1.1557827 ,  3.19815798],
       [ 5.        ,  3.20276479,  4.999505  ,  2.88291963,  4.92529492],
       [ 4.97956605,  3.07181278,  4.86855299,  2.75196763,  4.79434291],
       [ 3.80879466,  1.90104139,  3.69778161,  1.58119624,  3.62357152],
       [ 5.        ,  3.1877987 ,  4.98453892,  2.86795355,  4.91032883],
       [ 4.92784265,  3.02008938,  4.81682959,  2.70024423,  4.74261951],
       [ 4.88338111,  2.97562784,  4.77236805,  2.65578269,  4.69815797]])

In [97]:
# Training error
error = []
for i in range(m):
    for j in range(n):
        if not np.isnan(R_train[i][j]):
            error.append((R_train[i][j] - R_hat[i][j]) ** 2)
np.sqrt(np.mean(error))

0.68334052283219149

In [98]:
# Testing error
error = []
for i in range(m):
    for j in range(n):
        if not np.isnan(R_test[i][j]) and np.isnan(R_train[i][j]):
            error.append((R_test[i][j] - R_hat[i][j]) ** 2)
np.sqrt(np.mean(error))

0.36221601889657457