In [72]:
import numpy as np
# Dimensions of the rating matrices below: m rows and n columns.
# Columns are treated as movies later on (see movie_similarities); rows are
# presumably users (they are indexed by the `bu` bias vector).
m, n = 10, 5
# Ratings used for evaluation; np.nan marks a missing rating.
# Every rating that is observed in R_train appears here with the same value,
# plus ten more, so this matrix contains the training ratings as well.
R_test = np.array([
    [5.0, 4.0, 4.0, np.nan, 5.0],
    [np.nan, 3.0, 5.0, 3.0, 4.0],
    [5.0, 2.0, np.nan, 2.0, 3.0],
    [np.nan, 2.0, 3.0, 1.0, 2.0],
    [4.0, np.nan, 5.0, 4.0, 5.0],
    [5.0, 3.0, np.nan, 3.0, 5.0],
    [3.0, 2.0, 3.0, 2.0, np.nan],
    [5.0, 3.0, 4.0, np.nan, 5.0],
    [4.0, 2.0, 5.0, 4.0, np.nan],
    [5.0, np.nan, 5.0, 3.0, 4.0]])


# Ratings used to fit the models: the matrix above with one additional
# entry per row replaced by np.nan.
R_train = np.array([
    [5.0, 4.0, 4.0, np.nan, np.nan],
    [np.nan, 3.0, 5.0, np.nan, 4.0],
    [5.0, 2.0, np.nan, np.nan, 3.0],
    [np.nan, np.nan, 3.0, 1.0, 2.0],
    [4.0, np.nan, np.nan, 4.0, 5.0],
    [np.nan, 3.0, np.nan, 3.0, 5.0],
    [3.0, np.nan, 3.0, 2.0, np.nan],
    [5.0, np.nan, 4.0, np.nan, 5.0],
    [np.nan, 2.0, 5.0, 4.0, np.nan],
    [np.nan, np.nan, 5.0, 3.0, 4.0]])

# Global average rating, ignoring missing entries.
# NOTE(review): this is computed from R_test rather than R_train, so ratings
# hidden from training still influence every fitted model below — confirm
# whether that is intended before changing it (all recorded outputs depend on it).
mean = np.nanmean(R_test)
print(mean)

3.65


In [73]:
# Right-hand side of the least-squares system: each observed training rating
# minus the global mean, collected column by column (outer loop over the n
# columns, inner loop over the m rows).
c = np.array([
    R_train[row][col] - mean
    for col in range(n)
    for row in range(m)
    if not np.isnan(R_train[row][col])
])

In [74]:
# Design matrix with one row per observed training rating, visited column by
# column so the rows line up with the entries of c. Each row carries a 1 in
# the slot of the rating's row index (columns 0..m-1) and a 1 in the slot of
# its column index (columns m..m+n-1).
observed_cells = [
    (row, col)
    for col in range(n)
    for row in range(m)
    if not np.isnan(R_train[row][col])
]
A = np.zeros((len(c), m + n))
for count, (row, col) in enumerate(observed_cells):
    A[count][row] = 1
    A[count][m + col] = 1
# Leave `count` holding the number of observed ratings.
count = len(observed_cells)

# Baseline predictor (without regularization)

In [75]:
import scipy
# Solve the least-squares problem A x ~= c once and split the solution:
# the first m entries belong to the row slots of A (bu), the remaining n
# entries to the column slots (bi). Previously the identical system was
# solved twice, once per slice.
baseline_solution = np.linalg.lstsq(A, c)[0]
bu = baseline_solution[0:m]
bi = baseline_solution[m:]
print(bu)
print(bi)

[ 0.63148148  0.4290404  -0.27592593 -1.76919192  0.52584175  0.49806397
 -1.23341751  0.45681818  0.40547138  0.23080808]
[ 0.7290404  -1.18762626  0.61414141 -0.59292929  0.33636364]


In [76]:
# R_hat
# Baseline prediction for every (row, column) pair: global mean + row bias +
# column bias, built by broadcasting bu down the rows and bi across the
# columns, then clamped into the rating range [1, 5].
R_hat = np.clip(mean + bu[:, np.newaxis] + bi[np.newaxis, :], 1, 5)
R_hat

array([[ 5.        ,  3.09385522,  4.8956229 ,  3.68855219,  4.61784512],
       [ 4.80808081,  2.89141414,  4.69318182,  3.48611111,  4.41540404],
       [ 4.10311448,  2.18644781,  3.98821549,  2.78114478,  3.71043771],
       [ 2.60984848,  1.        ,  2.49494949,  1.28787879,  2.21717172],
       [ 4.90488215,  2.98821549,  4.78998316,  3.58291246,  4.51220539],
       [ 4.87710438,  2.96043771,  4.76220539,  3.55513468,  4.48442761],
       [ 3.1456229 ,  1.22895623,  3.03072391,  1.8236532 ,  2.75294613],
       [ 4.83585859,  2.91919192,  4.7209596 ,  3.51388889,  4.44318182],
       [ 4.78451178,  2.86784512,  4.66961279,  3.46254209,  4.39183502],
       [ 4.60984848,  2.69318182,  4.49494949,  3.28787879,  4.21717172]])

In [77]:
# Training error
# Root-mean-square error over the entries that are observed in R_train.
rated = ~np.isnan(R_train)
error = (R_train[rated] - R_hat[rated]) ** 2
np.sqrt(np.mean(error))

0.51261768724663193

In [78]:
# Testing error
# Root-mean-square error over the entries that are observed in R_test.
# R_test also contains every training rating, so this covers all known
# ratings rather than only the held-out ones.
rated = ~np.isnan(R_test)
error = (R_test[rated] - R_hat[rated]) ** 2
np.sqrt(np.mean(error))

0.53159585510217144

# Neighborhood model (without regularization)

In [79]:
from sklearn.metrics.pairwise import cosine_similarity as cs
def cosine_similarity(a, b):
    """Cosine similarity of two 1-D arrays over their co-observed positions.

    Only positions where neither ``a`` nor ``b`` is NaN contribute; both the
    dot product and the two norms are computed over that common subset.

    Parameters
    ----------
    a, b : numpy.ndarray
        1-D arrays in which NaN marks a missing value.

    Returns
    -------
    float or None
        ``None`` if the shapes differ. ``0.0`` if the arrays share no observed
        position or if either restricted vector has zero norm (previously the
        former raised ZeroDivisionError and the latter produced NaN).
        Otherwise the cosine similarity.
    """
    if a.shape != b.shape:
        return
    shared = [
        (x, y) for x, y in zip(a, b)
        if not (np.isnan(x) or np.isnan(y))
    ]
    dot = sum(x * y for x, y in shared)
    norm = (sum(x ** 2 for x, _ in shared) * sum(y ** 2 for _, y in shared)) ** 0.5
    if norm == 0:
        # Nothing to compare (or an all-zero vector): report "no similarity"
        # instead of dividing by zero.
        return 0.0
    return dot / norm

def find_neighbor(movie_similarities, i, L):
    """Return the ``L`` entries of row ``i`` with the largest absolute similarity.

    The entry at index ``i`` itself is excluded. Each result is a
    ``(similarity, index)`` tuple, ordered by decreasing ``abs(similarity)``;
    entries with equal magnitude keep their original index order.
    """
    candidates = [
        (similarity, index)
        for index, similarity in enumerate(movie_similarities[i])
        if index != i
    ]
    ranked = sorted(candidates, key=lambda pair: abs(pair[0]), reverse=True)
    return ranked[:L]

In [80]:
# Residual of every observed training rating after removing the baseline
# (global mean + row bias + column bias). Missing ratings stay NaN because
# NaN propagates through the subtraction. Unlike the earlier
# zero-fill-then-mask approach, an observed rating whose residual is exactly 0
# is kept as an observation instead of being turned into "missing".
R_tilda = R_train - mean - bu[:, np.newaxis] - bi[np.newaxis, :]
R_tilda

array([[-0.01052189,  0.90614478, -0.8956229 ,         nan,         nan],
       [        nan,  0.10858586,  0.30681818,         nan, -0.41540404],
       [ 0.89688552, -0.18644781,         nan,         nan, -0.71043771],
       [        nan,         nan,  0.50505051, -0.28787879, -0.21717172],
       [-0.90488215,         nan,         nan,  0.41708754,  0.48779461],
       [        nan,  0.03956229,         nan, -0.55513468,  0.51557239],
       [-0.1456229 ,         nan, -0.03072391,  0.1763468 ,         nan],
       [ 0.16414141,         nan, -0.7209596 ,         nan,  0.55681818],
       [        nan, -0.86784512,  0.33038721,  0.53745791,         nan],
       [        nan,         nan,  0.50505051, -0.28787879, -0.21717172]])

In [81]:
# Pairwise similarity between the columns of the residual matrix; entry
# [p][q] compares column p with column q.
movie_similarities = np.zeros((n, n))
for p in range(n):
    movie_similarities[p] = [
        cosine_similarity(R_tilda[:, p], R_tilda[:, q]) for q in range(n)
    ]
movie_similarities

array([[ 1.        , -0.21301359, -0.41335723, -0.97123137, -0.74899568],
       [-0.21301359,  1.        , -0.84334288, -0.72757202,  0.505802  ],
       [-0.41335723, -0.84334288,  1.        , -0.21614403, -0.92916674],
       [-0.97123137, -0.72757202, -0.21614403,  1.        ,  0.06791977],
       [-0.74899568,  0.505802  , -0.92916674,  0.06791977,  1.        ]])

In [82]:
# R_hat
# Neighbourhood prediction: baseline (mean + row bias + column bias) plus a
# similarity-weighted average of the row's residuals on the L most similar
# columns, clamped into the rating range [1, 5].
L = 2
# The neighbour list depends only on the column, so build it once per column
# instead of re-sorting the similarities for every (row, column) pair.
neighbors_by_movie = [find_neighbor(movie_similarities, j, L) for j in range(n)]
R_hat = np.zeros((m, n))
for i in range(m):
    for j in range(n):
        prediction = mean + bu[i] + bi[j]
        weighted_sum = 0
        weight_total = 0
        for similarity, k in neighbors_by_movie[j]:
            # Only neighbours with an observed residual in this row contribute.
            if not np.isnan(R_tilda[i][k]):
                weighted_sum += similarity * R_tilda[i][k]
                weight_total += abs(similarity)
        # Skip the correction when no neighbour contributed (avoids 0 / 0).
        if weighted_sum != 0 and weight_total != 0:
            prediction += weighted_sum / weight_total
        R_hat[i][j] = np.clip(prediction, 1, 5)
R_hat

array([[ 5.        ,  3.98947811,  3.98947811,  3.30647949,  5.        ],
       [ 5.        ,  2.58459596,  4.85927655,  3.37752525,  4.10858586],
       [ 4.81355219,  2.18644781,  4.44934389,  2.34823468,  2.81355219],
       [ 2.86694105,  1.        ,  2.71212121,  1.28787879,  1.71212121],
       [ 4.45700839,  2.57112795,  4.30218855,  4.48779461,  5.        ],
       [ 4.96604771,  3.51557239,  4.47311398,  3.51557239,  4.48442761],
       [ 2.96927609,  1.16377497,  3.03072391,  1.96927609,  2.83495163],
       [ 4.2790404 ,  3.64015152,  4.16414141,  3.34974747,  4.76910407],
       [ 4.24705387,  2.44155275,  5.        ,  4.33038721,  4.06144781],
       [ 4.86694105,  2.55537792,  4.71212121,  3.28787879,  3.71212121]])

In [83]:
# Training error
# Root-mean-square error over the entries that are observed in R_train.
known = ~np.isnan(R_train)
error = (R_train[known] - R_hat[known]) ** 2
np.sqrt(np.mean(error))

0.31396118989196564

In [84]:
# Testing error
# Root-mean-square error over the entries that are observed in R_test
# (which include the training ratings as well as the held-out ones).
known = ~np.isnan(R_test)
error = (R_test[known] - R_hat[known]) ** 2
np.sqrt(np.mean(error))

0.38395448005987165

# Baseline predictor (with regularization)

In [85]:
from scipy.sparse.linalg import lsmr
# Damped (regularised) least squares via LSMR with damp=0.7. Run the solver
# once and split the solution: the first m entries belong to the row slots
# of A (bu), the remaining n entries to the column slots (bi). Previously
# the identical solve was executed twice, once per slice.
regularized_solution = lsmr(A, c, damp=0.7)[0]
bu = regularized_solution[0:m]
bi = regularized_solution[m:]
print(bu)
print(bi)

[ 0.53279828  0.3434698  -0.25882283 -1.49968523  0.47681866  0.38106955
 -1.02382381  0.43921891  0.31309182  0.21951245]
[ 0.65278932 -1.0130432   0.5507899  -0.58042937  0.31354094]


In [86]:
# R_hat
# Baseline prediction from the regularised biases: global mean + row bias +
# column bias for every cell, clamped into the rating range [1, 5].
row_part = mean + bu[:, np.newaxis]
R_hat = np.clip(row_part + bi[np.newaxis, :], 1, 5)
R_hat

array([[ 4.8355876 ,  3.16975508,  4.73358818,  3.6023689 ,  4.49633921],
       [ 4.64625912,  2.9804266 ,  4.54425971,  3.41304043,  4.30701074],
       [ 4.0439665 ,  2.37813398,  3.94196708,  2.8107478 ,  3.70471811],
       [ 2.80310409,  1.13727157,  2.70110467,  1.56988539,  2.4638557 ],
       [ 4.77960798,  3.11377547,  4.67760857,  3.54638929,  4.4403596 ],
       [ 4.68385888,  3.01802636,  4.58185946,  3.45064018,  4.34461049],
       [ 3.27896551,  1.61313299,  3.17696609,  2.04574682,  2.93971713],
       [ 4.74200823,  3.07617571,  4.64000881,  3.50878954,  4.40275985],
       [ 4.61588114,  2.95004862,  4.51388172,  3.38266244,  4.27663275],
       [ 4.52230177,  2.85646925,  4.42030235,  3.28908307,  4.18305339]])

In [87]:
# Training error
# Root-mean-square error over the entries that are observed in R_train.
has_rating = ~np.isnan(R_train)
error = (R_train[has_rating] - R_hat[has_rating]) ** 2
np.sqrt(np.mean(error))

0.52967381128831903

In [88]:
# Testing error
# Root-mean-square error over the entries that are observed in R_test
# (which include the training ratings as well as the held-out ones).
has_rating = ~np.isnan(R_test)
error = (R_test[has_rating] - R_hat[has_rating]) ** 2
np.sqrt(np.mean(error))

0.5293716736950409

# Neighborhood model (with regularization)

In [89]:
from sklearn.metrics.pairwise import cosine_similarity as cs
def cosine_similarity(a, b):
    """Cosine similarity of two 1-D arrays over their co-observed positions.

    NOTE: a function with this name is also defined in an earlier cell; this
    definition replaces it from here on.

    Only positions where neither ``a`` nor ``b`` is NaN contribute; both the
    dot product and the two norms are computed over that common subset.

    Parameters
    ----------
    a, b : numpy.ndarray
        1-D arrays in which NaN marks a missing value.

    Returns
    -------
    float or None
        ``None`` if the shapes differ. ``0.0`` if the arrays share no observed
        position or if either restricted vector has zero norm (previously the
        former raised ZeroDivisionError and the latter produced NaN).
        Otherwise the cosine similarity.
    """
    if a.shape != b.shape:
        return
    shared = [
        (x, y) for x, y in zip(a, b)
        if not (np.isnan(x) or np.isnan(y))
    ]
    dot = sum(x * y for x, y in shared)
    norm = (sum(x ** 2 for x, _ in shared) * sum(y ** 2 for _, y in shared)) ** 0.5
    if norm == 0:
        # Nothing to compare (or an all-zero vector): report "no similarity"
        # instead of dividing by zero.
        return 0.0
    return dot / norm

def find_neighbor(movie_similarities, i, L):
    """Return the ``L`` entries of row ``i`` with the largest absolute similarity.

    NOTE: a function with this name is also defined in an earlier cell; this
    definition replaces it from here on.

    The entry at index ``i`` itself is excluded. Each result is a
    ``(similarity, index)`` tuple, ordered by decreasing ``abs(similarity)``;
    entries with equal magnitude keep their original index order.
    """
    others = []
    for index, similarity in enumerate(movie_similarities[i]):
        if index == i:
            continue
        others.append((similarity, index))
    return sorted(others, key=lambda entry: abs(entry[0]), reverse=True)[:L]

In [90]:
# Residual of every observed training rating after removing the regularised
# baseline (global mean + row bias + column bias). Missing ratings stay NaN
# because NaN propagates through the subtraction. Unlike the earlier
# zero-fill-then-mask approach, an observed rating whose residual is exactly 0
# is kept as an observation instead of being turned into "missing".
R_tilda = R_train - mean - bu[:, np.newaxis] - bi[np.newaxis, :]

# Pairwise similarity between the columns of the residual matrix; entry
# [p][q] compares column p with column q.
movie_similarities = np.zeros((n, n))
for p in range(n):
    column_p = R_tilda[:, p]
    movie_similarities[p] = [
        cosine_similarity(column_p, R_tilda[:, q]) for q in range(n)
    ]
movie_similarities

array([[ 1.        , -0.25424624, -0.57695792, -0.90297969, -0.70229018],
       [-0.25424624,  1.        , -0.84920421, -0.79636666,  0.6493644 ],
       [-0.57695792, -0.84920421,  1.        , -0.04013037, -0.89791815],
       [-0.90297969, -0.79636666, -0.04013037,  1.        ,  0.30638918],
       [-0.70229018,  0.6493644 , -0.89791815,  0.30638918,  1.        ]])

In [91]:
# R_hat
# Neighbourhood prediction on top of the regularised baseline: mean + row
# bias + column bias, plus a similarity-weighted average of the row's
# residuals on the L most similar columns, clamped into the range [1, 5].
L = 2
# The neighbour list depends only on the column, so build it once per column
# instead of re-sorting the similarities for every (row, column) pair.
neighbors_by_movie = [find_neighbor(movie_similarities, j, L) for j in range(n)]
R_hat = np.zeros((m, n))
for i in range(m):
    for j in range(n):
        prediction = mean + bu[i] + bi[j]
        weighted_sum = 0
        weight_total = 0
        for similarity, k in neighbors_by_movie[j]:
            # Only neighbours with an observed residual in this row contribute.
            if not np.isnan(R_tilda[i][k]):
                weighted_sum += similarity * R_tilda[i][k]
                weight_total += abs(similarity)
        # Skip the correction when no neighbour contributed (avoids 0 / 0).
        if weighted_sum != 0 and weight_total != 0:
            prediction += weighted_sum / weight_total
        R_hat[i][j] = np.clip(prediction, 1, 5)
R_hat

array([[ 4.8355876 ,  3.90334326,  3.90334326,  3.12592664,  4.83581809],
       [ 4.95326986,  2.52468631,  4.69253135,  3.39346703,  3.85127044],
       [ 4.74868461,  2.37813398,  4.4879461 ,  2.47994676,  2.74868461],
       [ 3.32660251,  1.25881878,  3.16496037,  1.56988539,  2.16496037],
       [ 4.2796103 ,  2.66016475,  4.11796817,  4.32599727,  5.        ],
       [ 4.65062167,  3.46866654,  4.25378966,  3.46866654,  4.34461049],
       [ 3.32471233,  1.7265961 ,  3.17696609,  2.32471233,  3.16144814],
       [ 4.14476808,  3.71618453,  4.04276866,  3.25079777,  4.64865936],
       [ 3.99854358,  2.40042736,  5.        ,  4.33271106,  3.79051447],
       [ 4.76499787,  2.69721415,  4.60335574,  3.28908307,  3.60335574]])

In [92]:
# Training error
# Root-mean-square error over the entries that are observed in R_train.
present = ~np.isnan(R_train)
error = (R_train[present] - R_hat[present]) ** 2
np.sqrt(np.mean(error))

0.35861901465212359

In [93]:
# Testing error
# Root-mean-square error over the entries that are observed in R_test
# (which include the training ratings as well as the held-out ones).
present = ~np.isnan(R_test)
error = (R_test[present] - R_hat[present]) ** 2
np.sqrt(np.mean(error))

0.39883085002588275