# 2-推荐算法

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context="notebook", style="white",palette=sns.color_palette("RdBu"))
import numpy as np
import pandas as pd
import scipy.io as sio

Notes:

X - num_movies (1682)  x num_features (10) matrix of movie features  

Theta - num_users (943)  x num_features (10) matrix of user features  

Y - num_movies x num_users matrix of user ratings of movies  

R - num_movies x num_users matrix, where R(i, j) = 1 if the i-th movie was rated by the j-th user  

In [2]:
# Load the MATLAB ratings file; the result is a dict mapping variable names
# to arrays (the entries used later are 'Y' and 'R').
data = sio.loadmat('./data/ex8_movies.mat')
data

{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Thu Dec  1 17:19:26 2011',
 '__version__': '1.0',
 '__globals__': [],
 'Y': array([[5, 4, 0, ..., 5, 0, 0],
        [3, 0, 0, ..., 0, 0, 5],
        [4, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 'R': array([[1, 1, 0, ..., 1, 0, 0],
        [1, 0, 0, ..., 0, 0, 1],
        [1, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)}

In [3]:
# Pull the ratings matrix and the "was rated" mask out of the loaded dict
# and show both shapes.
Y, R = data['Y'], data['R']
(Y.shape, R.shape)

((1682, 943), (1682, 943))

In [4]:
def cost(params, Y, R, num_features):
    """Unregularized collaborative-filtering cost.

    Parameters
    ----------
    params : 1-D array-like
        Movie features followed by user features, each flattened row-major.
        Its length must be ``(num_movies + num_users) * num_features``;
        otherwise the reshape below raises ``ValueError``.
    Y : 2-D array-like, shape (num_movies, num_users)
        Ratings.
    R : array-like, same shape as ``Y``
        Multiplied element-wise into the prediction error, so entries where
        ``R`` is 0 contribute nothing to the cost.
    num_features : int
        Number of latent features per movie / per user.

    Returns
    -------
    scalar
        Half the sum of squared (masked) prediction errors.
    """
    # Plain ndarrays with the ``@`` operator replace np.matrix, which NumPy
    # no longer recommends; atleast_2d keeps np.matrix's "always 2-D" behavior.
    Y = np.atleast_2d(Y)
    R = np.atleast_2d(R)
    num_movies, num_users = Y.shape

    # Split the flat parameter vector back into the two factor matrices.
    params = np.asarray(params)
    split = num_movies * num_features
    X = params[:split].reshape(num_movies, num_features)      # (num_movies, num_features)
    Theta = params[split:].reshape(num_users, num_features)   # (num_users, num_features)

    # Prediction error, zeroed wherever R is 0.
    error = (X @ Theta.T - Y) * R

    return np.sum(error ** 2) / 2

In [5]:
# Load the parameter file and list the variable names it contains.
params_data = sio.loadmat('data/ex8_movieParams.mat')
params_data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'Theta', 'num_users', 'num_movies', 'num_features'])

In [6]:
# Movie-feature and user-feature matrices stored in the parameter file.
X, Theta = params_data['X'], params_data['Theta']
(X.shape, Theta.shape)

((1682, 10), (943, 10))

In [7]:
# Evaluate the cost on a small slice of the data (5 movies, 4 users,
# 3 features) so the number is easy to compare against a reference value.
users = 4
movies = 5
features = 3

X_sub, Theta_sub = X[:movies, :features], Theta[:users, :features]
Y_sub, R_sub = Y[:movies, :users], R[:movies, :users]

# Flat parameter vector: movie features first, then user features.
params_sub = np.concatenate([np.ravel(block) for block in (X_sub, Theta_sub)])

cost(params_sub, Y_sub, R_sub, features)

22.224603725685675

In [8]:
def gradient(params, Y, R, num_features):
    """Gradient of the unregularized collaborative-filtering cost.

    Parameters
    ----------
    params : 1-D array-like
        Movie features followed by user features, each flattened row-major.
        Its length must be ``(num_movies + num_users) * num_features``;
        otherwise the reshape below raises ``ValueError``.
    Y : 2-D array-like, shape (num_movies, num_users)
        Ratings.
    R : array-like, same shape as ``Y``
        Multiplied element-wise into the prediction error, so entries where
        ``R`` is 0 contribute nothing to the gradient.
    num_features : int
        Number of latent features per movie / per user.

    Returns
    -------
    numpy.ndarray
        1-D vector laid out like ``params``: the gradient with respect to the
        movie features followed by the gradient with respect to the user
        features.
    """
    # Plain ndarrays with the ``@`` operator replace np.matrix, which NumPy
    # no longer recommends; atleast_2d keeps np.matrix's "always 2-D" behavior.
    Y = np.atleast_2d(Y)
    R = np.atleast_2d(R)
    num_movies, num_users = Y.shape

    # Split the flat parameter vector back into the two factor matrices.
    params = np.asarray(params)
    split = num_movies * num_features
    X = params[:split].reshape(num_movies, num_features)      # (num_movies, num_features)
    Theta = params[split:].reshape(num_users, num_features)   # (num_users, num_features)

    # Prediction error, zeroed wherever R is 0.
    error = (X @ Theta.T - Y) * R

    # Calculate the gradients with respect to each factor matrix.
    X_grad = error @ Theta        # (num_movies, num_features)
    Theta_grad = error.T @ X      # (num_users, num_features)

    return np.concatenate((np.ravel(X_grad), np.ravel(Theta_grad)))

In [9]:
# Unregularized gradient on the small slice defined above.
grad = gradient(params_sub, Y_sub, R_sub, features)
grad    

array([ -2.52899165,   7.57570308,  -1.89979026,  -0.56819597,
         3.35265031,  -0.52339845,  -0.83240713,   4.91163297,
        -0.76677878,  -0.38358278,   2.26333698,  -0.35334048,
        -0.80378006,   4.74271842,  -0.74040871, -10.5680202 ,
         4.62776019,  -7.16004443,  -3.05099006,   1.16441367,
        -3.47410789,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ])

In [10]:
def regularized_cost(params, Y, R, num_features, learning_rate):
    """Collaborative-filtering cost plus an L2 penalty on every parameter.

    Despite its name, ``learning_rate`` is used here as the regularization
    weight: the penalty is ``learning_rate / 2`` times the sum of the
    squared entries of ``params``.
    """
    data_term = cost(params, Y, R, num_features)
    penalty = np.power(params, 2).sum() * (learning_rate / 2)
    return data_term + penalty

In [11]:
def regularized_gradient(params, Y, R, num_features, learning_rate):
    """Gradient of the cost with the L2 penalty term added.

    Despite its name, ``learning_rate`` is used here as the regularization
    weight: the penalty contributes ``learning_rate * params`` to the
    gradient.
    """
    data_grad = gradient(params, Y, R, num_features)
    return data_grad + learning_rate * params

In [12]:
# Regularized cost on the small slice; the last argument (1.5) is the value
# received by the `learning_rate` parameter, which scales the L2 penalty.
J = regularized_cost(params_sub, Y_sub, R_sub, features, 1.5)
J

31.34405624427422

In [13]:
# Flat parameter vector for the full data: movie features, then user features.
params = np.concatenate([np.ravel(block) for block in (X, Theta)])

In [14]:
# Regularized cost on the full data with 10 features and a penalty weight of 1.
J = regularized_cost(params, Y, R, 10, 1)
J

32520.682450229557

In [15]:
# Regularized gradient on the small slice, with a penalty weight of 1.5.
grad = regularized_gradient(params_sub, Y_sub, R_sub, features, 1.5)
grad

array([ -0.95596339,   6.97535514,  -0.10861109,   0.60308088,
         2.77421145,   0.25839822,   0.12985616,   4.0898522 ,
        -0.89247334,   0.29684395,   1.06300933,   0.66738144,
         0.60252677,   4.90185327,  -0.19747928, -10.13985478,
         2.10136256,  -6.76563628,  -2.29347024,   0.48244098,
        -2.99791422,  -0.64787484,  -0.71820673,   1.27006666,
         1.09289758,  -0.40784086,   0.49026541])

In [16]:
# Build {zero-based movie index: title} from lines of the form
# "<1-based id> <title>".
movie_idx = {}
# NOTE(review): the 'gbk' encoding is kept from the original; confirm it
# matches the file's actual encoding.
with open('data/movie_ids.txt', encoding='gbk') as f:  # closes the file when done
    for line in f:
        # Strip only the line terminator, so a final line without a trailing
        # newline does not lose its last character.
        line = line.rstrip('\n')
        if not line.strip():
            continue  # skip blank lines instead of failing on int('')
        # Everything after the first space is the title.
        movie_id, _, title = line.partition(' ')
        movie_idx[int(movie_id) - 1] = title


In [17]:
# Spot-check the mapping: title stored under index 0.
movie_idx[0]

'Toy Story (1995)'

# 增加新用户评分

In [18]:
# Ratings given by the new user, keyed by zero-based movie index; every
# movie not listed here stays at 0.
ratings = np.zeros(1682)

my_ratings = {
    0: 4,
    6: 3,
    11: 5,
    53: 4,
    63: 5,
    65: 3,
    68: 5,
    97: 2,
    182: 4,
    225: 5,
    354: 5,
}
for movie, stars in my_ratings.items():
    ratings[movie] = stars


In [19]:
# Re-read Y and R from the loaded dict so the matrices are back in their
# original (un-augmented) form.
Y, R = data.get('Y'), data.get('R')

In [20]:
# Prepend the new user as the first column of both matrices: the ratings go
# into Y, and the "has rated" flags (ratings != 0) go into R.
new_user_col = ratings.reshape(-1, 1)
Y = np.concatenate((new_user_col, Y), axis=1)
R = np.concatenate((new_user_col != 0, R), axis=1)

(Y.shape, R.shape)

((1682, 944), (1682, 944))

In [21]:
n_movie, n_user = Y.shape
n_features = 10
# NOTE: despite its name, this value is what regularized_cost and
# regularized_gradient receive as their penalty weight.
learning_rate = 10

# Draw the initial factors from a standard normal distribution using a
# seeded generator, so a fresh "Restart & Run All" reproduces the same fit.
rng = np.random.RandomState(0)
X = rng.randn(n_movie, n_features)
Theta = rng.randn(n_user, n_features)
params = np.r_[np.ravel(X), np.ravel(Theta)]

X.shape, Theta.shape, params.shape, type(X), type(Theta),type(params)

((1682, 10), (944, 10), (26260,), numpy.ndarray, numpy.ndarray, numpy.ndarray)

In [22]:
# Mean-normalize the ratings per movie, using only entries flagged in R.
Ymean = np.zeros((n_movie, 1))
Ynorm = np.zeros((n_movie, n_user))

for i in range(n_movie):
    rated = R[i, :] == 1          # boolean mask of users who rated movie i
    if not rated.any():
        # No ratings for this movie: keep its mean and normalized row at 0
        # rather than taking the mean of an empty slice (which yields NaN).
        continue
    Ymean[i] = Y[i, rated].mean()
    Ynorm[i, rated] = Y[i, rated] - Ymean[i]

Ynorm.mean()

5.462286541176089e-19

In [23]:
from scipy.optimize import minimize

# Fit the movie and user factors jointly: minimize regularized_cost over the
# flat parameter vector with the TNC method, using regularized_gradient as
# the analytic gradient. `args` supplies the extra arguments that follow the
# parameter vector; `learning_rate` here is the penalty weight.
fmin = minimize(fun=regularized_cost, x0=params, args=(Ynorm,R,n_features, learning_rate),
                method='TNC', jac=regularized_gradient)
fmin            

     fun: 38964.46785993085
     jac: array([ 4.06686362e-06,  1.98837351e-06, -1.48037759e-06, ...,
        1.05995924e-06, -1.44051972e-06,  3.54945951e-07])
 message: 'Converged (|f_n-f_(n-1)| ~= 0)'
    nfev: 577
     nit: 37
  status: 1
 success: True
       x: array([-0.22289008,  0.79372413,  0.00525525, ..., -0.41602382,
        0.20161449,  0.48608752])

In [24]:
# Split the optimized flat vector back into the movie and user factor
# matrices (movie features come first).
split = n_movie * n_features
X = np.matrix(fmin.x[:split].reshape(n_movie, n_features))
Theta = np.matrix(fmin.x[split:].reshape(n_user, n_features))

(X.shape, Theta.shape)

((1682, 10), (944, 10))

In [25]:
# X and Theta are np.matrix objects here, so `*` is a matrix product:
# one predicted (mean-normalized) rating per movie/user pair.
predictions = X * Theta.T
# Column 0 is the user whose ratings were prepended to Y; add the per-movie
# mean back to undo the normalization.
my_preds = predictions[:, 0] + Ymean
my_preds.shape

(1682, 1)

In [26]:
# Indices that sort the predictions along axis 0, reversed for descending order.
idx = np.argsort(my_preds, axis=0)[::-1]  # descending order
idx.shape

(1682, 1)

In [27]:
# Ten highest predicted ratings. Flatten the index first: indexing with the
# 2-D index array would add a spurious third dimension to the result.
my_preds[np.ravel(idx)][:10]

matrix([[[5.00000022]],

        [[5.00000003]],

        [[5.        ]],

        [[5.        ]],

        [[5.        ]],

        [[5.        ]],

        [[5.        ]],

        [[5.        ]],

        [[5.        ]],

        [[4.99999998]]])

In [28]:
# Flatten the sorted index to 1-D so its entries can be used as plain
# integer positions.
idx = np.ravel(idx)
idx

array([1121,  813, 1598, ..., 1365, 1582, 1580], dtype=int64)

In [29]:
print("TOP 10 movie predictions:")
for m in idx[:10]:
    # my_preds is 2-D (one column), so pick the scalar explicitly; calling
    # float() on a 1x1 array relies on a deprecated array-to-scalar conversion.
    rating = float(my_preds[m, 0])
    print('Predicted rating of {0} for movie {1}.'.format(str(rating), movie_idx[m]))

TOP 10 movie predictions:
Predicted rating of 5.000000215397042 for movie They Made Me a Criminal (1939).
Predicted rating of 5.00000002512515 for movie Great Day in Harlem, A (1994).
Predicted rating of 5.00000000026316 for movie Someone Else's America (1995).
Predicted rating of 5.00000000015999 for movie Saint of Fort Washington, The (1993).
Predicted rating of 4.999999999997084 for movie Marlene Dietrich: Shadow and Light (1996) .
Predicted rating of 4.999999999963111 for movie Star Kid (1997).
Predicted rating of 4.999999999934065 for movie Prefontaine (1997).
Predicted rating of 4.999999999809731 for movie Santa with Muscles (1996).
Predicted rating of 4.999999997416422 for movie Entertaining Angels: The Dorothy Day Story (1996).
Predicted rating of 4.999999978137029 for movie Aiqing wansui (1994).
