## 吴恩达机器学习习题八：推荐系统<br><br>
- 给用户推荐电影

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
from scipy.optimize import minimize

In [2]:
# Load the ratings data and the pre-trained parameters shipped with the exercise.
data = sio.loadmat('./data/ex8_movies.mat')
y = data['Y']  # (1682, 943)
R = data['R']  # (1682, 943)
pram_data = sio.loadmat('./data/ex8_movieParams.mat')
X = pram_data['X']  # (1682, 10)
Theta = pram_data['Theta']  # (943, 10)
# loadmat hands MATLAB scalars back as arrays, and calling int() directly on
# an array with ndim > 0 is deprecated in NumPy >= 1.25. .item() extracts the
# single element as a plain Python number first.
N_u = int(pram_data['num_users'].item())  # 943 users
N_m = int(pram_data['num_movies'].item())  # 1682 movies
N_f = int(pram_data['num_features'].item())  # 10 features
print(Theta.shape)

(943, 10)


In [3]:
# Serialize the weight parameters
def serialize(X, Theta):
    """Pack two parameter arrays into a single 1-D vector.

    The result holds every element of ``X`` in row-major order followed by
    every element of ``Theta`` in row-major order.
    """
    flat_parts = (X.ravel(), Theta.ravel())
    return np.concatenate(flat_parts)

In [4]:
# Deserialize the weight parameters
def deserialize(serialize1, N_m, N_u, N_f):
    """Split a packed parameter vector back into its two matrices.

    The first ``N_m * N_f`` entries become an ``(N_m, N_f)`` array and the
    remaining entries become an ``(N_u, N_f)`` array.

    Returns:
        Tuple ``(X, Theta)``.
    """
    boundary = N_m * N_f
    x_flat, theta_flat = serialize1[:boundary], serialize1[boundary:]
    return x_flat.reshape(N_m, N_f), theta_flat.reshape(N_u, N_f)

In [5]:
# Cost function
def cost_function(serialize1, y, R, N_m, N_u, N_f, lamda):
    """Return the regularised squared-error cost for a packed parameter vector.

    Args:
        serialize1: 1-D vector unpacked by ``deserialize`` into ``X`` and ``Theta``.
        y: Target matrix compared against ``X @ Theta.T``.
        R: Mask multiplied element-wise into the error, so entries where it
            is zero do not contribute.
        N_m, N_u, N_f: Dimensions forwarded to ``deserialize``.
        lamda: Weight applied to the squared-parameter penalty terms.

    Returns:
        Half the sum of the masked squared error and both penalty terms.
    """
    movie_feats, user_params = deserialize(serialize1, N_m, N_u, N_f)
    masked_err = (movie_feats @ user_params.T - y) * R
    data_term = np.power(masked_err, 2).sum()
    x_penalty = lamda * np.power(movie_feats, 2).sum()
    theta_penalty = lamda * np.power(user_params, 2).sum()
    return 0.5 * (data_term + x_penalty + theta_penalty)

In [6]:
# Evaluate the unregularised cost on a small slice of the loaded data.
users, movies, features = 4, 5, 3
X_sub = X[:movies, :features]
Theta_sub = Theta[:users, :features]
Y_sub = y[:movies, :users]
R_sub = R[:movies, :users]
params_sub = serialize(X_sub, Theta_sub)
cost1 = cost_function(params_sub, Y_sub, R_sub, movies, users, features, lamda=0)
cost1

22.224603725685675

In [7]:
# Gradient vector
def gradient(serialize1, y, R, N_m, N_u, N_f, lamda):
    """Return the gradient of ``cost_function`` as a packed 1-D vector.

    Args:
        serialize1: 1-D vector unpacked by ``deserialize`` into ``X`` and ``Theta``.
        y: Target matrix compared against ``X @ Theta.T``.
        R: Mask multiplied element-wise into the error.
        N_m, N_u, N_f: Dimensions forwarded to ``deserialize``.
        lamda: Regularisation weight.

    Returns:
        ``serialize(grad_X, grad_Theta)``, laid out like ``serialize1``.
    """
    X2, Theta2 = deserialize(serialize1, N_m, N_u, N_f)
    # The masked error is shared by both partial derivatives; compute the
    # (N_m, N_u) product once instead of once per gradient.
    masked_err = (X2 @ Theta2.T - y) * R
    grad_X = masked_err @ Theta2 + lamda * X2
    grad_Theta = masked_err.T @ X2 + lamda * Theta2
    return serialize(grad_X, grad_Theta)

In [8]:
# Add a new user
my_ratings = np.zeros((N_m, 1))
# Row positions in my_ratings and the rating given at each one.
rated_rows = [9, 66, 96, 121, 148, 285, 490, 599, 643, 958, 1117]
rated_scores = [5, 5, 5, 4, 4, 3, 4, 4, 4, 5, 3]
my_ratings[rated_rows, 0] = rated_scores
# Append the new user as the last column of both matrices.
y = np.c_[y, my_ratings]  # (1682, 944)
R = np.c_[R, my_ratings != 0]  # (1682, 944)
N_m, N_u = y.shape

In [9]:
# Mean normalisation
def normalizeRatings(y, R):
    """Subtract each row's mean rating, using only the rated entries.

    Args:
        y: 2-D ratings array (one row per movie).
        R: Array of the same shape; non-zero where a rating exists.

    Returns:
        Tuple ``(y_norm, y_means)``. ``y_means`` is a column vector holding
        each row's mean over its rated entries (0 for a row with no ratings).
        ``y_norm`` is ``(y - y_means) * R``, so unrated entries stay 0.
    """
    counts = R.sum(axis=1)
    # Mask before summing so values stored at unrated positions cannot skew
    # the mean.
    totals = (y * R).sum(axis=1)
    # A row with no ratings would divide by zero and yield NaN, which then
    # survives the multiplication by R (NaN * 0 is NaN). Its total is 0, so
    # dividing by 1 instead gives a mean of 0.
    safe_counts = np.where(counts == 0, 1, counts)
    y_means = (totals / safe_counts).reshape(-1, 1)
    y_norm = (y - y_means) * R
    return y_norm, y_means

In [10]:
# Mean-normalise the ratings; y_means is kept so it can be added back to the predictions later.
y_norm,y_means = normalizeRatings(y,R)

In [11]:
# Random starting point for the optimiser and the regularisation weight.
lamda = 10
X = np.random.random(size=(N_m, N_f))
Theta = np.random.random(size=(N_u, N_f))
serialize1 = serialize(X, Theta)

In [12]:
# Optimisation
result = minimize(
    fun=cost_function,
    x0=serialize1,
    args=(y_norm, R, N_m, N_u, N_f, lamda),
    method='TNC',
    jac=gradient,
    # TNC limits work through `maxfun` (max function evaluations). `maxiter`
    # was deprecated for this solver in SciPy 1.9 and later removed, so on
    # current SciPy it is ignored with an "unknown solver options" warning
    # and the run falls back to the default budget of max(100, 10*len(x0)).
    options={'maxfun': 100},
)

In [16]:
best_serialize = result.x
best_X, best_Theat = deserialize(best_serialize, N_m, N_u, N_f)
all_scores = best_X @ best_Theat.T
# Take the last column (the predictions for the last user) and add the
# per-movie means back to undo the normalisation.
y_predict = all_scores[:, -1] + y_means.flatten()  # (1682,)
# Negating the scores makes argsort return indices from highest to lowest prediction.
index = np.argsort(-y_predict)
index[:10]

(1682,)


array([1535, 1466, 1598,  813, 1652, 1200, 1188, 1292, 1121, 1499],
      dtype=int64)

In [14]:
# Read the movie list; each line is "<id> <title>", and only the title is kept.
movies = []
with open('data/movie_ids.txt', 'r', encoding='latin 1') as f:
    # partition(' ')[2] is everything after the first space (empty if there is none).
    movies.extend(line.strip().partition(' ')[2] for line in f)

In [15]:
# Print the ten highest-scoring movies: index, title, predicted rating.
# Iterating the slice directly avoids repeated index[i] lookups and does not
# raise IndexError when fewer than ten entries exist.
for movie_idx in index[:10]:
    print(movie_idx, movies[movie_idx], y_predict[movie_idx])

1535 Aiqing wansui (1994) 5.000885909802307
1466 Saint of Fort Washington, The (1993) 5.0007512315622815
1598 Someone Else's America (1995) 5.0004823725921135
813 Great Day in Harlem, A (1994) 5.000424759721229
1652 Entertaining Angels: The Dorothy Day Story (1996) 5.000331488391549
1200 Marlene Dietrich: Shadow and Light (1996) 5.000111681472238
1188 Prefontaine (1997) 5.000094411965996
1292 Star Kid (1997) 4.999997635636663
1121 They Made Me a Criminal (1939) 4.999890555928851
1499 Santa with Muscles (1996) 4.999646626028243
