# 推荐系统
## 案例：给用户推荐电影
### 数据集：ex8_movies.mat ex8_movieParams.mat

In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import scipy.io as sio

In [2]:
mat = sio.loadmat('ex8_movies.mat')
mat.keys()

dict_keys(['__header__', '__version__', '__globals__', 'Y', 'R'])

In [3]:
Y,R = mat['Y'],mat['R']
Y.shape,R.shape

((1682, 943), (1682, 943))

In [4]:
pmat = sio.loadmat('ex8_movieParams.mat')
pmat.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'Theta', 'num_users', 'num_movies', 'num_features'])

In [5]:
X,Theta,nu,nm,nf = pmat['X'],pmat['Theta'],pmat['num_users'],pmat['num_movies'],pmat['num_features']
X.shape,Theta.shape,nu,nm,nf

((1682, 10),
 (943, 10),
 array([[943]], dtype=uint16),
 array([[1682]], dtype=uint16),
 array([[10]], dtype=uint8))

In [6]:
nu = int(nu)
nm = int(nm)
nf = int(nf)
nu,nm,nf

(943, 1682, 10)

### 1.序列化参数

In [7]:
def serialize(X,Theta):
    return np.append(X.flatten(),Theta.flatten())

### 2.解序列化参数

In [8]:
def deserialize(params,nu,nm,nf):
    X = params[:nm*nf].reshape(nm,nf)
    Theta = params[nm*nf:].reshape(nu,nf)
    return X,Theta

### 3.代价函数

In [9]:
def costFunction(params,Y,R,nm,nu,nf,lamda):
    X,Theta = deserialize(params,nu,nm,nf)
    error = 0.5 * np.square((X @ Theta.T - Y) * R).sum()
    reg1 = 0.5 * lamda * np.square(X).sum()
    reg2 = 0.5 * lamda * np.square(Theta).sum()
    
    return error+reg1+reg2

### 4.梯度

In [10]:
def costGradient(params,Y,R,nm,nu,nf,lamda):
    X,Theta = deserialize(params,nu,nm,nf)
    X_grad = ((X @ Theta.T - Y) * R) @ Theta + lamda * X
    Theta_grad = ((X @ Theta.T - Y) * R).T @ X + lamda * Theta
    return serialize(X_grad,Theta_grad)

### 5.添加一个新用户

In [11]:
my_ratings = np.zeros((nm,1))
my_ratings[9] = 5
my_ratings[66] = 5
my_ratings[96] = 5
my_ratings[121] = 4
my_ratings[148] = 4
my_ratings[285] = 3
my_ratings[490] = 4
my_ratings[599] = 4
my_ratings[643] = 4
my_ratings[958] = 5
my_ratings[1117] = 3

In [12]:
Y = np.c_[Y,my_ratings]
R = np.c_[R,my_ratings != 0]

In [13]:
Y.shape

(1682, 944)

In [14]:
nm,nu = Y.shape

### 6.均值归一化

In [15]:
def normalizeRatings(Y,R):
    Y_mean = (Y.sum(axis=1) / R.sum(axis=1)).reshape(-1,1)
    Y_norm = (Y - Y_mean) * R
    return Y_norm,Y_mean

In [16]:
Y_norm,Y_mean = normalizeRatings(Y,R)

### 7.参数初始化

In [17]:
X = np.random.random((nm,nf))
Theta = np.random.random((nu,nf))
params = serialize(X,Theta)
lamda = 5

### 8.模型训练

In [18]:
from scipy.optimize import minimize
res = minimize(fun=costFunction,
               x0 = params,
               args = (Y_norm,R,nm,nu,nf,lamda),
               method = 'TNC',
               jac = costGradient,
               options = {'maxiter':100})

In [19]:
params_fit = res.x

In [22]:
fit_X,fit_Theta = deserialize(params_fit,nu,nm,nf)

### 9.预测

In [23]:
Y_pred = fit_X @ fit_Theta.T
Y_pred.shape

(1682, 944)

In [24]:
y_pred = Y_pred[:,-1] + Y_mean.flatten()

In [25]:
index = np.argsort(-y_pred)
index[:10]

array([1466, 1652, 1598, 1499, 1535,  813, 1200, 1121, 1188, 1292],
      dtype=int64)

In [27]:
movies = []
with open('movie_ids.txt','r',encoding='latin1') as f:
    for line in f:
        tokens = line.strip().split(' ')
        movies.append(''.join(tokens[1:]))

In [28]:
len(movies)

1682

In [29]:
for i in range(10):
    print(index[i],movies[index[i]],y_pred[index[i]])

1466 SaintofFortWashington,The(1993) 5.0471633565295955
1652 EntertainingAngels:TheDorothyDayStory(1996) 5.03925961594295
1598 SomeoneElse'sAmerica(1995) 5.036708264024702
1499 SantawithMuscles(1996) 5.029392894144675
1535 Aiqingwansui(1994) 5.024259333506892
813 GreatDayinHarlem,A(1994) 5.023898868028925
1200 MarleneDietrich:ShadowandLight(1996) 5.016495085105815
1121 TheyMadeMeaCriminal(1939) 5.016141947312591
1188 Prefontaine(1997) 5.005998500784098
1292 StarKid(1997) 4.990412235173138
