In [105]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat

In [144]:
# Locations of the two exercise .mat files (absolute, machine-specific paths).
path_xt = r"G:\AI学习资料\MachineLearningExercise\machine-learning-ex8\ex8\ex8_movieParams.mat"
path = r"G:\AI学习资料\MachineLearningExercise\machine-learning-ex8\ex8\ex8_movies.mat"
# Load both files in the same order as before: parameters first, then ratings.
x_theta, raw_data = (loadmat(mat_file) for mat_file in (path_xt, path))

In [145]:
# Pull the four matrices out of the loaded dictionaries.
X, Theta = x_theta["X"], x_theta["Theta"]
Y, R = raw_data["Y"], raw_data["R"]
# Show each matrix's shape as a string.
[str(matrix.shape) for matrix in (X, Theta, Y, R)]

['(1682, 10)', '(943, 10)', '(1682, 943)', '(1682, 943)']

In [146]:
def ser_data(X, Theta):
    """Serialise two arrays into a single flat 1-D vector.

    Both inputs are flattened with ``ravel()`` and joined end to end:
    every element of ``X`` first, then every element of ``Theta``.
    """
    return np.concatenate((X.ravel(), Theta.ravel()))

def cofiCostFunc(X_Theta, Y, R, num_fea=10, lamda=1):
    """Regularised collaborative-filtering cost and its gradient.

    Parameters
    ----------
    X_Theta : 1-D array
        Flattened parameters: the ``Y.shape[0] * num_fea`` entries of ``X``
        followed by the ``Y.shape[1] * num_fea`` entries of ``Theta``.
    Y : 2-D array
        Ratings matrix; its two dimensions fix the row counts of ``X`` and
        ``Theta`` respectively.
    R : 2-D array, same shape as ``Y``
        Indicator matrix multiplied element-wise into the error, so entries
        where ``R`` is 0 contribute nothing to the cost or the gradient.
    num_fea : int, default 10
        Number of features (columns of both ``X`` and ``Theta``).
    lamda : float, default 1
        Regularisation parameter.

    Returns
    -------
    (cost, grad) : tuple
        ``cost`` is a scalar; ``grad`` is a 1-D array laid out like
        ``X_Theta`` (gradient w.r.t. ``X`` first, then w.r.t. ``Theta``).

    Raises
    ------
    ValueError
        If ``X_Theta`` does not hold exactly enough values to fill both
        matrices (raised by ``reshape``).
    """
    n_rows, n_cols = Y.shape
    params = np.asarray(X_Theta)
    split_at = n_rows * num_fea

    # Recover the two parameter matrices from the flat vector.
    X = params[:split_at].reshape(n_rows, num_fea)
    Theta = params[split_at:].reshape(n_cols, num_fea)

    # The prediction error is needed by the cost and by both gradients;
    # compute it once instead of once per term.
    err = X @ Theta.T - Y
    masked_err = err * R

    cost = (1 / 2) * np.sum(err**2 * R) + (lamda / 2) * (np.sum(Theta**2) +
                                                           np.sum(X**2))

    x_gra = masked_err @ Theta + lamda * X
    theta_gra = masked_err.T @ X + lamda * Theta

    # Serialise in the same order as the input vector: X part, then Theta part.
    return cost, np.concatenate((x_gra.ravel(), theta_gra.ravel()))

In [147]:
# Sanity check: evaluate the cost and gradient at the X and Theta loaded from
# ex8_movieParams.mat, using the function defaults (num_fea=10, lamda=1).
test = ser_data(X, Theta)
cofiCostFunc(test, Y, R)

(32520.682450229557,
 array([-5.21315594,  2.0591285 , -5.68148384, ..., -5.27650042,
         4.22109195,  2.11819114]))

#### parse `movie_ids.txt`

In [148]:
# Drop the first space-separated token of every line (presumably the numeric
# movie id) and keep the remainder of the line as the title.
p = r"G:\AI学习资料\MachineLearningExercise\machine-learning-ex8\ex8\movie_ids.txt"
with open(p, encoding='latin-1') as f:
    movie_list = np.array([line.strip().partition(' ')[2] for line in f])

#### reproduce my ratings

In [149]:
# My own ratings, keyed by 0-based movie index; every other movie stays at 0.
my_ratings = {
    0: 4,
    6: 3,
    11: 5,
    53: 4,
    63: 5,
    65: 3,
    68: 5,
    97: 2,
    182: 4,
    225: 5,
    354: 5,
}

ratings = np.zeros(1682)
ratings[list(my_ratings.keys())] = list(my_ratings.values())

#### prepare data

In [150]:
# Prepend `ratings` as column 0 of Y: the new user sits at index 0 and every
# original user's column index shifts up by one.
# NOTE: this cell reassigns Y from itself, so re-running it inserts another column.
Y = np.insert(Y, 0, ratings, axis=1)  # now I become user 0
Y.shape

(1682, 944)

In [151]:
# Centre the ratings by subtracting one scalar: the mean over every entry of Y.
# NOTE(review): Y.mean() covers all entries, not only those flagged in R.
# Unrated entries look like they are stored as 0 (cf. the zero-initialised
# `ratings`), so this is presumably lower than the mean of the actual
# ratings — confirm this is intended.
Y_norm = Y - Y.mean()

In [152]:
# Prepend the matching indicator column: set wherever `ratings` is non-zero.
# NOTE: this cell reassigns R from itself, so re-running it inserts another column.
R = np.insert(R, 0, ratings != 0, axis=1)
R.shape

(1682, 944)

In [153]:
n_features = 50  # number of features learned per movie and per user
n_movie, n_user = Y.shape  # rows of Y are movies, columns are users
lamda = 10  # regularisation parameter passed to cofiCostFunc

In [154]:
# Random initialisation of the movie-feature and user-parameter matrices.
# Drawing from a seeded local generator makes the starting point, and so the
# optimiser's result, repeatable across kernel restarts.
rng = np.random.RandomState(0)
X = rng.standard_normal((n_movie, n_features))
theta = rng.standard_normal((n_user, n_features))

X.shape, theta.shape

((1682, 50), (944, 50))

In [155]:
# Flatten the initial X and theta into the single 1-D vector handed to the optimiser.
param = ser_data(X, theta)

In [156]:
import scipy.optimize as opt

In [157]:
# Minimise the regularised cost on the mean-shifted ratings with the TNC method.
# jac=True tells the optimiser that cofiCostFunc returns (cost, gradient) together.
res = opt.minimize(
    cofiCostFunc,
    x0=param,
    args=(Y_norm, R, n_features, lamda),
    method='TNC',
    jac=True,
)

In [158]:
# Display the optimisation result returned by opt.minimize.
res

     fun: 64721.49781509444
     jac: array([ 6.33839733e-07, -2.68648274e-06,  3.96286533e-06, ...,
       -1.02278222e-07, -3.24327682e-07,  6.57757555e-07])
 message: 'Converged (|f_n-f_(n-1)| ~= 0)'
    nfev: 3506
     nit: 115
  status: 1
 success: True
       x: array([ 0.037558  , -0.08700081,  0.01610468, ..., -0.25265707,
        0.03572147, -0.65381112])

In [159]:
# Split the optimised flat vector back into its X part and its Theta part,
# then restore each to (rows, n_features).
X_r, Theta_r = np.split(res.x, [Y.shape[0] * n_features])
X = np.reshape(X_r, (Y.shape[0], n_features))
Theta = np.reshape(Theta_r, (Y.shape[1], n_features))

In [160]:
# Confirm the recovered matrices have the expected (rows, n_features) shapes.
X.shape, Theta.shape

((1682, 50), (944, 50))

In [161]:
# Scalar that was subtracted from Y before training; added back to the predictions.
global_mean = Y.mean()
# Predicted (mean-shifted) rating of every movie by every user.
prediction = X @ Theta.T
# Column 0 is the user inserted at index 0.
my_preds = prediction[:, 0] + global_mean
# Movie indices ordered from highest to lowest predicted rating.
idx = my_preds.argsort()[::-1]
idx.shape

(1682,)

In [162]:
# Predicted ratings of the ten highest-ranked movies
my_preds[idx[:10]]

array([4.12532998, 4.04414791, 3.99324764, 3.91902345, 3.81692185,
       3.81555642, 3.76602306, 3.76323108, 3.75904202, 3.75076388])

In [163]:
# Titles of the ten highest-ranked movies, one per line.
for title in movie_list[idx[:10]]:
    print(title)

Titanic (1997)
Star Wars (1977)
Shawshank Redemption, The (1994)
Forrest Gump (1994)
Raiders of the Lost Ark (1981)
Braveheart (1995)
Return of the Jedi (1983)
Usual Suspects, The (1995)
Godfather, The (1972)
Schindler's List (1993)
