# 第一步：收集数据
https://grouplens.org/datasets/movielens/

# 第二步：准备数据

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
ratings_df = pd.read_csv('ml-latest-small/ratings.csv')
movies_df = pd.read_csv('ml-latest-small/movies.csv')

In [3]:
ratings_df.tail()

Unnamed: 0,userId,movieId,rating,timestamp
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663
100003,671,6565,3.5,1074784724


In [4]:
movies_df.tail()

Unnamed: 0,movieId,title,genres
9120,162672,Mohenjo Daro (2016),Adventure|Drama|Romance
9121,163056,Shin Godzilla (2016),Action|Adventure|Fantasy|Sci-Fi
9122,163949,The Beatles: Eight Days a Week - The Touring Y...,Documentary
9123,164977,The Gay Desperado (1936),Comedy
9124,164979,"Women of '69, Unboxed",Documentary


In [5]:
movies_df['movieRow'] = movies_df.index

In [6]:
movies_df.tail()

Unnamed: 0,movieId,title,genres,movieRow
9120,162672,Mohenjo Daro (2016),Adventure|Drama|Romance,9120
9121,163056,Shin Godzilla (2016),Action|Adventure|Fantasy|Sci-Fi,9121
9122,163949,The Beatles: Eight Days a Week - The Touring Y...,Documentary,9122
9123,164977,The Gay Desperado (1936),Comedy,9123
9124,164979,"Women of '69, Unboxed",Documentary,9124


## 筛选 movies_df 中的特征

In [7]:
movies_df = movies_df[['movieRow', 'movieId', 'title']]
movies_df.to_csv('moviesProcessed.csv', index=False, header=True, encoding='utf-8')

In [8]:
movies_df.tail()

Unnamed: 0,movieRow,movieId,title
9120,9120,162672,Mohenjo Daro (2016)
9121,9121,163056,Shin Godzilla (2016)
9122,9122,163949,The Beatles: Eight Days a Week - The Touring Y...
9123,9123,164977,The Gay Desperado (1936)
9124,9124,164979,"Women of '69, Unboxed"


# 将 ratings_df 中的 movieId 替换为行号

In [9]:
ratings_df = pd.merge(ratings_df, movies_df, on='movieId')

In [10]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,movieRow,title
0,1,31,2.5,1260759144,30,Dangerous Minds (1995)
1,7,31,3.0,851868750,30,Dangerous Minds (1995)
2,31,31,4.0,1273541953,30,Dangerous Minds (1995)
3,32,31,4.0,834828440,30,Dangerous Minds (1995)
4,36,31,3.0,847057202,30,Dangerous Minds (1995)


In [11]:
ratings_df = ratings_df[['userId', 'movieRow', 'rating']]
ratings_df.to_csv('ratingsProcessed.csv', index = False, header=True, encoding='utf-8')

In [12]:
ratings_df.head()

Unnamed: 0,userId,movieRow,rating
0,1,30,2.5
1,7,30,3.0
2,31,30,4.0
3,32,30,4.0
4,36,30,3.0


## 创建电影评分矩阵 rating 和 评分记录矩阵 record

In [13]:
userNo = ratings_df['userId'].max()+1
movieNo = ratings_df['movieRow'].max()+1

In [14]:
userNo

672

In [15]:
movieNo

9123

In [16]:
rating = np.zeros((movieNo, userNo))

flag = 0
ratings_df_length = np.shape(ratings_df)[0]

for index, row in ratings_df.iterrows():
    rating[int(row['movieRow']), int(row['userId'])] = row['rating']
    flag += 1
    if flag % 5000 == 0:
        print('processed %d, %d left' % (flag, ratings_df_length-flag))

processed 5000, 95004 left
processed 10000, 90004 left
processed 15000, 85004 left
processed 20000, 80004 left
processed 25000, 75004 left
processed 30000, 70004 left
processed 35000, 65004 left
processed 40000, 60004 left
processed 45000, 55004 left
processed 50000, 50004 left
processed 55000, 45004 left
processed 60000, 40004 left
processed 65000, 35004 left
processed 70000, 30004 left
processed 75000, 25004 left
processed 80000, 20004 left
processed 85000, 15004 left
processed 90000, 10004 left
processed 95000, 5004 left
processed 100000, 4 left


In [17]:
record = rating>0

In [18]:
record

array([[False, False, False, ..., False,  True,  True],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [19]:
record = np.array(record, dtype=int)

In [20]:
record

array([[0, 0, 0, ..., 0, 1, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# 第三步：构建模型

In [21]:
def normalizeRatings(rating, record):
    m, n = rating.shape
    rating_mean = np.zeros((m, 1))
    rating_norm = np.zeros((m, n))
    for i in range(m):
        idx = record[i, :] !=0
        rating_mean[i] = np.mean(rating[i, idx])
        rating_norm[i, idx] -= rating_mean[i]
    return rating_norm, rating_mean

In [22]:
rating_norm, rating_mean = normalizeRatings(rating, record)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [23]:
rating_norm = np.nan_to_num(rating_norm)

In [24]:
rating_norm

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
        -3.87246964, -3.87246964],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [25]:
rating_mean = np.nan_to_num(rating_mean)

In [26]:
rating_mean

array([[3.87246964],
       [3.40186916],
       [3.16101695],
       ...,
       [3.        ],
       [0.        ],
       [5.        ]])

In [27]:
num_features = 10
X_parameters = tf.Variable(tf.random_normal([movieNo, num_features], stddev=0.35))
Theta_paramters = tf.Variable(tf.random_normal([userNo, num_features], stddev=0.35))
loss = 1/2 * tf.reduce_sum(((tf.matmul(X_parameters, Theta_paramters, transpose_b=True) - rating_norm)*record)**2) + \
    1/2 * (tf.reduce_sum(X_parameters**2) + tf.reduce_sum(Theta_paramters**2))
optimizer = tf.train.AdamOptimizer()
train = optimizer.minimize(loss)

# 第四步：训练模型

In [28]:
tf.summary.scalar('loss', loss)
summaryMerged = tf.summary.merge_all()
filename = './movie_tensorboard'
writer = tf.summary.FileWriter(filename)

In [29]:
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

In [30]:
penalty = movieNo*userNo

for i in range(3000):
    l, _, movie_summary = sess.run([loss, train, summaryMerged])
    if i%100 == 0:
        Current_X_parameters, Current_Theta_parameters = sess.run([X_parameters, Theta_paramters])
        predicts = np.dot(Current_X_parameters,Current_Theta_parameters.T) + rating_mean
        errors = np.mean((predicts - rating)**2)
        print('step:', i, ' train loss:%.5f' % (l/penalty), ' test loss:%.5f' % errors)
    writer.add_summary(movie_summary, i)

step: 0  train loss:0.10730  test loss:11.49665
step: 100  train loss:0.10049  test loss:11.35875
step: 200  train loss:0.07719  test loss:10.06480
step: 300  train loss:0.04130  test loss:7.07488
step: 400  train loss:0.01679  test loss:4.21624
step: 500  train loss:0.00734  test loss:2.57023
step: 600  train loss:0.00445  test loss:1.79445
step: 700  train loss:0.00351  test loss:1.41947
step: 800  train loss:0.00312  test loss:1.21572
step: 900  train loss:0.00290  test loss:1.09030
step: 1000  train loss:0.00275  test loss:1.00487
step: 1100  train loss:0.00262  test loss:0.94221
step: 1200  train loss:0.00252  test loss:0.89379
step: 1300  train loss:0.00242  test loss:0.85495
step: 1400  train loss:0.00234  test loss:0.82294
step: 1500  train loss:0.00226  test loss:0.79602
step: 1600  train loss:0.00219  test loss:0.77308
step: 1700  train loss:0.00213  test loss:0.75333
step: 1800  train loss:0.00207  test loss:0.73625
step: 1900  train loss:0.00201  test loss:0.72144
step: 200

训练完成之后，打开 cmd，切换到 tensorboard 保存数据的目录下，运行 tensorboard --logdir=./，然后在浏览器中输入 127.0.0.1:6006 即可看到图形化的训练结果。

# 第五步：评估模型

In [31]:
Current_X_parameters, Current_Theta_parameters = sess.run([X_parameters, Theta_paramters])
predicts = np.dot(Current_X_parameters,Current_Theta_parameters.T) + rating_mean
errors = np.mean((predicts - rating)**2)

In [32]:
errors

0.6500955952713876

# 第六步：构建完整的电影推荐系统

In [34]:
user_id = input('您要向哪位用户进行推荐？请输入用户编号：')

sortedResult = predicts[:, int(user_id)].argsort()[::-1]

idx = 0
print('为该用户推荐的评分最高的20部电影是：'.center(80, '='))
for i in sortedResult:
    print('评分：%.2f, 电影名：%s' % (predicts[i, int(user_id)], movies_df.iloc[i]['title']))
    idx += 1
    if idx == 20: break

您要向哪位用户进行推荐？请输入用户编号：666
评分：2.89, 电影名：The Biggest Fan (2002)
评分：2.88, 电影名：Dorian Blues (2004)
评分：2.87, 电影名：Latter Days (2003)
评分：2.87, 电影名：Long-Term Relationship (2006)
评分：2.87, 电影名：Three of Hearts (1993)
评分：2.87, 电影名：Back Soon (2007)
评分：2.86, 电影名：Zerophilia (2005)
评分：2.86, 电影名：Big Eden (2000)
评分：2.86, 电影名：FAQs (2005)
评分：2.86, 电影名：Curiosity of Chance, The (2006)
评分：2.86, 电影名：Mambo Italiano (2003)
评分：2.86, 电影名：On the Edge (2001)
评分：2.86, 电影名：Touch of Pink (2004)
评分：2.86, 电影名：Dancing in September (2000)
评分：2.86, 电影名：Almost Normal (2005)
评分：2.85, 电影名：I Think I Do (1997)
评分：2.85, 电影名：Clockstoppers (2002)
评分：2.85, 电影名：Trip, The (2002)
评分：2.85, 电影名：Shining Through (1992)
评分：2.85, 电影名：Shelter (2007)
