In [1]:
import os
os.sys.path.append(os.path.dirname(os.path.abspath('.')))

## 数据准备

In [2]:
from dataset.dataset import load_ml

batch_size = 512
train_data, test_data = load_ml(batch_size)

(800167, 7) (800167,)
(200042, 7) (200042,)


## 模型概述

In [3]:
import numpy as np

# 用户部分
u_id_size = len(np.unique(train_data.u_id))
u_id_emb_size = 32
u_occu_size = len(np.unique(train_data.u_occu))
u_occu_emb_size = 8
u_agegen_size = len(np.unique(train_data.u_age_gender))
u_agegen_emb_size = 8

# 电影部分
m_id_size = len(np.unique(train_data.m_id))
m_id_emb_size = 32
m_voc_size = 4308
m_tit_emb_size = 16
m_gen_size = 19
m_gen_emb_size = 8
m_year_size = 81

m_tit_size = len(train_data.m_title[0])
m_gens_size = len(train_data.m_genres[0])

# 文本卷积网络参数
n_txt_cnn_kernels = (8, 4, 2)
text_cnn_ksize = (1, 2, 3)

# FC层
unit_fc1 = 1    # 电影年份是一个单独标量，同样经过一层FC
unit_fc2 = 16    # 各embedding特征经过的FC层
unit_fc3 = 128    # 单端特征的联合FC
unit_O = 1    # 输出一个分数

## 搭建模型
首先是输入：

In [4]:
import tensorflow as tf

u_id = tf.placeholder(tf.int32, [None], name='u_id')
u_agegen = tf.placeholder(tf.int32, [None], name='u_agegen')
u_occu = tf.placeholder(tf.int32, [None], name='u_occu')

m_id = tf.placeholder(tf.int32, [None], name='m_id')
m_tit = tf.placeholder(tf.int32, [None, m_tit_size], name='m_tit')
m_gen = tf.placeholder(tf.int32, [None, m_gens_size], name='n_gen')
m_year = tf.placeholder(tf.int32, [None], name='m_year')

Y = tf.placeholder(tf.float32, [None, 1], name='rating')

is_training = tf.placeholder(tf.bool)    # 训练标识位

  from ._conv import register_converters as _register_converters


然后是嵌入部分：

In [5]:
with tf.variable_scope('user_embedding', initializer=tf.random_uniform_initializer(-1.0, 1.0)):
    # u_id嵌入
    uid_emb_lookup = tf.get_variable('uid_embedding', [u_id_size, u_id_emb_size],
                                     dtype=tf.float32)
    uid_emb = tf.nn.embedding_lookup(uid_emb_lookup, u_id)

    # u_occu嵌入
    uoccu_emb_lookup = tf.get_variable('uoccu_embedding', [u_occu_size, u_occu_emb_size],
                                       dtype=tf.float32)
    uoccu_emb = tf.nn.embedding_lookup(uoccu_emb_lookup, u_occu)

    # u_age_gender嵌入
    uagegen_emb = tf.get_variable('uagegen_embedding', [u_agegen_size, u_agegen_emb_size],
                                  dtype=tf.float32)
    uagegen_emb = tf.nn.embedding_lookup(uagegen_emb, u_agegen)

with tf.variable_scope('movie_embedding', initializer=tf.random_uniform_initializer(-1.0, 1.0)):
    mid_emb_lookup = tf.get_variable('mid_embedding', [m_id_size, m_id_emb_size],
                                     dtype=tf.float32)
    mid_emb = tf.nn.embedding_lookup(mid_emb_lookup, m_id)

    mtit_emb_lookup = tf.get_variable('mtit_embedding', [m_voc_size, m_tit_emb_size],
                                      dtype=tf.float32)
    mtit_emb = tf.nn.embedding_lookup(mtit_emb_lookup, m_tit)

    mgen_emb_lookup = tf.get_variable('mgen_embedding', [m_gen_size, m_gen_emb_size],
                                      dtype=tf.float32)
    mgen_emb = tf.nn.embedding_lookup(mgen_emb_lookup, m_gen)
    mgen_emb = tf.reduce_mean(mgen_emb, axis=1)    # 查找得到的多重emb做平均

Instructions for updating:
Colocations handled automatically by placer.


对电影标题应用TextCNN：

In [6]:
with tf.name_scope('TextCNN'):
    # 在最后增加一维，扩成四维向量(batch_size,tit_len,m_tit_emb_size,1)
    mtit_emb_exp = tf.expand_dims(mtit_emb, -1)

    layers = list()
    for i in range(len(n_txt_cnn_kernels)):
        conv = tf.layers.conv2d(mtit_emb_exp, filters=n_txt_cnn_kernels[i],
                                kernel_size=(text_cnn_ksize[i],
                                             m_tit_emb_size),
                                padding='same', activation=tf.nn.relu)
        pool = tf.layers.max_pooling2d(conv, pool_size=(2, 1),
                                       strides=(1, 1))
        layers.append(pool)

    tit_pool = tf.concat(layers, axis=3)
    tit_dropout = tf.layers.dropout(tf.layers.flatten(tit_pool),
                                    rate=0.2, training=is_training)

Instructions for updating:
Use keras.layers.conv2d instead.
Instructions for updating:
Use keras.layers.max_pooling2d instead.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use keras.layers.dropout instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


接下来是全连接层，同样分为user与movie两部分：

In [7]:
with tf.name_scope('user_fc'):
    uid_fc = tf.layers.dense(uid_emb, unit_fc2, activation=tf.nn.relu,
                             name='uid_fc')
    uoccu_fc = tf.layers.dense(uoccu_emb, unit_fc2, activation=tf.nn.relu,
                               name='uoccu_fc')
    uagegen_fc = tf.layers.dense(uagegen_emb, unit_fc2, activation=tf.nn.relu,
                                 name='uagegen_fc')

    user_fc = tf.concat([uid_fc, uoccu_fc, uagegen_fc], axis=1)
    user_fc = tf.layers.dense(user_fc, unit_fc3, activation=tf.nn.relu)
    user_fc = tf.layers.dropout(user_fc, rate=0.3, training=is_training,
                                name='user_fc')

with tf.name_scope('movie_fc'):
    mid_fc = tf.layers.dense(mid_emb, unit_fc2, activation=tf.nn.relu)
    mgen_fc = tf.layers.dense(mgen_emb, unit_fc2, activation=tf.nn.relu)
    mtit_fc = tf.layers.dense(tit_dropout, unit_fc2, activation=tf.nn.relu)
    myear_fc = tf.layers.dense(tf.reshape(m_year, [-1, 1]), unit_fc1,
                               activation=tf.nn.relu) 
    myear_fc=tf.cast(myear_fc,dtype=tf.float32)    # 为了concat转换类型

    movie_fc = tf.concat([mid_fc,mgen_fc,mtit_fc,myear_fc],axis=1)
    movie_fc=tf.layers.dense(movie_fc,unit_fc3,activation=tf.nn.relu)
    movie_fc=tf.layers.dropout(movie_fc,rate=0.3,training=is_training)

Instructions for updating:
Use keras.layers.dense instead.


最后后就是输出了：

In [8]:
logits = tf.expand_dims(tf.reduce_sum(user_fc * movie_fc,
                                      axis=1), axis=1)    # 输出分数，把向量扩成矩阵

In [9]:
with tf.name_scope('Eval'):
    loss = tf.losses.mean_squared_error(labels=Y, predictions=logits)

with tf.name_scope('train_op'):
    lr = 1e-3
    train_op = tf.train.AdamOptimizer(lr).minimize(loss)

init = tf.global_variables_initializer()
config = tf.ConfigProto()
config.gpu_options.allow_growth = True    # 按需使用显存

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.


## 训练网络
在送入数据的时候有一点麻烦，主要是数据特征存在多重嵌套。需要把对应的列提取出来然后转化成```ndarray```送入模型。

In [10]:
def gen_feed(x_batch, y_batch):
    '''
    构造feed_dict
    '''
    feed_dict = dict()
    feed_dict[u_id] = x_batch[:, 0]
    feed_dict[u_agegen] = np.asarray(x_batch[:, 1], dtype=np.int32)
    feed_dict[u_occu] = np.asarray(x_batch[:, 2], dtype=np.int32)
    feed_dict[m_id] = np.asarray(x_batch[:, 3], dtype=np.int32)

    mtit_feed = np.zeros((len(x_batch), m_tit_size))
    for i in range(len(x_batch)):
        mtit_feed[i] = x_batch[i, 4]
    feed_dict[m_tit] = np.asarray(mtit_feed, dtype=np.int32)

    mgen_feed = np.zeros((len(x_batch), m_gens_size))
    for i in range(len(x_batch)):
        mgen_feed[i] = x_batch[i, 5]
    feed_dict[m_gen] = np.asarray(mgen_feed, dtype=np.int32)

    feed_dict[m_year] = x_batch[:, 6]

    feed_dict[Y] = y_batch.reshape((-1,1))
    
    return feed_dict

In [12]:
import numpy as np

with tf.Session(config=config) as sess:
    sess.run(init)
    epochs = 20

    batch_cnt = 0
    for epoch in range(epochs):
        for batch_data, batch_labels in train_data.next_batch():
            batch_cnt += 1
            feed_dict = gen_feed(batch_data, batch_labels)
            feed_dict[is_training]=True
            loss_val, acc_val = sess.run(
                [loss, train_op], feed_dict=feed_dict)

            # 每1000batch输出一次信息
            if (batch_cnt+1) % 1000 == 0:
                print('epoch: {}, batch_loss: {}'.format(
                    epoch, loss_val))

            # 每5000batch做一次验证
            if (batch_cnt+1) % 5000 == 0:
                all_test_acc_val = list()
                for test_batch_data, test_batch_labels in test_data.next_batch():
                    feed_dict = gen_feed(test_batch_data, test_batch_labels)
                    feed_dict[is_training]=False
                    test_acc_val = sess.run([loss], feed_dict=feed_dict)
                    all_test_acc_val.append(test_acc_val)
                test_acc = np.mean(all_test_acc_val)
                print('epoch: {}, test_MSE: {}'.format(epoch, test_acc))

epoch: 0, batch_loss: 1.12532639503479
epoch: 1, batch_loss: 0.9347590208053589
epoch: 1, batch_loss: 1.0483558177947998
epoch: 2, batch_loss: 1.0349704027175903
epoch: 3, batch_loss: 0.9728089570999146
epoch: 3, test_MSE: 0.8450213074684143
epoch: 3, batch_loss: 1.0153937339782715
epoch: 4, batch_loss: 0.8476366996765137
epoch: 5, batch_loss: 0.949594259262085
epoch: 5, batch_loss: 0.9386029243469238
epoch: 6, batch_loss: 1.0169233083724976
epoch: 6, test_MSE: 0.8372696042060852
epoch: 7, batch_loss: 0.8411692380905151
epoch: 7, batch_loss: 0.8545806407928467
epoch: 8, batch_loss: 1.0171763896942139
epoch: 8, batch_loss: 0.9544245600700378
epoch: 9, batch_loss: 0.9274780750274658
epoch: 9, test_MSE: 0.8183063268661499
epoch: 10, batch_loss: 1.064489722251892
epoch: 10, batch_loss: 0.9292706251144409
epoch: 11, batch_loss: 0.9608153104782104
epoch: 12, batch_loss: 0.9874977469444275
epoch: 12, batch_loss: 1.015494704246521
epoch: 12, test_MSE: 0.8124629855155945
epoch: 13, batch_loss: 