In [1]:
# Import necessary package.
import tensorflow as tf
import numpy as np  
import matplotlib.pyplot as plt
import pandas as pd

### Data Preparation.

In [2]:
# Read data from csv file.
anime_data = pd.read_csv('data/anime.csv')
rating_data = pd.read_csv('data/rating.csv')

In [3]:
# Convert data into UxM-Matrix, with unknown ratings as zeros.
rating_data = rating_data.drop_duplicates(subset=['user_id', 'anime_id'], keep='first') # Delete the duplicate with error.
rating_matrix = rating_data.reset_index().pivot(index='user_id', columns='anime_id', values='rating')

In [4]:
# Fill Na with -1.
rating_matrix = rating_matrix.fillna(-1)

In [5]:
# The data shape.
rating_matrix.shape

(73515, 11200)

In [6]:
# Examples of rating matrix.
rating_matrix[0:3]

anime_id,1,5,6,7,8,15,16,17,18,19,...,34283,34324,34325,34349,34358,34367,34412,34475,34476,34519
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [87]:
# Make sure that there is no row with all -1.
sum(rating_matrix.apply(lambda x: x.sum(), axis=1)==-11200)

3915

In [7]:
# Split data into train and validation data.
validation_matrix = rating_matrix[0:10000]
train_matrix = rating_matrix[10000:]

### Network Structure.

In [8]:
# Define global variables.
FLAGS = tf.app.flags.FLAGS

# Set global variables.
tf.app.flags.DEFINE_integer('num_visible', 11200,
                            'Number of visible neurons (Number of movies the users rated.)')
tf.app.flags.DEFINE_integer('num_hidden_1', 2280,
                            'Number of hidden layer_1 neurons.)')
tf.app.flags.DEFINE_integer('num_hidden_2', 810,
                            'Number of hidden layer_2 neurons.)')
tf.app.flags.DEFINE_integer('num_hidden_3', 128,
                            'Number of hidden layer_2 neurons.)')
tf.app.flags.DEFINE_boolean('l2_reg', True,
                            'L2 regularization.')
tf.app.flags.DEFINE_float('lambda_',0.01,
                          'Wight decay factor.')
tf.app.flags.DEFINE_float('learning_rate',0.0005,
                          'Learning_Rate')
tf.app.flags.DEFINE_integer('num_epoch', 300,
                            'Number of training epochs.')
tf.app.flags.DEFINE_integer('batch_size', 32,
                            'Size of the training batch.')
tf.app.flags.DEFINE_string('tf_model_train_path', 
                           'model/',
                           'Path of the saved training model.')

In [9]:
# Define the Deep AutoEncoder.
# Define weight and bias.
with tf.name_scope('weights'):
    weight_1 = tf.get_variable(name='weight_1', shape=(FLAGS.num_visible, FLAGS.num_hidden_1), 
                               initializer=tf.random_normal_initializer(mean=0.0, stddev=0.05))
    weight_2 = tf.get_variable(name='weight_2', shape=(FLAGS.num_hidden_1, FLAGS.num_hidden_2), 
                               initializer=tf.random_normal_initializer(mean=0.0, stddev=0.05))
    weight_3 = tf.get_variable(name='weight_3', shape=(FLAGS.num_hidden_2, FLAGS.num_hidden_3), 
                               initializer=tf.random_normal_initializer(mean=0.0, stddev=0.05))
    weight_4 = tf.get_variable(name='weight_4', shape=(FLAGS.num_hidden_3, FLAGS.num_hidden_2), 
                               initializer=tf.random_normal_initializer(mean=0.0, stddev=0.05))
    weight_5 = tf.get_variable(name='weight_5', shape=(FLAGS.num_hidden_2, FLAGS.num_hidden_1), 
                               initializer=tf.random_normal_initializer(mean=0.0, stddev=0.05))
    weight_6 = tf.get_variable(name='weight_6', shape=(FLAGS.num_hidden_1, FLAGS.num_visible), 
                               initializer=tf.random_normal_initializer(mean=0.0, stddev=0.05))
with tf.name_scope('biases'):
    bias_1 = tf.get_variable(name='bias_1', shape=(FLAGS.num_hidden_1), 
                             initializer=tf.zeros_initializer())
    bias_2 = tf.get_variable(name='bias_2', shape=(FLAGS.num_hidden_2), 
                             initializer=tf.zeros_initializer())
    bias_3 = tf.get_variable(name='bias_3', shape=(FLAGS.num_hidden_3), 
                             initializer=tf.zeros_initializer())
    bias_4 = tf.get_variable(name='bias_4', shape=(FLAGS.num_hidden_2), 
                             initializer=tf.zeros_initializer())
    bias_5 = tf.get_variable(name='bias_5', shape=(FLAGS.num_hidden_1), 
                             initializer=tf.zeros_initializer())

In [10]:
# Define inference of Neural Network.
ratings = tf.placeholder(tf.float32, [None, FLAGS.num_visible],
                         name='input_ratings')
with tf.name_scope('inference'):
    infer_1 = tf.nn.sigmoid(tf.nn.bias_add(tf.matmul(ratings, weight_1), bias_1))
    infer_2 = tf.nn.sigmoid(tf.nn.bias_add(tf.matmul(infer_1, weight_2), bias_2))
    infer_3 = tf.nn.sigmoid(tf.nn.bias_add(tf.matmul(infer_2, weight_3), bias_3))
    infer_4 = tf.nn.sigmoid(tf.nn.bias_add(tf.matmul(infer_3, weight_4), bias_4))
    infer_5 = tf.nn.sigmoid(tf.nn.bias_add(tf.matmul(infer_4, weight_5), bias_5))
    output = tf.matmul(infer_5, weight_6)

In [11]:
# Count the number of training labels.
mask = tf.where(tf.equal(ratings, -1.0), ratings, tf.zeros_like(ratings)) 
num_train_labels = tf.cast(tf.count_nonzero(mask), dtype=tf.float32) 
# Mask output with -1.
output = tf.where(tf.equal(ratings, -1.0), output, tf.zeros_like(output) - 1.0) 

In [12]:
# Compute the loss of neural network.
with tf.name_scope('loss'):
    MSE_loss = tf.div(tf.reduce_sum(tf.square(tf.subtract(output, ratings))), num_train_labels)
    RMSE_loss = tf.sqrt(MSE_loss)  # Set RMSE loss for evaluation.

In [13]:
# Add L2 regularation to loss.
if FLAGS.l2_reg == True:
    l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()])
    MSE_loss = MSE_loss + FLAGS.lambda_ * l2_loss

In [14]:
# Set train operation to minimize MSE loss.
train_op = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(MSE_loss)

In [15]:
# Set training number.
n_training_example = len(train_matrix)
saver = tf.train.Saver()
# Create session to run the Neural Network.
with tf.Session() as sess:
    # Initialize variables.
    init = tf.global_variables_initializer()
    sess.run(init)
    
    # Start training.
    start = 0
    end = FLAGS.batch_size
    for i in xrange(FLAGS.num_epoch):  # FLAGS.num_epoch
        # Update pointed variables but not all the variables.
        sess.run(train_op, feed_dict={
            ratings: train_matrix[start: end]
        })

        # Output log.
        if i % 20 == 0 or i + 1 == FLAGS.num_epoch:
            # saver.save(sess, FLAGS.tf_model_train_path, global_step=i)
            validation_accuracy = sess.run(RMSE_loss, feed_dict={
                ratings: validation_matrix
            })
            print('Step %d: Validation RMSE_loss = %.6f' % (i, validation_accuracy))

        # Update batch variables.
        start = end
        if start == n_training_example:
            start = 0
        end = start + FLAGS.batch_size
        if end > n_training_example:
            end = n_training_example

Step 0: Validation RMSE_loss = 1.602350
Step 20: Validation RMSE_loss = 0.899326
Step 40: Validation RMSE_loss = 0.826502
Step 60: Validation RMSE_loss = 0.800213
Step 80: Validation RMSE_loss = 0.788404
Step 100: Validation RMSE_loss = 0.784320
Step 120: Validation RMSE_loss = 0.782637
Step 140: Validation RMSE_loss = 0.781868
Step 160: Validation RMSE_loss = 0.781486
Step 180: Validation RMSE_loss = 0.781268
Step 200: Validation RMSE_loss = 0.781145
Step 220: Validation RMSE_loss = 0.781066
Step 240: Validation RMSE_loss = 0.781014
Step 260: Validation RMSE_loss = 0.780981
Step 280: Validation RMSE_loss = 0.780955
Step 299: Validation RMSE_loss = 0.780938
