# Training Deep AutoEncoders for Collaborative Filtering
https://arxiv.org/pdf/1708.01715.pdf

In [46]:
import argparse
import sys

import tensorflow as tf

import pandas as pd
import numpy as np

In [47]:
tf.__version__

'1.3.0'

## Loading Data

In [48]:
ratings = pd.read_csv('./mangaki-data-challenge-0908/watched.csv')
ratings.head()

Unnamed: 0,user_id,work_id,rating
0,717,8025,dislike
1,1106,1027,neutral
2,1970,3949,neutral
3,1685,9815,like
4,1703,3482,like


In [49]:
ratings = ratings.replace({'dislike': '1'}, regex=True)
ratings = ratings.replace({'neutral': '2'}, regex=True)
ratings = ratings.replace({'like': '3'}, regex=True)
ratings = ratings.replace({'love': '4'}, regex=True)

ratings['rating'] = ratings['rating'].astype(float)
ratings['rating'] = ratings['rating'] / 4.0

In [50]:
ratings.head()

Unnamed: 0,user_id,work_id,rating
0,717,8025,0.25
1,1106,1027,0.5
2,1970,3949,0.5
3,1685,9815,0.75
4,1703,3482,0.75


In [51]:
piv = ratings.pivot_table(index=['user_id'], columns=['work_id'], values='rating')
piv.fillna(0, inplace=True)
piv.head()

work_id,0,1,2,3,4,5,6,7,9,10,...,9885,9886,9887,9889,9890,9891,9892,9893,9894,9896
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
type(piv)

pandas.core.frame.DataFrame

In [68]:
piv.as_matrix()[2]

array([ 0.  ,  0.  ,  0.75, ...,  0.  ,  0.  ,  0.  ])

## Building Model

In [53]:
FLAGS = None

In [54]:
tf.logging.set_verbosity(tf.logging.INFO)

In [61]:
def model_fn(features, labels, mode, params):
# 1. Configure the model via TensorFlow operations
    display(features)   
    
    # Input Layer
    input_layer = features

    encoder1 = tf.layers.dense(inputs=input_layer, units=128, activation=tf.nn.selu)
    encoder2 = tf.layers.dense(inputs=encoder1, units=256, activation=tf.nn.selu)
    
    bottleneck = tf.layers.dense(inputs=encoder2, units=256)
    dropout = tf.layers.dropout(inputs=bottleneck, rate=params['dropout'], training=mode == tf.estimator.ModeKeys.TRAIN)

    decoder1 = tf.layers.dense(inputs=dropout, units=256, activation=tf.nn.selu)
    decoder2 = tf.layers.dense(inputs=decoder1, units=128, activation=tf.nn.selu)
    print('shape', input_layer.get_shape)
    output_layer = tf.layers.dense(inputs=decoder2, units=input_layer.get_shape[0], activation=None)
    
    predictions = output_layer
    predictions_dict = {"y": predictions}
    

# 2. Define the loss function for training/evaluation
    # Masked Mean Square Error 
    #mask = tf.where(input_layer != 0, 1, 0, name='loss_mask')
    #loss = tf.divide(tf.multiply(mask, tf.squared_difference(input_layer, predictions)), tf.sum(mask))
    
    weight = tf.cast(tf.greater(input_layer, 0), tf.float32)
    loss  = tf.contrib.losses.mean_squared_error(input_layer, predictions, weight)
    
# TODO Dense re-feeding
    
    eval_metric_ops = {
        "rmse": tf.sqrt(loss)
    }
# 3. Define the training operation/optimizer
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=params["learning_rate"])
    train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())


# 4. Generate predictions
# 5. Return predictions/loss/train_op/eval_metric_ops in EstimatorSpec object
    return EstimatorSpec(mode, predictions, loss, train_op, eval_metric_ops)

In [62]:
def main(unused_argv):
    # Load datasets
    global piv
    
    # Set model params
    model_params = {
        "learning_rate": FLAGS.learning_rate,
        "dropout": FLAGS.dropout
                   }

    # Instantiate Estimator
    nn = tf.estimator.Estimator(model_fn=model_fn, params=model_params)

    train_input_fn = tf.estimator.inputs.pandas_input_fn(
        piv,
        batch_size=128,
        num_epochs=None,
        shuffle=True,
        queue_capacity=1000,
        num_threads=1,
    )

    # Train
    nn.train(input_fn=train_input_fn, steps=5000)

    # Score accuracy
   # test_input_fn = tf.estimator.inputs.numpy_input_fn(
   #     x=test_set.data, # TODO test data
   #     num_epochs=1,
   #     shuffle=False)

    ev = nn.evaluate(input_fn=test_input_fn)
    print("Loss: %s" % ev["loss"])
    print("Root Mean Squared Error: %s" % ev["rmse"])
    
    
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.register("type", "bool", lambda v: v.lower() == "true")
    parser.add_argument("--train_data", type=str, default="", help="Path to the training data.")
    parser.add_argument("--test_data", type=str, default="", help="Path to the test data.")
    parser.add_argument("--predict_data", type=str, default="", help="Path to the prediction data.")
    
    parser.add_argument("--dropout", type=float, default=0.65, help="Dropout after the bottleneck layer.")
    parser.add_argument("--learning_rate", type=float, default=0.005, help="Learning rate.")
    
    FLAGS, unparsed = parser.parse_known_args()
    
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_summary_steps': 100, '_log_step_count_steps': 100, '_model_dir': '/tmp/tmpyvptnajp', '_save_checkpoints_steps': None, '_tf_random_seed': 1, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_session_config': None, '_save_checkpoints_secs': 600}


{0: <tf.Tensor 'random_shuffle_queue_DequeueMany:1' shape=(128,) dtype=float64>,
 1: <tf.Tensor 'random_shuffle_queue_DequeueMany:2' shape=(128,) dtype=float64>,
 2: <tf.Tensor 'random_shuffle_queue_DequeueMany:3' shape=(128,) dtype=float64>,
 3: <tf.Tensor 'random_shuffle_queue_DequeueMany:4' shape=(128,) dtype=float64>,
 4: <tf.Tensor 'random_shuffle_queue_DequeueMany:5' shape=(128,) dtype=float64>,
 5: <tf.Tensor 'random_shuffle_queue_DequeueMany:6' shape=(128,) dtype=float64>,
 6: <tf.Tensor 'random_shuffle_queue_DequeueMany:7' shape=(128,) dtype=float64>,
 7: <tf.Tensor 'random_shuffle_queue_DequeueMany:8' shape=(128,) dtype=float64>,
 9: <tf.Tensor 'random_shuffle_queue_DequeueMany:9' shape=(128,) dtype=float64>,
 10: <tf.Tensor 'random_shuffle_queue_DequeueMany:10' shape=(128,) dtype=float64>,
 11: <tf.Tensor 'random_shuffle_queue_DequeueMany:11' shape=(128,) dtype=float64>,
 12: <tf.Tensor 'random_shuffle_queue_DequeueMany:12' shape=(128,) dtype=float64>,
 16: <tf.Tensor 'rando

AttributeError: 'dict' object has no attribute 'dtype'