In [1]:
import tensorflow as tf
import numpy as np
import dataset
import pickle
from dataset import DataSet
import os
from collections import Counter
import pandas as pd
from bayes_opt import BayesianOptimization

Parameters to optimize:
- Number of hidden Neurons
- Init stdev
- Learning rate
- optimizer
- rnn_type
- batch_size
- embedding_size
- dropout
- l2
- layer_norm

In [2]:
name_model_base = 'BO_Attention_RNN_Adam_'
num_model = 0

In [3]:
# Some auxilar functions
def _seq_length(sequence):
    used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2))
    length = tf.reduce_sum(used, reduction_indices=1)
    length = tf.cast(length, tf.int32)
    return length

def _last_relevant(output, length):
    batch_size = tf.shape(output)[0]
    max_length = tf.shape(output)[1]
    out_size = int(output.get_shape()[2])
    index = tf.range(0, batch_size) * max_length + (length - 1)
    flat = tf.reshape(output, [-1, out_size])
    relevant = tf.gather(flat, index)

    return relevant

In [4]:
def create_model(parameters):
    
    print(parameters)
    
    tf.reset_default_graph()
    # Define placeholders
    x = tf.placeholder("float", [None, parameters['seq_length'], parameters['n_input']], name='x')
    y = tf.placeholder("float", [None, parameters['n_output']], name='y')
    dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

    # Define weights and bias - For now we will try with attention to hidden state 
    weights = {
        'alphas': tf.Variable(tf.random_normal([parameters['n_hidden'], 1], stddev=parameters['init_stdev']), name='w_alphas'),
        'out': tf.Variable(tf.random_normal([parameters['embedding_size'], parameters['n_output']], stddev=parameters['init_stdev']), name='w_out'),
        'emb': tf.Variable(tf.random_normal([parameters['n_input'], parameters['embedding_size']], stddev=parameters['init_stdev']), name='w_emb')
    }

    biases = {
        'out': tf.Variable(tf.random_normal([parameters['n_output']]), name='b_out'),
        'alphas': tf.Variable(tf.random_normal([1]), name='b_alphas'),
        'emb': tf.Variable(tf.random_normal([parameters['embedding_size']]), name='b_emb')
    }

    # Compute embeddings
    x_reshaped = tf.reshape(x, [-1, int(x.get_shape()[2])])
    if parameters['embedding_activation'] == 'linear':
        v = tf.matmul(x_reshaped, weights['emb'])
    elif parameters['embedding_activation'] == 'tanh':
        v = tf.tanh(tf.matmul(x_reshaped, weights['emb']) + biases['emb'])
    elif parameters['embedding_activation'] == 'sigmoid':
        v = tf.sigmoid(tf.matmul(x_reshaped, weights['emb']) + biases['emb'])
    v_reshaped = tf.reshape(v, [-1, parameters['seq_length'], parameters['embedding_size']])
    if parameters['layer_norm']:
        v_reshaped = tf.contrib.layers.layer_norm(v_reshaped)

    # Define RNN
    if parameters['rnn_type'].lower() == 'lstm':
        rnn_cell = tf.contrib.rnn.BasicLSTMCell(parameters['n_hidden'], forget_bias=1.0)
    elif parameters['rnn_type'] == 'lstm2':
        rnn_cell = tf.contrib.rnn.LSTMCell(parameters['n_hidden'])
    elif parameters['rnn_type'].lower() == 'gru':
        rnn_cell = tf.contrib.rnn.GRUCell(parameters['n_hidden'])
    elif parameters['rnn_type'] == 'rnn':
        rnn_cell = tf.contrib.rnn.BasicRNNCell(parameters['n_hidden'])
    elif parameters['rnn_type'] == 'lstm_normalized':
        rnn_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(parameters['n_hidden'])
    #Add dropout
    if parameters['dropout'] > 0:
        rnn_cell = tf.contrib.rnn.DropoutWrapper(rnn_cell, output_keep_prob=dropout_keep_prob)
    outputs, states = tf.nn.dynamic_rnn(
        rnn_cell,
        v_reshaped,
        dtype=tf.float32,
        sequence_length=_seq_length(v_reshaped)
    )


    # Define attention weihts
    outputs_reshaped = tf.reshape(outputs, [-1, int(outputs.get_shape()[2])])
    ejs = tf.matmul(outputs_reshaped, weights['alphas']) + biases['alphas'] 
    ejs_reshaped = tf.reshape(ejs, [-1, int(outputs.get_shape()[1])])
    alphas = tf.nn.softmax(ejs_reshaped, name='attention_weights') 
    reshaped_alphas = tf.reshape(alphas, [-1, 1])
    # Define context
    context = reshaped_alphas * v
    context_reshaped = tf.reshape(context, [-1, parameters['seq_length'], int(context.get_shape()[1])])
    context_reduced = tf.reduce_sum(context_reshaped, axis= 1)

    # Normalize context by number of timesteps?
    # Define logits and loss
    logits = tf.matmul(context_reduced, weights['out']) + biases['out']
    #pred_prob = tf.nn.softmax(logits, name="predictions") # SIGMOID!!!!!!!!
    pred_prob = tf.sigmoid(logits, name="predictions")
    #loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y)) SIGMOID!!!
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=y))

    # L2 regularization
    for var in tf.trainable_variables():
        if ('b_out' not in var.name) and ('b_alphas' not in var.name) and ('b_emb' not in var.name) and ('bias' not in var.name) and ('LayerNorm' not in var.name):
            print('Variable ' + var.name + ' will be regularized')
            loss += parameters['l2'] * tf.nn.l2_loss(var)


    #Define optimizer
    if parameters['optimizer'].lower() == 'sgd':
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=parameters['learning_rate']).minimize(loss)
    elif parameters['optimizer'].lower() == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate=parameters['learning_rate']).minimize(loss)
    elif parameters['optimizer'].lower() == 'adadelta':
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=parameters['learning_rate']).minimize(loss)
    elif parameters['optimizer'].lower() == 'adagrad':
        optimizer = tf.train.AdagradOptimizer(learning_rate=parameters['learning_rate']).minimize(loss)

    # Initialization
    init = tf.global_variables_initializer()

    #Add summaries
    tf.summary.scalar('loss', loss)
    # Create summaries to visualize weights
    for var in tf.trainable_variables():
        tf.summary.histogram(var.name, var)
    # Summarize all gradients
    grads = tf.gradients(loss, tf.trainable_variables())
    grads = list(zip(grads, tf.trainable_variables()))
    for grad, var in grads:
        if grad is not None:
            tf.summary.histogram(var.name + '/gradient', grad)
            
    return x,y,dropout_keep_prob,loss,init,optimizer


In [5]:
def load_dataset():
    with open("preprocessed/dataset_augmented.pickle", 'rb') as handle:
        dataset = pickle.load(handle)
    ds = DataSet(dataset)
    del dataset
    return ds

In [6]:
def train_model(name_model, parameters, x, y, dropout_keep_prob, loss, init, optimizer, ds):
    path_export_model = "protobuf_models/" + name_model + "/"
    display_train_loss = 200
    steps_periodic_checkpoint = 200
    current_epoch = 0

    # Start training
    saver_last = tf.train.Saver()
    saver_best = tf.train.Saver()
    checkpoint_dir = './checkpoints/' + name_model + '/'
    if not tf.gfile.Exists(checkpoint_dir):
        tf.gfile.MakeDirs(checkpoint_dir)
        tf.gfile.MakeDirs(checkpoint_dir + '/best_model')
        tf.gfile.MakeDirs(checkpoint_dir + '/last_model')
    best_loss = 150000000
    with tf.Session() as sess:

        # Create FileWriters for summaries
        merged = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter('tensorboard/' + name_model + '/train', sess.graph)
        val_writer = tf.summary.FileWriter('tensorboard/' + name_model + '/val', sess.graph)

        # Run the initializer
        sess.run(init)

        step = 1
        while ds.get_current_epoch('train') < parameters['num_epochs']:

            # Get next batch
            batch_x, batch_y = ds.next_batch(parameters['batch_size'])

            # Run optimization op (backprop)
            _ = sess.run(optimizer, feed_dict={x: batch_x, y: batch_y, dropout_keep_prob: (1 - parameters['dropout'])})

            # Compute train loss
            if step % display_train_loss == 0 or step == 1:
                # Calculate batch loss
                train_loss, summary = sess.run([loss, merged], feed_dict={x: batch_x, y: batch_y, dropout_keep_prob: 1})
                print("Step " + str(step) + ", Train Loss: " + str(train_loss))
                train_writer.add_summary(summary, step * parameters['batch_size'])


            # Periodic model checkpoint
            if step % steps_periodic_checkpoint == 0:
                checkpoint_dir_tmp = checkpoint_dir + '/last_model/'
                checkpoint_path = os.path.join(checkpoint_dir_tmp, 'model_last.ckpt')
                saver_last.save(sess, checkpoint_path, global_step=step*parameters['batch_size'])

            # Compute val loss and save model at the end of each epoch
            if ds.get_current_epoch('train') != current_epoch:
                current_epoch = ds.get_current_epoch('train')
                X_val, Y_val = ds.get_set('val')
                val_loss, summary = sess.run([loss, merged], feed_dict={x: batch_x, y: batch_y, dropout_keep_prob: 1})
                print("----End epoch " + str(current_epoch - 1) + ", Val Loss: " + str(val_loss))
                val_writer.add_summary(summary, step * parameters['batch_size'])

                # Check if validation loss is better
                if val_loss < best_loss:
                    best_loss = val_loss
                    checkpoint_dir_tmp = checkpoint_dir + '/best_model/'
                    checkpoint_path = os.path.join(checkpoint_dir_tmp, 'model_best.ckpt')
                    saver_best.save(sess, checkpoint_path, global_step=step*parameters['batch_size'])


                # Saved Model Builder 
                export_path = path_export_model + "epoch" + str(current_epoch - 1)
                builder = tf.saved_model.builder.SavedModelBuilder(export_path)
                builder.add_meta_graph_and_variables(
                      sess, [tf.saved_model.tag_constants.SERVING])
                builder.save()
                
                if (current_epoch > 1) and (val_loss > 2):
                    return -best_loss
                if (current_epoch > 3) and (val_loss > 0.4):
                    return -best_loss
                if (current_epoch > 6) and (val_loss > 0.2):
                    return -best_loss
                if (current_epoch > 8) and (val_loss > 0.10):
                    return -best_loss
                if (current_epoch > 10) and (val_loss > 0.08):
                    return -best_loss
                if (current_epoch > 12) and (val_loss > 0.06):
                    return -best_loss
                if (current_epoch > 14) and (val_loss > 0.04):
                    return -best_loss
                if (current_epoch > 16) and (val_loss > 0.03):
                    return -best_loss

            step += 1

        print("Optimization Finished!")
    return -best_loss
   

In [7]:
def model_evaluate(n_hidden,
                 init_stdev,
                 learning_rate,
                 optimizer,
                 rnn_type,
                 batch_size,
                  embedding_size,
                  dropout,
                  l2,
                  layer_norm):

    global num_model
    name_model = name_model_base + str(num_model)
    num_model += 1
    
    parameters = {}
    parameters['seq_length'] = 18
    parameters['n_input'] = 48
    parameters['n_output'] = 24
    parameters['n_hidden'] = int(n_hidden)
    parameters['init_stdev'] = init_stdev
    parameters['learning_rate'] = learning_rate
    parameters['optimizer'] = 'adam' #optimizer
    parameters['rnn_type'] = 'lstm2'#rnn_type
    parameters['num_epochs'] = 20
    parameters['batch_size'] = int(batch_size)
    parameters['embedding_size'] = int(embedding_size)
    parameters['embedding_activation'] = 'linear'
    parameters['dropout'] = dropout
    parameters['l2'] = l2
    parameters['layer_norm'] = False#layer_norm # Seems to work with 0.001 L2
    
    
    x,y,dropout_keep_prob,loss,init,optimizer = create_model(parameters)
    ds = load_dataset()
    best_loss = train_model(name_model, parameters, x, y, dropout_keep_prob, loss, init, optimizer, ds)

    return best_loss
    
    

In [8]:
num_iter = 30
init_points = 5


rnnBO = BayesianOptimization(model_evaluate, {'n_hidden': (25, 250),
                                              'init_stdev': (0.0001, 1),
                                              'learning_rate': (0.0001, 1),
                                              'optimizer': (0,1), # We could try to bin
                                              #'optimizer': ['adam', 'sgd', 'adadelta', 'adagrad'],
                                              'rnn_type': (0,1), # We could try to bin
                                              #'rnn_type': ['lstm', 'lstm2', 'gru', 'lstm_normalized'],
                                              'batch_size': (32,256),
                                              #'batch_size': [32, 64, 128, 256],
                                              'embedding_size': (8, 128),
                                              'dropout': (0, 0.75),
                                              'l2': (0, 0.1),
                                              'layer_norm': (0,1) # We could try to bin
                                              #'layer_norm': [False, True]
                                              
                                            })

                                                            
rnnBO.maximize(init_points=init_points, n_iter=num_iter)
print('Final Results')
print(rnnBO.res['max']['max_val'])




[31mInitialization[0m
[94m-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   batch_size |   dropout |   embedding_size |   init_stdev |        l2 |   layer_norm |   learning_rate |   n_hidden |   optimizer |   rnn_type | 
{'embedding_activation': 'linear', 'layer_norm': False, 'n_input': 48, 'learning_rate': 0.75837967036565068, 'optimizer': 'adam', 'batch_size': 54, 'dropout': 0.46036045735240427, 'init_stdev': 0.53764626449830077, 'num_epochs': 20, 'n_hidden': 179, 'seq_length': 18, 'n_output': 24, 'rnn_type': 'lstm2', 'l2': 0.079842876049038308, 'embedding_size': 45}
Variable w_alphas:0 will be regularized
Variable w_out:0 will be regularized
Variable w_emb:0 will be regularized
Variable rnn/lstm_cell/kernel:0 will be regularized
INFO:tensorflow:Summary name w_alphas:0 is illegal; using w_alphas_0 instead.
INFO:tensorflow

KeyboardInterrupt: 

In [11]:
rnnBO.res.keys()

dict_keys(['max', 'all'])

In [12]:
rnnBO.res['max']

{'max_params': {'batch_size': 32.647807387428223,
  'dropout': 0.75,
  'embedding_size': 127.95092535699872,
  'init_stdev': 0.0001,
  'l2': 0.10000000000000001,
  'layer_norm': 1.0,
  'learning_rate': 1.0,
  'n_hidden': 170.41140615571859,
  'optimizer': 0.91583900091641623,
  'rnn_type': 1.0},
 'max_val': 8.509251594543457}

In [16]:
rnnBO.res['all']['values']

[1.0844449,
 0.044632778,
 8.3452883,
 8.0845671,
 8.2561235,
 0.038058873,
 1.8988473,
 0.15590379,
 1.8718715,
 8.2232189,
 8.5092516,
 0.21960598,
 5.9320645,
 0.61612213,
 0.083908357,
 1.4520855,
 0.19132216,
 0.23011115,
 2.28809,
 6.521883,
 2.8740845,
 1.5333037,
 0.13123024,
 3.4975901,
 0.16002724,
 5.2910728,
 2.9744067,
 0.90664864,
 4.2031875,
 5.4279366]

In [18]:
rnnBO.res['all']['params'][5]

{'batch_size': 32.0,
 'dropout': 0.0,
 'embedding_size': 128.0,
 'init_stdev': 0.0001,
 'l2': 0.0,
 'layer_norm': 0.0,
 'learning_rate': 0.0001,
 'n_hidden': 168.96377943115036,
 'optimizer': 1.0,
 'rnn_type': 0.0}

In [20]:
dir(rnnBO)

['X',
 'Y',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_acqkw',
 '_observe_point',
 'bounds',
 'dim',
 'explore',
 'f',
 'gp',
 'i',
 'init',
 'init_points',
 'initialize',
 'initialize_df',
 'initialized',
 'keys',
 'maximize',
 'pbounds',
 'plog',
 'points_to_csv',
 'random_state',
 'res',
 'set_bounds',
 'space',
 'util',
 'verbose',
 'x_init',
 'y_init']

In [23]:
variable_global = 3
def test():
    global num_model
    print(num_model)
    name_model = name_model_base + str(num_model)
    print(name_model)
    num_model += 1
test()

2
BO_Attention_RNN_2
