# H0 Hyperparameter Tuning - ResConvLSTM 
#### Author: Jayant Verma
#### Cognibit Solutions LLP

Derived from https://arxiv.org/pdf/1610.03022.pdf, 
1. No conv(3x3)/2 used 
2. Added an extra dense layer of 256 units

83.8% on val set

In [1]:
import sys
import os
import tensorflow as tf

sys.path.append("../libs")
from classification import input_data
from classification import models
from classification import trainer
from classification import freeze
import hyperopt

  return f(*args, **kwds)


Change the data folder to use the required data folder.

### Flags

In [2]:
flags=tf.app.flags

In [3]:
flags=tf.app.flags
#Important Directories
flags.DEFINE_string('data_dir','../data/raw','Train Data Folder')
flags.DEFINE_string('summaries_dir','../summaries','Summaries Folder')
flags.DEFINE_string('train_dir','../logs&checkpoint','Directory to write event logs and checkpoint')
flags.DEFINE_string('models_dir','../models','Models Folder')
#Task Specific Parameters
flags.DEFINE_string('wanted_words','yes,no,up,down,left,right,on,off,stop,go','Wanted Words')
flags.DEFINE_float('validation_percentage',10,'Validation Percentage')
flags.DEFINE_float('testing_percentage',10,'Testing Percentage')
flags.DEFINE_integer('sample_rate',16000,'Sample Rate')
flags.DEFINE_integer('clip_duration_ms',1000,'Clip Duration in ms')
flags.DEFINE_float('window_size_ms',30,'How long each spectogram timeslice is')
flags.DEFINE_float('window_stride_ms',10.0,'How far to move in time between frequency windows.')
flags.DEFINE_integer('dct_coefficient_count',40,'How many bins to use for the MFCC fingerprint')
flags.DEFINE_float('time_shift_ms',100.0,'Range to randomly shift the training audio by in time.')

FLAGS=flags.FLAGS

### Variables

In [4]:
model_architecture='convlstm'
start_checkpoint=None
logging_interval=10
eval_step_interval=1000
save_step_interval=100000
silence_percentage=10.0
unknown_percentage=10.0
background_frequency=0.8
background_volume=0.1
train_steps='3500' #Declare  the training steps for which the learning rates will be used
learning_rate='0.0001'
batch_size=256

### Model to be optimized

In [5]:
def resCONVLSTM(inputs, model_settings, is_training, name='',conv_lstm_filter_size=4):
    """Creates a Residual ConvLSTM as in https://arxiv.org/abs/1607.06450.
        1-D Conv on feature, unidirectional rnn
        
    """
    with(tf.variable_scope('resCONVLSTM_%s' % name)):
        batch_size = tf.shape(inputs)[0]
        input_frequency_size = model_settings['dct_coefficient_count']
        input_time_size = model_settings['spectrogram_length']
        input_shape = [input_frequency_size, 1]
        conv1 = tf.contrib.rnn.ConvLSTMCell(1, input_shape, 1, [conv_lstm_filter_size], name='conv1')
        conv2 = tf.contrib.rnn.ConvLSTMCell(1, input_shape, 1, [conv_lstm_filter_size], name='conv2')
        # First ConvLSTM
        initial_conv1 = conv1.zero_state(batch_size, dtype=tf.float32)
        initial_conv2 = conv2.zero_state(batch_size, dtype=tf.float32)
        conv1_o, _ = tf.nn.dynamic_rnn(conv1, inputs, initial_state=initial_conv1)
        bn1 = tf.layers.batch_normalization(inputs, axis=2, training=is_training)
        bn1_relu = tf.nn.relu(bn1)
        conv2_o, _ = tf.nn.dynamic_rnn(conv2, bn1_relu, initial_state=initial_conv2)
        bn2 = tf.layers.batch_normalization(conv2_o, axis=2, training=is_training)
        residual = tf.add(bn2, inputs)
        output_relu = tf.nn.relu(residual)
        return output_relu


def create_multilayer_convlstm_model(fingerprint_input, model_settings, is_training,conv_lstm_filter_size=4,lstm_size=256,dense_size=256):
    """
        Creates a Multilayer ConvLSTM Model Followed by a linear layer and softmax activation function



    """
    if is_training:
        dropout_prob = tf.placeholder(tf.float32, name='dropout_prob')
    batch_size = tf.shape(fingerprint_input)[0]
    input_frequency_size = model_settings['dct_coefficient_count']
    input_time_size = model_settings['spectrogram_length']
    fingerprint_4d = tf.reshape(fingerprint_input,
                                [-1, input_time_size, input_frequency_size, 1])

    # Layer1 resCONVLSTMs
    resCONVLSTM1 = resCONVLSTM(fingerprint_4d, model_settings, is_training, '1',conv_lstm_filter_size)
    resCONVLSTM2 = resCONVLSTM(resCONVLSTM1, model_settings, is_training, '2',conv_lstm_filter_size)
    resCONVLSTM3 = resCONVLSTM(resCONVLSTM2, model_settings, is_training, '3',conv_lstm_filter_size)
    resCONVLSTM4= resCONVLSTM(resCONVLSTM3,model_settings,is_training,'4',conv_lstm_filter_size)
    resCONVLSTM4=tf.reshape(resCONVLSTM4,[-1, input_time_size, input_frequency_size])
    with tf.variable_scope('lstm1'):
        lstm_cell1=tf.contrib.rnn.LSTMCell(num_units=lstm_size,num_proj=input_frequency_size)
        initial_lstm1=lstm_cell1.zero_state(batch_size,dtype=tf.float32)
        lstm1_o,_=tf.nn.dynamic_rnn(lstm_cell1,resCONVLSTM4,initial_state=initial_lstm1)
        lstm1_o=tf.reshape(lstm1_o,[-1, input_time_size, input_frequency_size,1 ])
        nin1_o=tf.layers.conv2d(lstm1_o,1,[1,1],name='nin1')
        bn1=tf.layers.batch_normalization(nin1_o, axis=2, training=is_training)
        bn1=tf.reshape(bn1,[-1,input_time_size,input_frequency_size])
    with tf.variable_scope('lstm2'):
        lstm_cell2=tf.contrib.rnn.LSTMCell(num_units=lstm_size,num_proj=input_frequency_size)
        initial_lstm2=lstm_cell1.zero_state(batch_size,dtype=tf.float32)
        lstm2_o, _ = tf.nn.dynamic_rnn(lstm_cell2, bn1, initial_state=initial_lstm2)
        lstm2_o = tf.reshape(lstm2_o, [-1, input_time_size, input_frequency_size, 1])
        nin2_o = tf.layers.conv2d(lstm2_o, 1, [1, 1], name='nin1')
        bn2 = tf.layers.batch_normalization(nin2_o, axis=2, training=is_training)
        bn2=tf.reshape(bn2,[-1,input_time_size,input_frequency_size])

    # LSTM Layer Final
    with tf.variable_scope('lstm3'):
        lstm_cell3=tf.contrib.rnn.LSTMCell(num_units=lstm_size,num_proj=input_frequency_size)
        initial_lstm3=lstm_cell1.zero_state(batch_size,dtype=tf.float32)
        lstm3_o, _ = tf.nn.dynamic_rnn(lstm_cell3, bn2, initial_state=initial_lstm3)
        lstm3_o = tf.reshape(lstm3_o, [-1, input_time_size, input_frequency_size, 1])


    # Final FC for classification
    reshaped_layer = tf.reshape(lstm3_o,
                                 [-1, input_time_size * input_frequency_size])

    # Dropout
    if is_training:
        reshaped_layer = tf.nn.dropout(reshaped_layer, keep_prob=dropout_prob)


    prefinal_dense=tf.nn.relu(tf.layers.dense(reshaped_layer,dense_size))

    if is_training:
        prefinal_dense=tf.nn.dropout(prefinal_dense,keep_prob=dropout_prob)
    # Final Layer

    label_count = model_settings['label_count']

    final_fc_weights = tf.Variable(
        tf.truncated_normal(
            [dense_size, label_count], stddev=0.01))
    final_fc_bias = tf.Variable(tf.zeros([label_count]))
    final_fc = tf.matmul(prefinal_dense, final_fc_weights) + final_fc_bias
    if is_training:
        return final_fc, dropout_prob
    else:
        return final_fc

SyntaxError: invalid syntax (<ipython-input-5-f44c564acc13>, line 1)

### Objective Function

In [None]:
def objective(args):
    conv_lstm_filter_size=int(args['conv_lstm_filter_size'])
    lstm_size=int(args['lstm_size'])
    dense_size=int(args['dense_size'])
    dropout=args['dropout']
    print('Eval Start')
    print(conv_lstm_filter_size,lstm_size,dense_size,dropout)
    tf.reset_default_graph()
    train_dir=os.path.join(FLAGS.data_dir,'train','audio')
    model_settings = models.prepare_model_settings(
      len(input_data.prepare_words_list(FLAGS.wanted_words.split(','))),
      FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
      FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
    audio_processor = input_data.AudioProcessor(
      train_dir, silence_percentage, unknown_percentage,
      FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
      FLAGS.testing_percentage, model_settings)
    
    def get_train_data(args):
        sess=args
        time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)
        train_fingerprints, train_ground_truth = audio_processor.get_data(
            batch_size, 0, model_settings,background_frequency,
            background_volume, time_shift_samples, 'training', sess)
        return train_fingerprints,train_ground_truth
    
    def get_val_data(args):
        '''
        Input: (sess,offset)
        '''
        sess,i=args
        validation_fingerprints, validation_ground_truth = (
                audio_processor.get_data(batch_size, i, model_settings, 0.0,
                                         0.0, 0, 'validation', sess))
        return validation_fingerprints,validation_ground_truth
    
    with tf.Session() as sess:
        # Placeholders
        fingerprint_size = model_settings['fingerprint_size']
        label_count = model_settings['label_count']
        fingerprint_input = tf.placeholder(
          tf.float32, [None, fingerprint_size], name='fingerprint_input')
        ground_truth_input = tf.placeholder(
          tf.float32, [None, label_count], name='groundtruth_input')
        set_size = audio_processor.set_size('validation')
        label_count = model_settings['label_count']
        # Create Model

        logits, dropout_prob = create_multilayer_convlstm_model(
          fingerprint_input,
          model_settings,
          True,
          conv_lstm_filter_size,
          lstm_size,
          dense_size)
        #Start Training
        extra_args=(dropout_prob,label_count,batch_size,set_size)
        val_acc=trainer.train(sess,logits,fingerprint_input,ground_truth_input,get_train_data,
                      get_val_data,train_steps,learning_rate,eval_step_interval, logging_interval=logging_interval,
                      start_checkpoint=None,checkpoint_interval=None,
                      model_name=model_architecture,train_dir=None,
                      summaries_dir=None,dropout=dropout,args=extra_args)
    return 1-val_acc

### Optimization

In [None]:
space = {
        'conv_lstm_filter_size': hyperopt.hp.uniform('conv_lstm_filter_size', 4,20),
        'lstm_size': hyperopt.hp.uniform('lstm_size', 128,756),
        'dense_size': hyperopt.hp.uniform('dense_size', 128,756),
        'dropout':hyperopt.hp.uniform('dropout',0.3,1)
    }


In [None]:
best_model=hyperopt.fmin(objective, space, algo=hyperopt.tpe.suggest, max_evals=20)

In [None]:
print('The best selected Hyperparameters')
print(hyperopt.space_eval(space, best_model))