# Assignment 6

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.contrib.rnn import BasicLSTMCell, LSTMStateTuple, DropoutWrapper
from tensorflow import (placeholder, cond, reduce_mean, reduce_sum, where, not_equal, ones_like,
                        zeros_like, reshape, equal, constant, cast, concat, argmax, Variable)
from tensorflow import nn
import sys; sys.path.insert(0, '..')
import os
import pickle
from nltk.tokenize import RegexpTokenizer
from collections import Counter
import importlib

## Utility functions

In [3]:
# Fixed seed, applied to NumPy's global RNG and to TensorFlow's graph-level seed.
SEED = 5
np.random.seed(SEED)
tf.set_random_seed(SEED)

# Number of get_weights_and_bias() calls so far; used there to build a unique
# variable scope name ('weights<n>') per call.
weights_n = 0

def get_weights_and_bias(shape, shape_b=None, dtype=tf.float32,
        initializer_w=tf.random_normal_initializer(),
        initializer_b=tf.zeros_initializer()):
    '''Create a weight variable ``W`` and a bias variable ``b`` inside a fresh variable scope.

    Every call increments the module-level counter ``weights_n`` and uses it to name the
    scope (``weights1``, ``weights2``, ...), so repeated calls never collide.

    Parameters
    ----------
    shape   :   tuple or list
                Shape of the weight variable
    shape_b :   tuple or list
                Shape of the bias variable. If falsy (``None`` or empty), the last entry of
                ``shape`` is used
    dtype   :   tf.DType
                Data type of both variables
    initializer_w   :   initializer
                        Initializer for the weights
    initializer_b   :   initializer
                        Initializer for the bias

    Returns
    -------
    tuple
        ``(W, b)``
    '''
    global weights_n

    if not shape_b:
        shape_b = shape[-1:]

    weights_n += 1
    with tf.variable_scope('weights%d' % weights_n):
        weights = tf.get_variable('W', initializer=initializer_w,
                                  shape=shape, dtype=dtype)
        # the bias follows the requested dtype as well, so W and b can always be combined
        bias = tf.get_variable('b', initializer=initializer_b,
                               shape=shape_b, dtype=dtype)
    return weights, bias

def get_optimizer(name):
    '''Resolve an optimizer given by name.

    Parameters
    ----------
    name    :   str or tf.train.Optimizer
                Optimizer name without the ``Optimizer`` suffix (e.g. ``'Adam'``), or an
                already constructed optimizer

    Returns
    -------
    The attribute ``tf.train.<name>Optimizer`` when ``name`` is not an optimizer instance;
    otherwise ``name`` itself is handed back unchanged.
    '''
    if not isinstance(name, tf.train.Optimizer):
        return getattr(tf.train, name + 'Optimizer')
    return name

# Number of fully_connected() calls so far; used there to build a unique
# variable scope name ('fully<n>') per layer.
fc_n = 0


def fully_connected(input, n_out, with_activation=False, activation=tf.nn.tanh,
        use_bias=True):
    '''Create a fully connected layer in its own variable scope (``fully1``, ``fully2``, ...).

    Weights are drawn from a normal distribution whose standard deviation depends on the
    activation function: ``fan_in ** -0.5`` for ``tf.nn.tanh``, ``sqrt(2 / fan_in)`` (He
    initialisation) for ``tf.nn.relu`` and the initializer's default otherwise. The bias is
    initialised to the constant 0.1.

    Parameters
    ----------
    input   :   tf.Tensor
                Input to the layer; its last dimension must be statically known
    n_out   :   int
                Number of neurons in the layer
    with_activation :   bool
                        Return activation or drive (useful when planning to use
                        ``softmax_cross_entropy_with_logits`` which requires
                        unscaled logits)
    activation  :   callable
                    Activation function; also selects the weight initialisation
    use_bias    :   bool
                    Whether to create a bias variable and add it to the drive

    Returns
    -------
    tf.Tensor
            ``activation(input * Weights [+ bias])`` if ``with_activation`` is set, else the
            drive ``input * Weights [+ bias]``
    '''
    global fc_n
    fc_n += 1
    with tf.variable_scope('fully%d' % fc_n):
        fan_in = input.shape[-1].value
        if activation == tf.nn.tanh:
            init_W = tf.random_normal_initializer(stddev=fan_in ** (-0.5))
        elif activation == tf.nn.relu:
            # He initialisation: the *variance* is 2 / fan_in, so the stddev is its square root
            init_W = tf.random_normal_initializer(stddev=(2.0 / fan_in) ** 0.5)
        else:
            init_W = tf.random_normal_initializer()
        W = tf.get_variable(
                'weights',
                initializer=init_W,
                shape=(input.shape[-1], n_out), # the last dim of the input
                dtype=tf.float32                # is the 1st dim of the weights
            )
        if use_bias:
            init_b = tf.constant_initializer(0.1)
            bias = tf.get_variable('bias', initializer=init_b, shape=(n_out,))
            drive = tf.matmul(input, W) + bias
        else:
            drive = tf.matmul(input, W)
        return activation(drive) if with_activation else drive


## IMDB Helper class

In [2]:
# File in which IMDB.save() stores (and IMDB.load() reads back) the helper's attributes.
PICKLE_NAME = 'imdb_helper.pckl'

class IMDB:
    '''Loads the aclImdb reviews, splits off a validation set and serves batches.

    The whole object state is cached in the pickle file ``PICKLE_NAME``; if that file exists the
    constructor restores the state from it instead of reading the raw data again.

    Attributes
    ----------
    _training_data, _validation_data, _test_data
        One entry per review: a list of tokens, or a list of word ids once
        ``create_dictionaries`` has run
    _training_labels, _validation_labels, _test_labels
        Integer arrays, 0 for ``neg`` and 1 for ``pos``
    _word2id, _id2word
        Vocabulary look-up tables (only after ``create_dictionaries``)
    '''

    # Types that words2ids/ids2words treat as a collection rather than a single item.
    _SEQUENCE_TYPES = (list, tuple, range, np.ndarray)

    def load(self):
        '''Deserialise self from pickled file.'''
        # NOTE: pickle.load can execute arbitrary code; only load cache files you created.
        with open(PICKLE_NAME, 'rb') as f:
            tmp_dict = pickle.load(f)

        self.__dict__.update(tmp_dict)

    def save(self):
        '''Serialise self to pickled file.'''
        with open(PICKLE_NAME, 'wb') as f:
            pickle.dump(self.__dict__, f, 2)


    def __init__(self, directory):
        '''Restore the cached state if present, otherwise read the data below ``directory``.

        Parameters
        ----------
        directory   :   str
                        Directory containing the ``aclImdb`` folder (ignored when a cache file
                        exists)
        '''
        if os.path.exists(PICKLE_NAME):
            self.load()
            return

        self._directory = directory

        self._training_data, self._training_labels = self._load_data('train')
        self._test_data, self._test_labels = self._load_data('test')

        # Draw the validation split from a private generator seeded with 0. This yields the same
        # indices as seeding the global generator with 0, but leaves the global NumPy RNG (and
        # therefore any seed set by the caller) untouched.
        samples_n = self._training_labels.shape[0]
        split_rng = np.random.RandomState(0)
        random_indices = split_rng.choice(samples_n, samples_n // 7, replace = False)

        self._validation_data = self._training_data[random_indices]
        self._validation_labels = self._training_labels[random_indices]
        self._training_data = np.delete(self._training_data, random_indices, axis = 0)
        self._training_labels = np.delete(self._training_labels, random_indices)

        word_counts = Counter(word for text in self._training_data for word in text)
        print('Unique words: ' + str(len(word_counts)))
        print('Mean length: ' + str(np.mean([len(text) for text in self._training_data])))
        self.save()


    @staticmethod
    def _as_object_array(sequences):
        '''Pack sequences of possibly different lengths into a 1-D object array.'''
        packed = np.empty(len(sequences), dtype = object)
        for i, sequence in enumerate(sequences):
            packed[i] = sequence
        return packed


    def _load_data(self, data_set_type):
        '''Read and tokenise all reviews of one data set.

        Parameters
        ----------
        data_set_type   :   str
                            Sub-folder of ``aclImdb`` to read (``'train'`` or ``'test'``)

        Returns
        -------
        np.ndarray, np.ndarray
            Object array of token lists and integer array of labels (0 = neg, 1 = pos)
        '''
        tokenizer = RegexpTokenizer(r'\w+')
        data = []
        labels = []
        # Iterate over conditions
        for condition in ['neg', 'pos']:
            label = 0 if condition == 'neg' else 1
            directory_str = os.path.join(self._directory, 'aclImdb', data_set_type, condition)

            # os.listdir returns entries in arbitrary order; sort so that the sample order (and
            # with it the seeded validation split) is the same on every machine
            for filename in sorted(os.listdir(directory_str)):
                # Read written rating from file
                with open(os.path.join(directory_str, filename), encoding = 'utf-8') as fd:
                    written_rating = fd.read().lower()
                data.append(tokenizer.tokenize(written_rating))
                labels.append(label)

        return self._as_object_array(data), np.array(labels)

    def create_dictionaries(self, vocabulary_size, cutoff_length):
        '''Build the vocabulary and replace all reviews by (truncated) lists of word ids.

        Does nothing if a vocabulary already exists (e.g. restored from the cache file); in that
        case the arguments are not compared against the ones used originally.

        Parameters
        ----------
        vocabulary_size :   int
                            Total number of ids, including the two reserved ids 0
                            (``_UNKNOWN_``) and 1 (``_NOT_A_WORD_``)
        cutoff_length   :   int
                            Reviews are truncated to this many words
        '''
        if hasattr(self, '_word2id'):
            return

        word_counts = Counter(word for text in self._training_data for word in text)
        words_and_count = word_counts.most_common(vocabulary_size - 2)

        word2id = {word: word_id for word_id, (word, _) in enumerate(words_and_count, 2)}
        word2id['_UNKNOWN_'] = 0
        word2id['_NOT_A_WORD_'] = 1

        self._word2id = word2id
        self._id2word = {word_id: word for word, word_id in word2id.items()}

        self._training_data = self._encode(self._training_data, cutoff_length)
        self._validation_data = self._encode(self._validation_data, cutoff_length)
        self._test_data = self._encode(self._test_data, cutoff_length)
        self.save()


    def _encode(self, texts, cutoff_length):
        '''Truncate every text to ``cutoff_length`` words and map the words to ids.'''
        return self._as_object_array([self.words2ids(text[:cutoff_length]) for text in texts])


    def words2ids(self, words):
        '''Map a word, or a list/tuple/range/array of words, to ids (0 for unknown words).'''
        if isinstance(words, self._SEQUENCE_TYPES):
            return [self._word2id.get(word, 0) for word in words]
        return self._word2id.get(words, 0)

    def ids2words(self, ids):
        '''Map an id, or a list/tuple/range/array of ids, to words (``_UNKNOWN_`` if missing).'''
        if isinstance(ids, self._SEQUENCE_TYPES):
            return [self._id2word.get(wordid, '_UNKNOWN_') for wordid in ids]
        return self._id2word.get(ids, '_UNKNOWN_')


    def get_training_batch(self, batch_size):
        '''Generator over shuffled training batches, see ``_get_batch``.'''
        return self._get_batch(self._training_data, self._training_labels, batch_size)

    def get_validation_batch(self, batch_size):
        '''Generator over shuffled validation batches, see ``_get_batch``.'''
        return self._get_batch(self._validation_data, self._validation_labels, batch_size)

    def get_test_batch(self, batch_size):
        '''Generator over shuffled test batches, see ``_get_batch``.'''
        return self._get_batch(self._test_data, self._test_labels, batch_size)

    def _get_batch(self, data, labels, batch_size):
        '''Shuffle the samples and yield ``(data, labels)`` batches of exactly ``batch_size``.

        A ``batch_size`` <= 0 selects a single batch with all samples. Samples that do not fill
        a complete batch are dropped, so every batch has the same size.
        '''
        samples_n = labels.shape[0]
        if batch_size <= 0:
            batch_size = samples_n

        random_indices = np.random.choice(samples_n, samples_n, replace = False)
        data = data[random_indices]
        labels = labels[random_indices]

        for i in range(samples_n // batch_size):
            on = i * batch_size
            off = on + batch_size
            yield data[on:off], labels[on:off]


    def slice_batch(self, batch, slice_size):
        '''Pad a batch of id lists to a common length and yield it in slices along time.

        All samples are padded with 1 (``_NOT_A_WORD_``) up to the next multiple of
        ``slice_size`` of the longest sample. Each yielded array has shape
        ``[len(batch), slice_size]``. An empty batch yields nothing.
        '''
        batch_size = len(batch)
        if batch_size == 0:
            return

        max_len = np.max([len(sample) for sample in batch])
        steps = int(np.ceil(max_len / slice_size))
        max_len = slice_size * steps

        # Resize all samples in batch to same size
        # fill buffer with _NOT_A_WORD_
        buffer = np.ones((batch_size, max_len), dtype = np.int32)
        for i, sample in enumerate(batch):
            buffer[i, 0:len(sample)] = sample

        for i in range(steps):
            on = i * slice_size
            off = on + slice_size
            yield buffer[:, on:off]


    def get_sizes(self):
        '''Return the number of training, validation and test samples (in that order).'''
        training_samples_n   = self._training_labels.shape[0]
        validation_samples_n = self._validation_labels.shape[0]
        test_samples_n       = self._test_labels.shape[0]
        return training_samples_n, validation_samples_n, test_samples_n



## Model
### Optimizer Helper Class

In [3]:
class OptimizerSpec(dict):
    '''Encapsulate all the info needed for creating any kind of optimizer. Learning rate scheduling
    is fixed to exponential decay

    The keyword arguments passed to the constructor are stored as dictionary items.

    Attributes
    ----------
    step_counter    :   Variable
                        Counter to be passed to optimizer#minimize() so it gets incremented during
                        each update
    learning_rate   :   Tensor
                        Exponentially decaying learning rate of the optimizer (result of
                        ``tf.train.exponential_decay``, kept for later retrieval)

    '''

    def __init__(self, **kwargs):
        '''
        Parameters
        ----------
        kind    :   str
                    Name of the optimizer
        learning_rate   :   float
                            Base learning rate used
        steps   :   int
                    Optional number of steps after which the learning rate has decayed by
                    ``decay`` (default 100)
        decay   :   float
                    Optional decay rate (default 0.8)
        name    :   str
                    Optional name for the optimizer (default ``'optimizer'``)
        momentum    :   float
                        Momentum; required for the Momentum and RMSProp optimizers
        use_nesterov    :   bool
                            Nesterov flag for momentum optimizer

        Raises
        ------
        ValueError
            If ``kind`` or ``learning_rate`` is missing
        '''
        if 'kind' not in kwargs:
            raise ValueError('No optimizer name given')
        if 'learning_rate' not in kwargs:
            raise ValueError('No base learning_rate given')
        self.update(kwargs)
        self.step_counter  = Variable(0, trainable=False, dtype=tf.int32, name='step_counter')
        rate               = kwargs['learning_rate']
        steps              = kwargs.get('steps', 100)
        decay              = kwargs.get('decay', 0.8)
        self.learning_rate = tf.train.exponential_decay(rate, self.step_counter, steps, decay)

    def create(self):
        '''Build the Optimizer object from the properties

        Return
        ------
        tf.train.Optimizer
            Ready-made optimizer

        Raises
        ------
        ValueError
            If ``kind`` is Momentum or RMSProp and no ``momentum`` was given
        '''
        kind          = self['kind']
        learning_rate = self.learning_rate
        name          = self.get('name', 'optimizer')
        optimizer_cls = get_optimizer(kind)

        if kind not in ('Momentum', 'RMSProp'):
            # only those two use momentum param
            return optimizer_cls(learning_rate, name=name)

        if 'momentum' not in self:
            raise ValueError('Momentum parameter is necessary for %sOptimizer' % kind)
        momentum = self['momentum']

        if kind == 'Momentum':
            # use_nesterov must go by keyword: the third positional parameter of
            # MomentumOptimizer is use_locking
            return optimizer_cls(learning_rate, momentum, name=name,
                                 use_nesterov=self.get('use_nesterov', False))
        # momentum must go by keyword: the second positional parameter of RMSPropOptimizer
        # is decay
        return optimizer_cls(learning_rate, momentum=momentum, name=name)


    def __str__(self):
        key_val_str = ', '.join(str(k) + '=' + str(v) for k, v in self.items())
        return f'<Optimizer: {key_val_str}>'


### IMDB Model

In [4]:
class IMDBModel(object):
    '''Model for IMDB movie review classification.

    The graph consists of an embedding look-up, one LSTM layer run with ``dynamic_rnn`` and a
    fully connected layer with two outputs, trained with sparse softmax cross entropy.
    '''

    def __init__(self, **kwargs):
        '''Build the graph. All of the following keyword arguments are required:

        Parameters
        ----------
        vocab_size  :   int
                        Size of the vocabulary for creating embeddings
        embedding_size  :   int
                            Dimensionality of the embedding space
        memory_size :   int
                        LSTM memory size
        keep_prob   :   float
                        Probability of keeping a unit during dropout (used for the embeddings and
                        the LSTM output while ``is_training`` is fed as True)
        subsequence_length  :   int
                                Length of the subsequences (second dimension of ``word_ids``)
        optimizer   :   OptimizerSpec
                        Specification from which the optimizer, the learning rate and the step
                        counter are taken

        '''
        ############################################################################################
        #                                 Get all hyperparameters                                  #
        ############################################################################################
        vocab_size         = kwargs['vocab_size']
        embedding_size     = kwargs['embedding_size']
        memory_size        = kwargs['memory_size']
        keep_prob          = kwargs['keep_prob']
        subsequence_length = kwargs['subsequence_length']
        optimizer_spec     = kwargs['optimizer']
        optimizer          = optimizer_spec.create()
        self.learning_rate = optimizer_spec.learning_rate
        self.step_counter  = optimizer_spec.step_counter

        ############################################################################################
        #                                        Net inputs                                        #
        ############################################################################################
        self.batch_size   = placeholder(tf.int32,   shape=[],                  name='batch_size')
        self.is_training  = placeholder(tf.bool,    shape=[],                  name='is_training')
        self.word_ids     = placeholder(tf.int32,   shape=(None, subsequence_length),
                                                                               name='word_ids')
        self.labels       = placeholder(tf.int32,   shape=(None,),             name='labels')
        self.hidden_state = placeholder(tf.float32, shape=(None, memory_size), name='hidden_state')
        self.cell_state   = placeholder(tf.float32, shape=(None, memory_size), name='cell_state')

        # number of non-padding ids per row of word_ids (sequence_lengths is a module-level
        # helper that must be defined before the model is instantiated)
        lengths = sequence_lengths(self.word_ids)

        ############################################################################################
        #                                        Embedding                                         #
        ############################################################################################
        # the bias returned alongside the embedding matrix is not used
        self.embedding_matrix, _bias = get_weights_and_bias((vocab_size, embedding_size))
        # dropout on the embeddings only while training
        embeddings = cond(self.is_training,
                         lambda: nn.dropout(
                             nn.embedding_lookup(self.embedding_matrix, self.word_ids),
                             keep_prob=keep_prob),
                         lambda: nn.embedding_lookup(self.embedding_matrix, self.word_ids)
                         )

        ############################################################################################
        #                                        LSTM layer                                        #
        ############################################################################################
        cell = BasicLSTMCell(memory_size, activation=tf.nn.tanh)

        # during inference, use entire ensemble
        # NOTE(review): keep_prob is rebound here from the hyperparameter to the result of cond;
        # the first lambda reads the same name -- presumably it is evaluated before the
        # assignment takes effect, confirm
        keep_prob = cond(self.is_training, lambda: constant(keep_prob), lambda: constant(1.0))
        cell      = DropoutWrapper(cell, output_keep_prob=keep_prob)

        # what's the difference to just creating a zero-filled tensor tuple?
        self.zero_state = cell.zero_state(self.batch_size, tf.float32)
        # the LSTM state is fed from outside, so it can be carried over between subsequences
        state           = LSTMStateTuple(c=self.cell_state, h=self.hidden_state)

        # A dynamic rnn creates the graph on the fly, so it can deal with embeddings of different
        # lengths. We do not need to unstack the embedding tensor to get rows, instead we compute
        # the actual sequence lengths and pass that
        # We are not sure how any of this works. Do we need to mask the cost function so the cell
        # outputs for _NOT_A_WORD_ inputs are ignored? Is the final cell state really relevant if it
        # was last updated with _NOT_A_WORD_ input? Does static_rnn absolve us of any of those
        # issues?
        outputs, self.state = nn.dynamic_rnn(cell, embeddings, sequence_length=lengths,
                                             initial_state=state)
        # Flatten the outputs of all time steps into one row per sample
        outputs      = reshape(concat(outputs, 1), [-1, subsequence_length * memory_size])
        self.outputs = reduce_mean(outputs)

        ############################################################################################
        #                        Fully connected layer, loss, and training                         #
        ############################################################################################
        # two unscaled logits per sample, one for each class
        ff1  = fully_connected(outputs, 2, with_activation=False, use_bias=True)
        loss = reduce_mean(nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels,
                                                                            logits=ff1))
        self.train_step    = optimizer.minimize(loss, global_step=self.step_counter)
        self.predictions   = nn.softmax(ff1)
        correct_prediction = equal(cast(argmax(self.predictions, 1), tf.int32), self.labels)
        self.accuracy      = reduce_mean(cast(correct_prediction, tf.float32))

        ############################################################################################
        #                                     Create summaries                                     #
        ############################################################################################
        with tf.variable_scope('summary'):
            self.summary_loss = tf.summary.scalar('loss', loss)
            self.summary_accuracy = tf.summary.scalar('accuracy', self.accuracy)


    def get_zero_state(self, session, batch_size):
        '''Retrieve the LSTM zero state.

        Parameters
        ----------
        session :   tf.Session
                    Open session to run the op in
        batch_size  :   int
                        Batch size (required for the tensor shapes, since the state cannot have
                        variable dimensions)

        Returns
        -------
        LSTMStateTuple
            Tuple of zero tensors of shape [batch_size x memory_size]
        '''
        return session.run(self.zero_state, feed_dict = {self.batch_size: batch_size})

    def run_training_step(self, session, subsequence_batch, labels, state):
        '''Run one training step.

        Parameters
        ----------
        session :   tf.Session
                    Open session to run ops in
        subsequence_batch   :   np.ndarray
                                Array of subsequences
        labels  :   np.ndarray
                    Array of labels, one per sample in the batch
        state   :   LSTMStateTuple
                    LSTM memory state from the last step

        Returns
        -------
        LSTMStateTuple, summary
            New memory state and the evaluated summary of the loss op

        '''
        state, _, summary_loss = session.run([self.state, self.train_step, self.summary_loss],
            feed_dict = {
                self.word_ids:     subsequence_batch,
                self.labels:       labels,
                self.cell_state:   state.c,
                self.hidden_state: state.h,
                self.batch_size:   subsequence_batch.shape[0],
                self.is_training:  True
            })
        return state, summary_loss

    def run_test_step(self, session, subsequence_batch, labels):
        '''Run one test step, starting from the LSTM zero state and with dropout disabled.

        Parameters
        ----------
        session :   tf.Session
                    Open session to run ops in
        subsequence_batch   :   np.ndarray
                                Array of subsequences
        labels  :   np.ndarray
                    Array of labels, one per sample in the batch

        Returns
        -------
        float, summary
            Accuracy and the evaluated summary of the accuracy on the batch

        '''
        batch_size = subsequence_batch.shape[0]
        zero_state = self.get_zero_state(session, batch_size)
        # predictions are fetched as well but not returned
        predictions, accuracy, summary_accuracy = session.run([self.predictions, self.accuracy, self.summary_accuracy],
            feed_dict = {
                self.word_ids:     subsequence_batch,
                self.labels:       labels,
                self.cell_state:   zero_state.c,
                self.hidden_state: zero_state.h,
                self.batch_size:   batch_size,
                self.is_training:  False
            })
        return accuracy, summary_accuracy


### Parameter set

In [5]:
vocabulary_size = 20000 # Number of word ids, including the two reserved ids
sequence_length = 100   # Length for subsequence training.
cutoff = 300            # Cutoff length for reviews.
batch_size = 250        # Batch size
epochs = 2              # Number of epochs
learning_rate = 0.03    # Initial learning rate (scheduling is used)
embedding_size = 64     # Embedding dimensionality
memory_size = 64        # Memory size
keep_probability = 0.85 # Percentage of neurons to keep during dropout
momentum = 0.5          # Momentum (only used for Momentum optimizer)
optimizer = 'Adam'      # Optimizer class
decay_steps = 100       # Decay learning rate every n steps
decay_rate = 0.8        # Base decay value for exponential decay

### Main 

In [7]:
def sequence_lengths(sequences, padding_value=1):
    '''Find the actual sequence length for each sequence in a tensor. Sequences could be padded
    with ``padding_value`` if they were shorter than the cutoff length chosen.

    The length is computed as the number of entries that differ from ``padding_value``.

    Parameters
    ----------
    sequences   :   tf.Tensor
                    Integer tensor of shape [batch_size x sequence_length]
    padding_value   :   int
                        Id that marks padding (default 1)

    Returns
    -------
    tf.Tensor
        Tensor of shape [batch_size,], each value being the true length of its associated sequence
    '''
    # 1 for every real token and 0 for padding, independent of the padding value itself, so the
    # row sum is the number of tokens
    is_token = cast(not_equal(sequences, padding_value), tf.int32)
    return reduce_sum(is_token, axis=1)



def estimate_number_of_steps(train_data, sequence_length, epochs, batch_size):
    '''Roughly estimate how many training steps will be run (handy when picking a learning rate
    schedule). The figure is close to, but not exactly, the real count.

    It is the rounded number of batches, times the number of subsequences needed for the
    longest sample, times the number of epochs.
    '''
    n_batches        = int(0.5 + train_data.shape[0] / batch_size)
    longest_sample   = max(len(sample) for sample in train_data)
    slices_per_batch = int(np.ceil(longest_sample / sequence_length))
    return slices_per_batch * n_batches * epochs




################################################################################################
#                                        Load the data                                         #
################################################################################################
print('Loading IMDB data')
helper = IMDB('data')
helper.create_dictionaries(vocabulary_size, cutoff)

opti_spec = OptimizerSpec(learning_rate=learning_rate, steps=decay_steps,
                          decay=decay_rate, kind=optimizer, momentum=momentum,
                          use_nesterov=True)
print(f'Using optimizer {opti_spec}')
steps      = estimate_number_of_steps(helper._training_data, sequence_length, epochs, batch_size)
print(f'Probable number of steps: {steps}')

################################################################################################
#                                     Initialise the model                                     #
################################################################################################
print('Creating model')
model = IMDBModel(vocab_size=vocabulary_size,
                  subsequence_length=sequence_length,
                  optimizer=opti_spec,
                  embedding_size=embedding_size,
                  memory_size=memory_size,
                  keep_prob=keep_probability)

summary_dir = './summary/train/'

################################################################################################
#                                  Run training and evaluation                                 #
################################################################################################
with tf.Session() as session:
    session.run(tf.global_variables_initializer())

    # counts training steps; used as the x-axis value of the summaries
    counter = 1
    train_writer = tf.summary.FileWriter(summary_dir, session.graph)

    try:
        for epoch in range(epochs):
            print(f'Starting epoch {epoch}')

            for batch_idx, (batch, labels) in enumerate(helper.get_training_batch(batch_size)):
                # reset state for each batch
                state = model.get_zero_state(session, batch_size)

                for subsequence_batch in helper.slice_batch(batch, sequence_length):
                    # push one subsequence of each batch member
                    state, summary_loss = model.run_training_step(session, subsequence_batch, labels, state)
                    if counter % 10 == 0:
                        train_writer.add_summary(summary_loss, counter)
                    counter += 1

                if batch_idx % 10 == 0:
                    #########################################
                    #  Test with (up to) 5000 test data.    #
                    #########################################
                    samples_n                  = helper._test_labels.shape[0]
                    # never ask for more samples than exist; choice() without replacement
                    # would raise otherwise
                    n                          = min(5000, samples_n)
                    random_indices             = np.random.choice(samples_n, n, replace=False)
                    test_data, test_labels     = (helper._test_data[random_indices],
                                                     helper._test_labels[random_indices])
                    # only the first subsequence of every test review is evaluated
                    test_data                  = next(helper.slice_batch(test_data, sequence_length))
                    accuracy, summary_accuracy = model.run_test_step(session, test_data, test_labels)
                    train_writer.add_summary(summary_accuracy, counter)
                    print(f'Accuracy = {accuracy:3.3f}')
    finally:
        # flush buffered summaries to disk and release the event file, even if training fails
        train_writer.close()

Loading IMDB data
Unique words: 70610
Mean length: 242.911288441
Using optimizer <Optimizer: learning_rate=0.03, steps=100, decay=0.8, kind=Adam, momentum=0.5, use_nesterov=True>
Probable number of steps: 516
Creating model
Starting epoch 0
Accuracy = 0.489
Accuracy = 0.565
Accuracy = 0.644
Accuracy = 0.691
Accuracy = 0.721
Accuracy = 0.765
Accuracy = 0.776
Accuracy = 0.778
Accuracy = 0.790
Starting epoch 1
Accuracy = 0.795
Accuracy = 0.791
Accuracy = 0.796
Accuracy = 0.799
Accuracy = 0.794
Accuracy = 0.792
Accuracy = 0.800
Accuracy = 0.809
Accuracy = 0.795
