<a href="https://colab.research.google.com/github/annikamarie/annikamarie-BiDLSTM-with-Attention/blob/master/BiDLSTM_with_Attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
from __future__ import print_function, division

%tensorflow_version 1.13.2
from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn as bi_rnn
from sklearn.metrics import precision_recall_fscore_support as score
from tensorflow.contrib.rnn import GRUCell, LSTMCell
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from collections import Counter
from pprint import pprint
import tensorflow as tf
from tqdm import tqdm
import pandas as pd
import gensim as gs
import numpy as np
import itertools
import logging
import pickle
import json
import time
import copy
import sys
import re
import os
import sklearn as sk 

logging.getLogger().setLevel(logging.INFO)

def clean_str(s):
    """Return `s` with surrounding whitespace removed and all characters lower-cased."""
    trimmed = s.strip()
    return trimmed.lower()

def pad_sentences(sentences, padding_word="<PAD/>", forced_sequence_length=541):
    """Pad (or truncate) every tokenized sentence to one common length.

    Args:
        sentences: a list of sentences, each a list of tokens.
        padding_word: token appended to sentences that are too short.
        forced_sequence_length: target length. When None, the length of the
            longest sentence is used (0 when `sentences` is empty).

    Returns:
        A new list with one list of tokens per input sentence, each exactly
        the target length long. Longer sentences are cut off at the end.
    """
    if forced_sequence_length is None:  # Train
        lengths = [len(sentence) for sentence in sentences]
        # Guard against an empty corpus: max() of an empty sequence raises.
        sequence_length = max(lengths) if lengths else 0
    else:
        logging.critical('This is prediction, reading the trained sequence length')
        sequence_length = forced_sequence_length
    logging.critical('The maximum length is {}'.format(sequence_length))

    padded_sentences = []
    for sentence in sentences:
        num_padding = sequence_length - len(sentence)

        if num_padding < 0:  # Prediction: cut off the sentence if it is longer than the sequence length
            logging.info('This sentence has to be cut off because it is longer than trained sequence length')
            padded_sentence = sentence[0:sequence_length]
        else:
            padded_sentence = sentence + [padding_word] * num_padding
        padded_sentences.append(padded_sentence)
    return padded_sentences

def build_vocab(sentences):
    """Build the token <-> index mappings for a tokenized corpus.

    Tokens are ranked by descending frequency (as ordered by
    `Counter.most_common`); the rank is the token's index.

    Returns:
        (vocabulary, vocabulary_inv): a dict token -> index and the list of
        tokens ordered by index.
    """
    frequencies = Counter(itertools.chain.from_iterable(sentences))
    vocabulary_inv = [token for token, _ in frequencies.most_common()]
    vocabulary = dict(zip(vocabulary_inv, range(len(vocabulary_inv))))
    return vocabulary, vocabulary_inv


def load_embeddings(vocabulary, embedding_dim=200):
    """Create a random embedding vector for every word in `vocabulary`.

    Args:
        vocabulary: an iterable of words (e.g. the token -> index dict).
        embedding_dim: length of each embedding vector.

    Returns:
        A dict mapping each word to a 1-D numpy array of `embedding_dim`
        values drawn uniformly from [-0.25, 0.25). Empty for an empty
        vocabulary.
    """
    # The dict is built for the whole vocabulary before returning; the previous
    # version returned from inside the loop after the first word.
    return {
        word: np.random.uniform(-0.25, 0.25, embedding_dim)
        for word in vocabulary
    }


def load_data(filename):
    """Read a labelled CSV and turn it into index / one-hot arrays.

    The CSV is read with every column as a string. Rows missing a value in
    the 'labels' or 'text' column are dropped and the remaining rows are
    randomly permuted. Each text is cleaned with `clean_str`, split on single
    spaces, padded with `pad_sentences` (default length) and mapped to
    vocabulary indices; each label is mapped to a one-hot row.

    Returns:
        x: array of token indices, one row per sample.
        y: array of one-hot label rows.
        vocabulary: dict token -> index.
        vocabulary_inv: list of tokens ordered by index.
        df: the filtered, shuffled DataFrame.
        labels: the sorted list of distinct label values.
    """
    label_col, text_col = 'labels', 'text'

    df = pd.read_csv(filename, index_col=None, encoding='UTF-8', engine='python', dtype=str)
    df = df.dropna(axis=0, how='any', subset=[label_col, text_col])
    df = df.reindex(np.random.permutation(df.index))

    # One one-hot row per distinct label, in sorted label order.
    labels = sorted(set(df[label_col].tolist()))
    num_labels = len(labels)
    one_hot = np.zeros((num_labels, num_labels), int)
    np.fill_diagonal(one_hot, 1)
    label_dict = dict(zip(labels, one_hot))

    tokenized = [clean_str(text).split(' ') for text in df[text_col].tolist()]
    targets = [label_dict[label] for label in df[label_col].tolist()]

    padded = pad_sentences(tokenized)
    vocabulary, vocabulary_inv = build_vocab(padded)

    x = np.array([[vocabulary[token] for token in sentence] for sentence in padded])
    y = np.array(targets)
    return x, y, vocabulary, vocabulary_inv, df, labels

In [3]:
def dRNN(cell, inputs, rate, scope='default'):
    """
    This function constructs a layer of dilated RNN.
    Inputs:
        cell    the dilation operations is implemented independent of the RNN cell.
            In theory, any valid tensorflow rnn cell should work.
        inputs  the input for the RNN. inputs should be in the form of
            a list of 'n_steps' tensors. Each has shape (batch_size, input_dims).
            The list is not modified.
        rate    the rate here refers to the 'dilations' in the original WaveNet paper.
            Must satisfy 0 < rate < n_steps.
        scope   variable scope.
    Outputs:
        outputs   the outputs from the RNN, as a list of 'n_steps' tensors.
    Raises:
        ValueError  if 'rate' is not in the range 1 .. n_steps - 1.
    """
    n_steps = len(inputs)
    # rate == 0 must be rejected as well: it would divide by zero below.
    if rate <= 0 or rate >= n_steps:
        raise ValueError('The \'rate\' variable needs to be adjusted.')
    print("Building layer: %s, input length: %d, dilation rate: %d, input dim: %d." % (
        scope, n_steps, rate, inputs[0].get_shape()[1]))

    # Work on a shallow copy so the zero padding never leaks into the caller's list.
    padded_inputs = list(inputs)

    # make the length of inputs divide 'rate', by using zero-padding
    if n_steps % rate != 0:
        # Create a tensor in shape (batch_size, input_dims), which all elements are zero.
        # This is used for zero padding
        zero_tensor = tf.zeros_like(inputs[0])
        dialated_n_steps = n_steps // rate + 1
        n_padding = dialated_n_steps * rate - n_steps
        print("=====> %d time points need to be padded. " % (n_padding))
        print("=====> Input length for sub-RNN: %d" % (dialated_n_steps))
        padded_inputs.extend([zero_tensor] * n_padding)
    else:
        dialated_n_steps = n_steps // rate
        print("=====> Input length for sub-RNN: %d" % (dialated_n_steps))

    # now the length of 'padded_inputs' divides rate
    # reshape it in the format of a list of tensors
    # the length of the list is 'dialated_n_steps'
    # the shape of each tensor is [batch_size * rate, input_dims]
    # by stacking tensors that "colored" the same

    # Example:
    # n_steps is 5, rate is 2, inputs = [x1, x2, x3, x4, x5]
    # zero-padding --> [x1, x2, x3, x4, x5, 0]
    # we want to have --> [[x1; x2], [x3; x4], [x_5; 0]]
    # which the length is the ceiling of n_steps/rate
    dilated_inputs = [tf.concat(padded_inputs[i * rate:(i + 1) * rate],
                                axis=0) for i in range(dialated_n_steps)]

    # building a dilated RNN with reformatted (dilated) inputs
    dilated_outputs, _ = tf.contrib.rnn.static_rnn(
        cell, dilated_inputs,
        dtype=tf.float32, scope=scope)

    # reshape output back to the input format as a list of tensors with shape [batch_size, input_dims]
    # split each element of the outputs from size [batch_size*rate, input_dims] to
    # [[batch_size, input_dims], [batch_size, input_dims], ...] with length = rate
    splitted_outputs = [tf.split(output, rate, axis=0)
                        for output in dilated_outputs]
    unrolled_outputs = [output
                        for sublist in splitted_outputs for output in sublist]
    # remove padded zeros
    outputs = unrolled_outputs[:n_steps]

    return outputs


def multi_dRNN_with_dilations(cells, inputs, dilations):
    """
    Stack several dilated RNN layers on top of each other.
    Inputs:
        cells      A list of RNN cells, one per layer.
        inputs     A list of 'n_steps' tensors, each has shape (batch_size, input_dims).
        dilations  A list of integers, one per cell, giving the dilation of each layer.
    Outputs:
        A list of 'n_steps' tensors: the outputs of the top layer. With no
        layers at all this is a shallow copy of 'inputs'.
    """
    assert (len(cells) == len(dilations))
    # Each layer consumes the previous layer's outputs; start from a copy of the inputs.
    layer_io = copy.copy(inputs)
    for layer_cell, layer_rate in zip(cells, dilations):
        layer_io = dRNN(layer_cell, layer_io, layer_rate,
                        scope="multi_dRNN_dilation_%d" % layer_rate)
    return layer_io

def _contruct_cells(hidden_structs, cell_type,dropouts):
    """
    Build one RNN cell per entry of 'hidden_structs'.

    Inputs:
        hidden_structs   a list with the hidden size of each cell.
        cell_type        one of "RNN", "LSTM", "GRU".
        dropouts         output keep probability; only applied (through a
                         DropoutWrapper) when cell_type is "LSTM".
    Outputs:
        A list of cells in the same order as 'hidden_structs'.
    Raises:
        ValueError if 'cell_type' is not supported.
    """
    # error checking
    if cell_type not in ("RNN", "LSTM", "GRU"):
        raise ValueError("The cell type is not currently supported.")

    def _make_cell(hidden_dims):
        # Build a single cell of the requested type.
        if cell_type == "RNN":
            return tf.contrib.rnn.BasicRNNCell(hidden_dims)
        if cell_type == "LSTM":
            lstm = tf.contrib.rnn.BasicLSTMCell(hidden_dims)
            return tf.nn.rnn_cell.DropoutWrapper(lstm, output_keep_prob=dropouts)
        return tf.contrib.rnn.GRUCell(hidden_dims)

    return [_make_cell(hidden_dims) for hidden_dims in hidden_structs]


def _rnn_reformat(embedded_chars, embedding_dim, n_steps):
    """
    Turn a batch-major tensor into the per-time-step list a static RNN takes.

    Inputs:
        embedded_chars   a tensor of shape (batch_size, n_steps, embedding_dim).
        embedding_dim    size of the last axis of 'embedded_chars'.
        n_steps          number of time steps (length of the returned list).
    Outputs:
        a list of 'n_steps' tensors, each has shape (batch_size, embedding_dim).
    """
    # (batch_size, n_steps, dim) -> (n_steps, batch_size, dim)
    time_major = tf.transpose(embedded_chars, perm=[1, 0, 2])
    # flatten to (n_steps * batch_size, dim) so that one even split yields the steps
    flattened = tf.reshape(time_major, shape=[-1, embedding_dim])
    return tf.split(flattened, num_or_size_splits=n_steps, axis=0)


def drnn_classification(embedded_chars,
                        hidden_structs,
                        dilations,
                        n_steps,
                        dropouts,
                        embedding_dim,
                        cell_type="RNN"):

    """Build a multi-layer dilated RNN on top of an embedded input batch.

    Inputs:
        embedded_chars   a tensor of shape (batch_size, n_steps, embedding_dim).
        hidden_structs   a list, each element is the hidden size of one layer.
        dilations        a list, each element is the dilation of one layer;
                         must have the same length as 'hidden_structs'.
        n_steps          the length of the sequence.
        dropouts         keep probability handed to the cell constructor.
        embedding_dim    the input dimension.
        cell_type        the type of the RNN cell, should be in ["RNN", "LSTM", "GRU"].

    Outputs:
        The outputs of the top dilated layer as a list of 'n_steps' tensors
        (one per time step). No output projection or activation is applied."""

    # error checking
    assert (len(hidden_structs) == len(dilations))

    # one cell per layer
    layer_cells = _contruct_cells(hidden_structs, cell_type,dropouts)

    # per-time-step list of inputs, as expected by the dilated layers
    step_inputs = _rnn_reformat(embedded_chars, embedding_dim, n_steps)

    # stack the dilated layers and hand back the top layer's outputs
    return multi_dRNN_with_dilations(layer_cells, step_inputs, dilations)

#######################################################################################################################
#######################################################################################################################

def attention(inputs, attention_size, time_major=True, return_alphas=False):
    """
    Attention layer that reduces a sequence of RNN outputs to one vector per sample.

    The idea was proposed in the article by Z. Yang et al., "Hierarchical Attention Networks
     for Document Classification", 2016: http://www.aclweb.org/anthology/N16-1174.
    Variable names follow the notation of that article.

    Args:
        inputs: The RNN outputs. They are first stacked along a new leading axis
            with `tf.stack(inputs, axis=0)`, so a list of `max_time` tensors of shape
            `[batch_size, depth]` becomes `[max_time, batch_size, depth]`.
        attention_size: Linear size of the Attention weights.
        time_major: The layout of the stacked `inputs`.
            If True (the default), the stacked tensor is `[max_time, batch_size, depth]`
            and is transposed to batch-major before the attention is computed.
            If False, the stacked tensor must already be `[batch_size, max_time, depth]`.
        return_alphas: Whether to return the attention coefficients along with the
            layer's output. Used for visualization purpose.
    Returns:
        The Attention output `Tensor` of shape `[batch_size, depth]`, or the tuple
        `(output, alphas)` with `alphas` of shape `[batch_size, max_time]` when
        `return_alphas` is True.
    """

    stacked = tf.stack(inputs, axis=0)

    # (T,B,D) => (B,T,D) when the stacked inputs are time-major
    rnn_outputs = tf.transpose(stacked, [1, 0, 2]) if time_major else stacked

    depth = rnn_outputs.shape[2].value  # D value - hidden size of the RNN layer

    # Trainable parameters (created in this order: W, b, u)
    w_omega = tf.Variable(tf.random_normal([depth, attention_size], stddev=0.1))
    b_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))
    u_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))

    with tf.name_scope('Variable'):
        # Fully connected layer with tanh applied to each of the B*T timestamps;
        # (B,T,D)*(D,A)=(B,T,A), where A=attention_size
        projected = tf.tensordot(rnn_outputs, w_omega, axes=1)
        v = tf.tanh(projected + b_omega)

    # Each timestamp's A-sized vector is reduced to a scalar score with `u`
    vu = tf.tensordot(v, u_omega, axes=1, name='vu')  # (B,T) shape
    alphas = tf.nn.softmax(vu, name='alphas')  # (B,T) shape

    # Attention-weighted sum over time; the result has (B,D) shape
    weighted = rnn_outputs * tf.expand_dims(alphas, -1)
    output = tf.reduce_sum(weighted, 1)

    return (output, alphas) if return_alphas else output


In [None]:
# CSV file handed to load_data().
path = '/Folder/your_file.csv'

# Tokenize tweets and map to one_hot labels
(x_, y_,
 vocabulary, vocabulary_inv,
 dataframe, labels) = load_data(path)
word_embeddings = load_embeddings(vocabulary)
print("===> Loaded Data and Parameters")

# split data: 20% held out for testing (fixed seed), then 30% of the rest for validation
x, x_test, y, y_test = train_test_split(x_, y_, random_state=42, test_size=0.2)
x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.3)

# report the size of every split
logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(*map(len, (x_train, x_dev, x_test))))
logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(*map(len, (y_train, y_dev, y_test))))

In [None]:
# configurations
n_steps = x_train.shape[1]
# Derive the number of classes from the one-hot labels instead of hard-coding 3,
# so the placeholders always match the data.
n_classes = y_train.shape[1]
vocabulary_size = len(vocabulary)
EMBEDDING_DIM = 250
ATTENTION_SIZE = 250
HIDDEN_SIZE = 250
num_layers = 2
num_hidden = 250
MODEL_PATH = './BLGSN_1000'
DELTA = 0.5
KEEP_PROB = 0.5
batch_size = 64
BATCH_SIZE = 64
NUM_EPOCHS = 10
INDEX_FROM = 0
epochs = 10
dropout_keep_prob=0.5

# model config
cell_type = "LSTM"
assert(cell_type in ["RNN", "LSTM", "GRU"])
hidden_structs = [250,250]
dilations = [1,2]
assert(len(hidden_structs) == len(dilations))

with tf.name_scope('Inputs'):
    x = tf.placeholder(tf.int32, [None, n_steps], name='batch_ph')
    y = tf.placeholder(tf.float32, [None, n_classes], name='target_ph')
    # Fed with the training keep probability while training and with 1.0 for
    # validation / prediction; every dropout in the graph is driven by it.
    keep_prob_ph = tf.placeholder(tf.float32, name='keep_prob_ph')

with tf.name_scope('Embedding_layer'):
    embeddings_var = tf.Variable(tf.random_uniform([vocabulary_size, EMBEDDING_DIM], -1.0,1.0), trainable=True)
    embedded_chars = tf.nn.embedding_lookup(embeddings_var, x, max_norm=None, validate_indices=True, partition_strategy='mod')

# One forward and one backward cell. (The former `for layer in range(num_layers)`
# loop only overwrote the same pair on every iteration.)
cell_fw = tf.contrib.rnn.LSTMCell(num_hidden)
forward = tf.contrib.rnn.DropoutWrapper(cell_fw, input_keep_prob=keep_prob_ph)

cell_bw = tf.contrib.rnn.LSTMCell(num_hidden)
backward = tf.contrib.rnn.DropoutWrapper(cell_bw, input_keep_prob=keep_prob_ph)

with tf.name_scope('Bi_Directional_Layer'):
    # NOTE(review): `add_together` is not consumed by anything below - the dilated
    # RNN reads `embedded_chars` directly - so this layer does not influence the
    # loss. Confirm whether it was meant to feed drnn_classification().
    outputs, output_states = tf.nn.bidirectional_dynamic_rnn(forward,backward, inputs=embedded_chars,dtype=tf.float32)
    last_state_fw, last_state_bw = outputs
    add_together = tf.math.add(last_state_fw, last_state_bw)

global_step = tf.Variable(0, name='global_step', trainable=False)
# Pass the placeholder (not the constant KEEP_PROB) so dropout inside the dilated
# LSTM cells is switched off when keep_prob_ph is fed with 1.0.
pred = drnn_classification(embedded_chars, hidden_structs, dilations, n_steps, keep_prob_ph, EMBEDDING_DIM,cell_type)

with tf.name_scope('Attention_layer'):
    attention_output, alphas = attention(pred, ATTENTION_SIZE, return_alphas=True)
    tf.summary.histogram('alphas', alphas)
    drop = tf.nn.dropout(attention_output, keep_prob_ph)

with tf.name_scope('Fully_connected_layer'):
    # The attention output has the width of the top dilated layer.
    W = tf.Variable(tf.truncated_normal([hidden_structs[-1], n_classes], stddev=0.1))
    b = tf.Variable(tf.constant(0., shape=[n_classes]))
    y_hat = tf.nn.xw_plus_b(drop, W, b)
    predictions = tf.argmax(input=y_hat, axis=1, name='predictions')
    print(y_hat)
    probs = tf.nn.softmax(y_hat)
    tf.summary.histogram('W', W)

with tf.name_scope('Loss'):
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=y_hat, labels=y))
    tf.summary.scalar('loss', loss)
    # Passing global_step makes every training step increment the counter.
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(loss, global_step=global_step)

with tf.name_scope('Accuracy'):
    # Fraction of samples whose arg-max class equals the true class. Comparing the
    # rounded softmax element-wise against the one-hot target (as before) also
    # rewards every correctly predicted zero and so overstates the accuracy.
    accuracy = tf.reduce_mean(tf.cast(tf.equal(predictions, tf.argmax(input=y, axis=1)), tf.float32))
    tf.summary.scalar('accuracy', accuracy)

with tf.name_scope('correct_predictions'):
    correct_predictions = tf.equal(tf.argmax(input=y, axis=1), predictions)
    num_correct = tf.reduce_sum(input_tensor=tf.cast(correct_predictions, 'float'),name='correct_predictions')

merged = tf.summary.merge_all()
print("Loaded Model")

In [None]:
# Batch generators
def batch_generator(X, y, batch_size):
    """Yield shuffled `(X_batch, y_batch)` pairs forever.

    The samples are shuffled, served in consecutive full batches of
    `batch_size` rows (an incomplete trailing batch is dropped) and then
    reshuffled for the next pass.

    Args:
        X: numpy array of samples, indexed along axis 0.
        y: numpy array of targets with the same number of rows as `X`.
        batch_size: number of rows per batch; must be between 1 and the
            number of rows in `X`.

    Raises:
        ValueError: on the first `next()` call if `batch_size` is not in
            that range. Otherwise the generator could never produce a full
            batch and would loop forever.
    """
    size = X.shape[0]
    if batch_size <= 0 or batch_size > size:
        raise ValueError(
            'batch_size must be between 1 and the number of samples '
            '({}), got {}'.format(size, batch_size))

    indices = np.arange(size)
    while True:
        # New random order for every pass; fancy indexing copies, so the
        # caller's arrays are never modified.
        np.random.shuffle(indices)
        X_shuffled = X[indices]
        y_shuffled = y[indices]
        for start in range(0, size - batch_size + 1, batch_size):
            stop = start + batch_size
            yield X_shuffled[start:stop], y_shuffled[start:stop]
            
train_batch_generator = batch_generator(x_train, y_train, batch_size)
test_batch_generator = batch_generator(x_dev, y_dev, batch_size)
predict_generator = batch_generator(x_test, y_test, batch_size)

saver = tf.train.Saver()

# Per-batch loss histories. Created once, outside the epoch loop, so that the
# plot at the end covers the whole run and not only the last epoch.
train_loss_l = []
val_loss_l = []

with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))) as sess:
    sess.run(tf.global_variables_initializer())
    print("Start learning...")
    for epoch in range(epochs):
        loss_train = 0
        loss_val = 0
        accuracy_train = 0
        accuracy_val = 0

        print("epoch: {}\t".format(epoch), end="")

        # Training
        num_batches = x_train.shape[0] // batch_size
        for _ in tqdm(range(num_batches)):
            x_batch, y_batch = next(train_batch_generator)
            loss_tr, acc, _, summary = sess.run([loss, accuracy, optimizer, merged],
                                                feed_dict={x: x_batch,
                                                           y: y_batch,
                                                           keep_prob_ph: 0.5})
            train_loss_l.append(loss_tr)
            accuracy_train += acc
            # exponentially smoothed training loss
            loss_train = loss_tr * DELTA + loss_train * (1 - DELTA)
        if num_batches:
            accuracy_train /= num_batches

        # Validation (dropout disabled)
        num_batches = x_dev.shape[0] // batch_size
        for _ in tqdm(range(num_batches)):
            x_batch, y_batch = next(test_batch_generator)
            val_loss, val_acc, summary = sess.run([loss, accuracy, merged],
                                                     feed_dict={x: x_batch,
                                                                y: y_batch,
                                                                keep_prob_ph: 1.0})
            val_loss_l.append(val_loss)
            accuracy_val += val_acc
            loss_val += val_loss
        # Skip the averaging when the split is smaller than one batch.
        if num_batches:
            accuracy_val /= num_batches
            loss_val /= num_batches

        print("loss: {:.3f}, val_loss: {:.3f}, acc: {:.3f}, val_acc: {:.3f}".format(
            loss_train, loss_val, accuracy_train, accuracy_val
        ))

    # predict x_test
    num_batches = x_test.shape[0] // batch_size
    y_true_batches = []
    y_pred_batches = []
    for _ in tqdm(range(num_batches)):
        x_batch, y_batch = next(predict_generator)
        # keep_prob 1.0: dropout must be off when predicting.
        loss_pred, acc_pred, n_correct, y_pred_batch = sess.run([loss, accuracy, num_correct, predictions],
                                                                feed_dict={x: x_batch,
                                                                           y: y_batch,
                                                                           keep_prob_ph: 1.0})
        # Collect every batch so the metrics cover all evaluated test samples,
        # not only the last batch.
        y_true_batches.append(np.argmax(y_batch, 1))
        y_pred_batches.append(y_pred_batch)

    if y_true_batches:
        y_true = np.concatenate(y_true_batches)
        y_pred = np.concatenate(y_pred_batches)
        print(y_true)
        print(y_pred)

        print("Precision", sk.metrics.precision_score(y_true, y_pred,average='weighted'))
        print("Recall", sk.metrics.recall_score(y_true, y_pred,average='weighted'))
        print("f1_score", sk.metrics.f1_score(y_true, y_pred,average='weighted'))
        print("confusion_matrix")
        print(sk.metrics.confusion_matrix(y_true, y_pred))
    else:
        logging.warning('The test split holds fewer samples than one batch; skipping test metrics')

    # get probability distribution for labels
    # NOTE(review): predictions_2 is overwritten on every iteration, so only the
    # probabilities of the last validation batch are kept - confirm this is intended.
    num_batches = x_dev.shape[0] // batch_size
    for _ in tqdm(range(num_batches)):
        x_batch, y_batch = next(test_batch_generator)
        predictions_2 = sess.run(probs, feed_dict={x: x_batch,
                                                   y: y_batch,
                                                   keep_prob_ph: 1.0})

    saver.save(sess, MODEL_PATH)
    # NOTE(review): the `summary` values fetched above are never written by this
    # block (no summary writer is created here), so ./logdir may be empty.
    print("Run 'tensorboard --logdir=./logdir' to checkout tensorboard logs.")
    plt.plot(train_loss_l)
    plt.plot(val_loss_l)
    plt.show()
    