In [46]:
import nltk
import progressbar
import re
import numpy as np
import pandas as pd
import logging
import sys
import keras
from keras.layers import (
    Dense, GRU, TimeDistributed, Input,
    Embedding, Bidirectional, Lambda
)
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical
from keras import backend as K
from nltk.tokenize import sent_tokenize
import _pickle as cPickle
from keras.models import load_model
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Mounting Drive
# Mounts Google Drive at /content/drive inside the Colab runtime; the dataset and
# GloVe paths used later are given relative to this mount ('drive/My Drive/...').
# The call is interactive: it asks for an authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
path_dataset = "drive/My Drive/dataset.csv" # path to the dataset CSV on the mounted Drive

In [4]:
dataset = pd.read_csv(path_dataset) # loading dataset; later cells read its 'text', 'split' and 'label' columns

In [5]:
MAX_WORDS_PER_SENT = 50 # number of word ids kept per sentence (sentences are padded/truncated to this length)
MAX_SENT = 40 # number of sentences kept per case; the encoding loops keep the FIRST MAX_SENT sentences ([0:MAX_SENT])
MAX_VOC_SIZE = 100000 # max vocabulary size (passed to the Tokenizer as num_words)
GLOVE_DIM = 180 # dimension of the GloVe embeddings we are feeding to the network

In [6]:
# Work on a real copy of the dataset. A plain `dataset2 = dataset` only binds a
# second name to the same DataFrame, so the text cleaning applied to `dataset2`
# would silently rewrite `dataset` as well.
dataset2 = dataset.copy()

In [7]:
def remove_quotations(text): # strips quote characters and backslashes
    """
    Return ``text`` with every backslash, single quote and double quote
    deleted. All other characters are left untouched.
    """
    # A single character class removes the three symbols in one pass.
    return re.sub(r"[\\'\"]", "", text)

In [8]:
# Clean the text column in two passes: drop quotes/backslashes, then trim
# surrounding whitespace and lower-case each document.
dataset2['text'] = (
    dataset2['text']
    .apply(remove_quotations)
    .apply(lambda raw: raw.strip().lower())
)

In [9]:
# Partition the cleaned frame on its 'split' column; the 'dev' rows become the
# validation set.
train, test, val = (
    dataset2.loc[dataset2['split'] == split_name]
    for split_name in ('train', 'test', 'dev')
)
# Text of every row, regardless of split
text = dataset2['text'].values

In [10]:
# Word-level Keras tokenizer; num_words=MAX_VOC_SIZE caps the vocabulary it uses.
word_tokenizer = Tokenizer(num_words=MAX_VOC_SIZE) # creating a word tokenizer
# `text` holds the text of every row of dataset2, so the vocabulary is built from
# all splits together.
word_tokenizer.fit_on_texts(text) # fitting the tokenizer on our text

In [11]:
# Raw documents (x_*) and their labels (y_*) for the train, val and test splits
x_train, y_train = train['text'].values, train['label'].values
x_val, y_val = val['text'].values, val['label'].values
x_test, y_test = test['text'].values, test['label'].values

In [14]:
# Encode every training case as a (MAX_SENT, MAX_WORDS_PER_SENT) grid of word ids:
# one row per sentence, at most MAX_WORDS_PER_SENT ids per row, 0 used as padding.
X_train = np.zeros((len(x_train), MAX_SENT, MAX_WORDS_PER_SENT), dtype='int32')
# Wrap the array itself (not the enumerate object, which has no length) so the
# progress bar knows the total and can report percentage / ETA.
for i, text in enumerate(progressbar.progressbar(x_train)):
    sentences = sent_tokenize(text)
    tokenized_sentences = word_tokenizer.texts_to_sequences(
        sentences
    )
    # Bring every sentence to exactly MAX_WORDS_PER_SENT ids
    tokenized_sentences = pad_sequences(
        tokenized_sentences, maxlen=MAX_WORDS_PER_SENT
    )

    pad_size = MAX_SENT - tokenized_sentences.shape[0]

    if pad_size < 0:
        # More than MAX_SENT sentences: keep the first MAX_SENT
        tokenized_sentences = tokenized_sentences[0:MAX_SENT]
    else:
        # Fewer than MAX_SENT sentences: append all-zero rows
        tokenized_sentences = np.pad(
            tokenized_sentences, ((0,pad_size),(0,0)),
            mode='constant', constant_values=0
        )

    # Store this observation as the i-th observation in
    # the data matrix
    X_train[i] = tokenized_sentences

| |   #                                           | 32304 Elapsed Time: 0:04:51


In [15]:
# Encode every validation case as a (MAX_SENT, MAX_WORDS_PER_SENT) grid of word ids:
# one row per sentence, at most MAX_WORDS_PER_SENT ids per row, 0 used as padding.
X_val = np.zeros((len(x_val), MAX_SENT, MAX_WORDS_PER_SENT), dtype='int32')
# Wrap the array itself (not the enumerate object, which has no length) so the
# progress bar knows the total and can report percentage / ETA.
for i, text in enumerate(progressbar.progressbar(x_val)):
    sentences = sent_tokenize(text)
    tokenized_sentences = word_tokenizer.texts_to_sequences(
        sentences
    )
    # Bring every sentence to exactly MAX_WORDS_PER_SENT ids
    tokenized_sentences = pad_sequences(
        tokenized_sentences, maxlen=MAX_WORDS_PER_SENT
    )

    pad_size = MAX_SENT - tokenized_sentences.shape[0]

    if pad_size < 0:
        # More than MAX_SENT sentences: keep the first MAX_SENT
        tokenized_sentences = tokenized_sentences[0:MAX_SENT]
    else:
        # Fewer than MAX_SENT sentences: append all-zero rows
        tokenized_sentences = np.pad(
            tokenized_sentences, ((0,pad_size),(0,0)),
            mode='constant', constant_values=0
        )

    # Store this observation as the i-th observation in
    # the data matrix
    X_val[i] = tokenized_sentences

| |         #                                       | 993 Elapsed Time: 0:00:10


In [16]:
# Encode every test case as a (MAX_SENT, MAX_WORDS_PER_SENT) grid of word ids:
# one row per sentence, at most MAX_WORDS_PER_SENT ids per row, 0 used as padding.
# x_test_check keeps, for each case, the raw sentences that correspond to the
# rows actually stored in X_test.
x_test_check = []
X_test = np.zeros((len(x_test), MAX_SENT, MAX_WORDS_PER_SENT), dtype='int32')
# Wrap the array itself (not the enumerate object, which has no length) so the
# progress bar knows the total and can report percentage / ETA.
for i, text in enumerate(progressbar.progressbar(x_test)):
    sentences = sent_tokenize(text)
    # The matrix below keeps the FIRST MAX_SENT sentences, so record the same
    # slice here; otherwise row j of X_test[i] and x_test_check[i][j] would refer
    # to different sentences for long cases.
    x_test_check.append(sentences[:MAX_SENT])
    tokenized_sentences = word_tokenizer.texts_to_sequences(
        sentences
    )
    # Bring every sentence to exactly MAX_WORDS_PER_SENT ids
    tokenized_sentences = pad_sequences(
        tokenized_sentences, maxlen=MAX_WORDS_PER_SENT
    )

    pad_size = MAX_SENT - tokenized_sentences.shape[0]

    if pad_size < 0:
        # More than MAX_SENT sentences: keep the first MAX_SENT
        tokenized_sentences = tokenized_sentences[0:MAX_SENT]
    else:
        # Fewer than MAX_SENT sentences: append all-zero rows
        tokenized_sentences = np.pad(
            tokenized_sentences, ((0,pad_size),(0,0)),
            mode='constant', constant_values=0
        )

    # Store this observation as the i-th observation in
    # the data matrix
    X_test[i] = tokenized_sentences

| |                           #                    | 1516 Elapsed Time: 0:00:16


In [19]:
# One-hot encode the labels. num_classes is pinned to 2 (the output size the HAN
# is built with) so every split gets the same width even if one of them happens
# to contain a single class only.
Y_train = to_categorical(y_train, num_classes=2)
Y_val = to_categorical(y_val, num_classes=2)
Y_test = to_categorical(y_test, num_classes=2)

In [21]:
# loading the glove embeddings(trained on our dataset)
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# from a trusted location.
# The loaded object is later indexed as model['dictionary'] (word -> row number)
# and model['word_vectors'] (row number -> vector).
path_glove_pretrained_embeddings = 'drive/My Drive/Glove_model/glove_epoch20.model'
with open(path_glove_pretrained_embeddings, 'rb') as f:
  model = cPickle.load(f)

In [22]:
# initializing the embedding matrix
# One row per tokenizer word index plus one extra row for index 0.
# Every row starts with uniform random values in [0, 1). No random seed is set
# here, so rows that are not overwritten later differ from run to run.
embedding_matrix = np.random.random(
    (len(word_tokenizer.word_index) + 1, GLOVE_DIM)
)

In [23]:
# setting the embedding of pad token
# Index 0 is the value used to pad sentences and cases, so its vector is all zeros.
embedding_matrix[0] = 0

In [24]:
# The raw and cleaned frames are no longer needed; drop both names to free memory
del dataset, dataset2

In [25]:
# Fill the embedding matrix: copy the GloVe vector of every tokenizer word that
# the GloVe dictionary knows into the row given by the tokenizer index.
for token, row in word_tokenizer.word_index.items():
  glove_row = model['dictionary'].get(token)
  if glove_row is None:
    # Word not present in the GloVe dictionary: leave its row as initialised
    continue
  embedding_matrix[row] = model['word_vectors'][glove_row]

In [26]:
embedding_matrix.shape  # sanity check: (len(word_index) + 1, GLOVE_DIM)

(299496, 180)

In [28]:
# creating the attention layer for our HAN model
class AttentionLayer(keras.layers.Layer):
    """
    Attention over the second axis of a 3d input: computes one weight per
    time step and returns the weighted sum of the time steps.
    """
    def __init__(self, context_vector_length=100, **kwargs):
        """
        An implementation of an attention layer. This layer
        accepts a 3d Tensor (batch_size, time_steps, input_dim) and
        applies a single layer attention mechanism in the time
        direction (the second axis).
        :param context_vector_length: (int) The size of the hidden context vector.
            If set to 1 this layer reduces to a standard attention layer.
        :param kwargs: Any argument that the baseclass Layer accepts.
        """
        self.context_vector_length = context_vector_length
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """
        Create the two trainable weights of the layer.
        :param input_shape: shape of the 3d input; its third entry is the
            feature dimension.
        """
        dim = input_shape[2]

        # Projection matrix from the input features to the hidden
        # representation of size context_vector_length
        self.W = self.add_weight(
            name='W', shape=(dim, self.context_vector_length),
            initializer=keras.initializers.get('uniform'),
            trainable=True
        )

        # Context vector the hidden representation is compared against
        self.u = self.add_weight(
            name='context_vector', shape=(self.context_vector_length, 1),
            initializer=keras.initializers.get('uniform'),
            trainable=True
        )

        super(AttentionLayer, self).build(input_shape)

    def _get_attention_weights(self, X):
        """
        Computes the attention weights for each timestep in X
        :param X: 3d-tensor (batch_size, time_steps, input_dim)
        :return: 2d-tensor (batch_size, time_steps) of attention weights
        """
        # Compute a time-wise stimulus, i.e. a stimulus for each
        # time step. For this first compute a hidden layer of
        # dimension self.context_vector_length and take the
        # similarity of this layer with self.u as the stimulus
        u_tw = K.tanh(K.dot(X, self.W))
        tw_stimulus = K.dot(u_tw, self.u)

        # Remove the last axis and apply softmax to the stimulus to
        # get a probability.
        # NOTE(review): no mask is applied here, so every time step takes
        # part in the softmax.
        tw_stimulus = K.reshape(tw_stimulus, (-1, tw_stimulus.shape[1]))
        att_weights = K.softmax(tw_stimulus)

        return att_weights

    def call(self, X):
        """
        Weight every time step of X by its attention weight and sum over time.
        :param X: 3d-tensor (batch_size, time_steps, input_dim)
        :return: 2d-tensor (batch_size, input_dim)
        """
        att_weights = self._get_attention_weights(X)

        # Reshape the attention weights to match the dimensions of X
        att_weights = K.reshape(att_weights, (-1, att_weights.shape[1], 1))
        att_weights = K.repeat_elements(att_weights, X.shape[-1], -1)

        # Multiply each input by its attention weights
        weighted_input = keras.layers.Multiply()([X, att_weights])

        # Sum in the direction of the time-axis.
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output keeps the batch and feature axes; the time axis is summed away."""
        return input_shape[0], input_shape[2]

    def get_config(self):
        """Return the base layer config extended with context_vector_length."""
        config = {
            'context_vector_length': self.context_vector_length
        }
        base_config = super(AttentionLayer, self).get_config()
        return {**base_config, **config}

In [30]:
# Creating the entire HAN model with attention layer and word and setence encoders
class HAN(Model):
    """
    Hierarchical Attention Network: a word encoder with attention builds one
    vector per sentence, a sentence encoder with attention builds one vector
    per document, and a softmax Dense layer classifies the document.
    """
    def __init__(
            self, max_words, max_sentences, output_size,
            embedding_matrix, word_encoding_dim=200,
            sentence_encoding_dim=200, inputs=None,
            outputs=None, name='han-for-docla'
    ):
        """
        A Keras implementation of Hierarchical Attention networks
        for document classification.
        :param max_words: The maximum number of words per sentence
        :param max_sentences: The maximum number of sentences
        :param output_size: The dimension of the last layer (i.e.
            the number of classes you wish to predict)
        :param embedding_matrix: The embedding matrix to use for
            representing words
        :param word_encoding_dim: The dimension of the GRU
            layer in the word encoder.
        :param sentence_encoding_dim: The dimension of the GRU
            layer in the sentence encoder.
        :param inputs: Accepted but not used by this constructor; the input
            tensor is created by _build_network.
        :param outputs: Accepted but not used by this constructor; the output
            tensor is created by _build_network.
        :param name: Name given to the underlying Keras model.
        """
        self.max_words = max_words
        self.max_sentences = max_sentences
        self.output_size = output_size
        self.embedding_matrix = embedding_matrix
        self.word_encoding_dim = word_encoding_dim
        self.sentence_encoding_dim = sentence_encoding_dim


        # The graph is built first; its end points are then handed to the
        # functional Model constructor.
        in_tensor, out_tensor = self._build_network()

        super(HAN, self).__init__(
            inputs=in_tensor, outputs=out_tensor, name=name
        )

    def build_word_encoder(self, max_words, embedding_matrix, encoding_dim=200):
        """
        Build the model that embeds and encodes in context the
        words used in a sentence. The returned model takes a tensor of shape
        (batch_size, max_length) that represents a collection of sentences
        and returns an encoded representation of these sentences.
        :param max_words: (int) The maximum sentence length this model accepts
        :param embedding_matrix: (2d array-like) A matrix with the i-th row
            representing the embedding of the word represented by index i.
        :param encoding_dim: (int, should be even) The dimension of the
            bidirectional encoding layer. Half of the nodes are used in the
            forward direction and half in the backward direction.
        :return: Instance of keras.Model
        """
        assert encoding_dim % 2 == 0, "Embedding dimension should be even"

        vocabulary_size = embedding_matrix.shape[0]
        embedding_dim = embedding_matrix.shape[1]

        # The embedding weights are fixed (trainable=False): they are used as
        # given and not updated during training.
        embedding_layer = Embedding(
            vocabulary_size, embedding_dim,
            weights=[embedding_matrix], input_length=max_words,
            trainable=False
        )

        sentence_input = Input(shape=(max_words,), dtype='int32')
        embedded_sentences = embedding_layer(sentence_input)
        encoded_sentences = Bidirectional(
            GRU(int(encoding_dim / 2), return_sequences=True)
        )(embedded_sentences)

        return Model(
            inputs=[sentence_input], outputs=[encoded_sentences], name='word_encoder'
        )

    def build_sentence_encoder(self, max_sentences, summary_dim, encoding_dim=200):
        """
        Build the encoder that encodes the vector representation of
        sentences in their context.
        :param max_sentences: The maximum number of sentences that can be
            passed. Use zero-padding to supply texts with fewer sentences.
        :param summary_dim: (int) The dimension of the vectors that summarizes
            sentences. Should be equal to the encoding_dim of the word
            encoder.
        :param encoding_dim: (int, even) The dimension of the vector that
            summarizes sentences in context. Half is used in forward direction,
            half in backward direction.
        :return: Instance of keras.Model
        """
        assert encoding_dim % 2 == 0, "Embedding dimension should be even"

        text_input = Input(shape=(max_sentences, summary_dim))
        encoded_sentences = Bidirectional(
            GRU(int(encoding_dim / 2), return_sequences=True)
        )(text_input)
        return Model(
            inputs=[text_input], outputs=[encoded_sentences], name='sentence_encoder'
        )

    def _build_network(self):
        """
        Build the graph that represents this network
        :return: in_tensor, out_tensor, Tensors representing the input and output
            of this network.
        """
        in_tensor = Input(shape=(self.max_sentences, self.max_words))

        word_encoder = self.build_word_encoder(
            self.max_words, self.embedding_matrix, self.word_encoding_dim
        )

        # Apply the word encoder to every sentence of the document
        word_rep = TimeDistributed(
            word_encoder, name='word_encoder'
        )(in_tensor)

        # Sentence Rep is a 3d-tensor (batch_size, max_sentences, word_encoding_dim)
        sentence_rep = TimeDistributed(
            AttentionLayer(), name='word_attention'
        )(word_rep)

        doc_rep = self.build_sentence_encoder(
            self.max_sentences, self.word_encoding_dim, self.sentence_encoding_dim
        )(sentence_rep)

        # We get the final representation by applying our attention mechanism
        # to the encoded sentences
        doc_summary = AttentionLayer(name='sentence_attention')(doc_rep)

        out_tensor = Dense(
            self.output_size, activation='softmax', name='class_prediction'
        )(doc_summary)

        return in_tensor, out_tensor

    def get_config(self):
        """
        Return the constructor arguments together with the config of the
        underlying functional model (stored under 'base_config').
        NOTE(review): 'embedding_matrix' is returned as the object that was
        passed to the constructor — confirm that whatever serializes this
        config can handle it.
        """
        config = {
            'max_words': self.max_words,
            'max_sentences': self.max_sentences,
            'output_size': self.output_size,
            'embedding_matrix': self.embedding_matrix,
            'word_encoding_dim': self.word_encoding_dim,
            'sentence_encoding_dim': self.sentence_encoding_dim,
            'base_config': super(HAN, self).get_config()
        }

        return config

    @classmethod
    def from_config(cls, config, custom_objects=None):
        """
        Keras' API isn't really extendible at this point
        therefore we need to use a bit hacky solution to
        be able to correctly reconstruct the HAN model
        from a config. This therefore does not reconstruct
        an instance of HAN model, but actually a standard
        Keras model that behaves exactly the same.
        """
        # Only 'base_config' is used; the remaining keys of config are ignored.
        base_config = config.pop('base_config')

        return Model.from_config(
            base_config, custom_objects=custom_objects
        )

    def predict_sentence_attention(self, X):
        """
        For a given set of texts predict the attention
        weights for each sentence.
        :param X: 3d-tensor, similar to the input for predict
        :return: 2d array (num_obs, max_sentences) containing
            the attention weights for each sentence
        """
        att_layer = self.get_layer('sentence_attention')
        prev_tensor = att_layer.input

        # Create a temporary dummy layer to hold the
        # attention weights tensor
        dummy_layer = Lambda(
            lambda x: att_layer._get_attention_weights(x)
        )(prev_tensor)

        # A throw-away model from the network input to the attention weights
        return Model(self.input, dummy_layer).predict(X)

In [31]:
# deleting the glove model as we have created our embedding matrix
# (`model` here is the unpickled GloVe object, not a Keras model)
del model

In [32]:
# Build the HAN with two output classes and 100-dimensional word and sentence
# encoders, then print the layer overview.
han_model = HAN(
    max_words=MAX_WORDS_PER_SENT,
    max_sentences=MAX_SENT,
    output_size=2,
    embedding_matrix=embedding_matrix,
    word_encoding_dim=100,
    sentence_encoding_dim=100,
)
han_model.summary()

Model: "han-for-docla"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 40, 50)            0         
_________________________________________________________________
word_encoder (TimeDistribute (None, 40, 50, 100)       53978580  
_________________________________________________________________
word_attention (TimeDistribu (None, 40, 100)           10100     
_________________________________________________________________
sentence_encoder (Model)     (None, 40, 100)           45300     
_________________________________________________________________
sentence_attention (Attentio (None, 100)               10100     
_________________________________________________________________
class_prediction (Dense)     (None, 2)                 202       
Total params: 54,044,282
Trainable params: 135,002
Non-trainable params: 53,909,280
___________________________________

In [33]:
# Compile with Adam and categorical cross-entropy (labels are one-hot encoded),
# tracking accuracy during training.
han_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [35]:
# Checkpoint callback: with save_best_only=True a file is written only after
# epochs that improve on the best value seen so far; the file name embeds the
# epoch number and the validation loss.
checkpoint_saver = ModelCheckpoint(
    filepath='HAN_Model.{epoch:02d}-{val_loss:.2f}.hdf5',
    verbose=1,
    save_best_only=True,
)

In [None]:
# Train for 5 epochs in batches of 20 cases, validating on the dev split after
# each epoch and letting the checkpoint callback save improved models.
han_model.fit(
    X_train,
    Y_train,
    batch_size=20,
    epochs=5,
    validation_data=(X_val, Y_val),
    callbacks=[checkpoint_saver],
)

In [39]:
# defining a function which calculates macro and micro precision, recall and f1
def metrics_calculator(preds, test_labels):
    """
    Compute macro- and micro-averaged precision, recall and F1 from a
    confusion matrix of the given labels.

    The number of classes is taken from the confusion matrix, so the function
    works for any number of classes (the binary case gives the same numbers
    as before).

    :param preds: 1d sequence of predicted class labels.
    :param test_labels: 1d sequence of true class labels, same length as preds.
    :return: tuple (macro_precision, macro_recall, macro_f1,
        micro_precision, micro_recall, micro_f1). macro_f1 is the harmonic
        mean of macro_precision and macro_recall (not the mean of per-class
        F1 scores). A class that is never predicted, or never occurs, makes
        the corresponding ratio 0/0, which NumPy reports as nan.
    """
    # Rows are true classes, columns are predicted classes.
    cm = np.asarray(confusion_matrix(test_labels, preds))
    n_classes = cm.shape[0]

    # Per-class counts: the diagonal holds the hits, the rest of a column are
    # false positives for that class, the rest of a row are false negatives.
    TP = np.diag(cm)
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP

    precision = TP / (TP + FP)
    recall = TP / (TP + FN)

    macro_precision = precision.sum() / n_classes
    macro_recall = recall.sum() / n_classes
    micro_precision = TP.sum() / (TP.sum() + FP.sum())
    micro_recall = TP.sum() / (TP.sum() + FN.sum())
    micro_f1 = (2*micro_precision*micro_recall)/(micro_precision + micro_recall)
    macro_f1 = (2*macro_precision*macro_recall)/(macro_precision + macro_recall)
    return macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1

In [None]:
# Class scores for every test case (one column per class)
predict = han_model.predict(X_test, batch_size = 20)
y_pred = predict > 0.5
# The predicted label is 1 exactly when the score of class 1 exceeds 0.5
y = [int(flags[1]) for flags in y_pred]

# Macro/micro precision, recall and F1 of the predictions on the test split
print(metrics_calculator(y, y_test))

In [None]:
# Class scores for every dev case (one column per class)
preds = han_model.predict(X_val, batch_size= 20)
y_pred_dev = preds > 0.5
# The predicted label is 1 exactly when the score of class 1 exceeds 0.5
y = [int(flags[1]) for flags in y_pred_dev]

# Macro/micro precision, recall and F1 of the predictions on the dev split
print(metrics_calculator(y, y_val))

In [None]:
# saving the final HAN Model
# Writes the model as it stands after the last training epoch to
# 'HAN_final.h5' in the current working directory.
han_model.save('HAN_final.h5')