# HAN: Hierarchical Attention Network for review classification

In this paper I will be implementing a Hierarchical Attention Network (HAN), an architecture that has shown promising results in sequence classification tasks.

It applies a bidirectional recurrent network (a bidirectional GRU in the code below) to the words of each sentence, followed by an attention mechanism that picks out the words most important to the meaning of the sentence and aggregates the representations of those informative words into a sentence vector. The same procedure is then applied to the resulting sentence vectors, producing a single vector that captures the meaning of the whole document; that vector is passed on for text classification.

The illustration of this structure is below. ![](https://cdn-images-1.medium.com/max/1600/1*28XVtq2lOjOmZhcSgu1NmQ.png)

The idea behind it is that words make sentences and sentences make reviews. The attention layer decides which words are important for sentences and which sentences are important for reviews. 

In [1]:
!pip install keras==2.0.3
import numpy as np
import pandas as pd
from collections import defaultdict
import re

import sys
import os

import keras
import tensorflow as tf

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, merge, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers

import nltk
nltk.download('punkt')
from nltk import tokenize



Using TensorFlow backend.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
# Hyper-parameters shared by the preprocessing and model cells below.
MAX_SENT_LENGTH = 100    # word slots kept per sentence (last axis of the data tensor)
MAX_SENTS = 15           # sentence slots kept per review (middle axis of the data tensor)
MAX_NB_WORDS = 20000     # passed to Tokenizer(num_words=...); word ids at or above it are skipped
EMBEDDING_DIM = 300      # width of each row of the embedding matrix
VALIDATION_SPLIT = 0.2   # fraction of the shuffled samples held out for validation


In [3]:
def download_and_load_dataset(force_download=False):
    """Fetch the Yelp review file and return its lines.

    The file is retrieved through ``tf.keras.utils.get_file``, which returns
    the local path of the downloaded file.

    Args:
        force_download: Currently unused; kept so existing callers that pass
            it keep working.

    Returns:
        A list with one string per line of the file (line endings included).
    """
    dataset = tf.keras.utils.get_file(
        fname="yelpZIP.txt",
        origin="https://storage.googleapis.com/lucas0/yelpZIP.txt",
        extract=False)
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(dataset) as handle:
        return handle.readlines()

def clean_str(string):
    """
    Normalise a raw review string.

    Drops every backslash, single quote and double quote, then trims the
    surrounding whitespace and lower-cases what is left.
    """
    # One pass with a character class removes all three unwanted characters.
    without_quotes = re.sub(r"[\\'\"]", "", string)
    trimmed = without_quotes.strip()
    return trimmed.lower()

import ast

# Raw lines of the downloaded file; each non-blank line is parsed into a
# record whose first item is the review text and whose second item is a
# truthy/falsy "deceptive" flag.
reviews = download_and_load_dataset()

data = {'review': [], 'deceptive': []}

for line in reviews:
    line = line.strip()
    if not line:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        continue
    # SECURITY: the file comes from the network, so it must not be run through
    # eval(), which would execute arbitrary code. ast.literal_eval only accepts
    # Python literals (tuples, lists, strings, numbers, booleans, None).
    # NOTE(review): assumes every record is a plain literal -- confirm against
    # the dataset if parsing fails.
    record = ast.literal_eval(line)
    data['review'].append(record[0])
    data['deceptive'].append(1 if record[1] else 0)

dataDict = pd.DataFrame.from_dict(data)
print(dataDict.shape)

# Cleaned review texts, in DataFrame row order.
reviews = [clean_str(raw_text) for raw_text in dataDict.review.values]

# Each cleaned review split into its list of sentences.
sentences = [tokenize.sent_tokenize(cleaned) for cleaned in reviews]

# Deceptive flag of each review, aligned with `reviews`.
labels = list(dataDict.deceptive.values)

# Word index built from the cleaned reviews.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(reviews)

(160933, 2)


In [4]:
# Encode every review as a (MAX_SENTS, MAX_SENT_LENGTH) grid of word ids.
# Unused slots stay 0; extra sentences and extra words are truncated.
data = np.zeros((len(reviews), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

word_index = tokenizer.word_index

for i, review_sentences in enumerate(sentences):
    for j, sent in enumerate(review_sentences[:MAX_SENTS]):
        k = 0  # next free word slot in sentence j
        for word in text_to_word_sequence(sent):
            if k >= MAX_SENT_LENGTH:
                # Sentence row is full; no point scanning the remaining words.
                break
            # .get() avoids a KeyError for a token missing from the index and
            # a second dictionary lookup for tokens that are present.
            word_id = word_index.get(word)
            if word_id is not None and word_id < MAX_NB_WORDS:
                data[i, j, k] = word_id
                k += 1

print('Total %s unique tokens.' % len(word_index))

labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Shuffle samples and labels with the same permutation.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Split on an explicit boundary index: slicing with [:-0] would produce an
# EMPTY training set whenever the validation count rounds down to zero.
split_at = data.shape[0] - nb_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

# Labels are 0/1, so each sum is the number of reviews labelled 1 in that split.
print('Number of genuine and deceptive reviews in traing and validation set')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))


Total 90929 unique tokens.
Shape of data tensor: (160933, 15, 100)
Shape of label tensor: (160933,)
Number of genuine and deceptive reviews in traing and validation set
64273
16193


In [0]:
import gensim

# get_file returns the local path of the downloaded file. Use that path
# directly instead of a hard-coded "~/.keras/datasets/..." string, which
# depends on the loader expanding "~" and on the default cache directory.
word2vec_path = tf.keras.utils.get_file(
      fname="GoogleNews-vectors-negative300.bin",
      origin="https://storage.googleapis.com/lucas0/GoogleNews-vectors-negative300.bin",
      extract=False)
word_vectors = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [0]:
# Row r of the matrix holds the pretrained vector of the word whose tokenizer
# id is r; words without a pretrained vector keep an all-zero row.
embedding_length = word_vectors.vector_size
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for token, row in word_index.items():
    if token not in word_vectors.vocab:
        continue
    embedding_matrix[row] = np.array(word_vectors[token], dtype=np.float32)

In [0]:
# Frozen embedding lookup initialised from `embedding_matrix`.
# mask_zero=True makes id 0 (the padding value) a masked position.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SENT_LENGTH,
    mask_zero=True,
    trainable=False,
)

In [0]:
class AttLayer(Layer):
    """Attention pooling over the time axis of a 3-D tensor.

    For an input ``x`` of shape ``(batch, steps, features)`` the layer computes

        u_t = tanh(x_t W + b)
        a_t = exp(u_t . u) / sum_t' exp(u_t' . u)    (masked steps get weight 0)
        out = sum_t a_t * x_t

    and returns ``out`` with shape ``(batch, features)``.

    Args:
        attention_dim: Size of the hidden projection used to score each step.
        **kwargs: Standard layer keyword arguments (e.g. ``name``), forwarded
            to the base ``Layer`` constructor.
    """

    def __init__(self, attention_dim, **kwargs):
        # Run the base constructor first so it cannot overwrite the attributes
        # set below (it initialises the layer's own bookkeeping state).
        super(AttLayer, self).__init__(**kwargs)
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = attention_dim

    def build(self, input_shape):
        """Create the projection matrix W, bias b and context vector u."""
        assert len(input_shape) == 3
        # Register weights through add_weight so the framework tracks them,
        # rather than assigning to trainable_weights by hand.
        self.W = self.add_weight(name='W',
                                 shape=(input_shape[-1], self.attention_dim),
                                 initializer=self.init,
                                 trainable=True)
        self.b = self.add_weight(name='b',
                                 shape=(self.attention_dim,),
                                 initializer=self.init,
                                 trainable=True)
        self.u = self.add_weight(name='u',
                                 shape=(self.attention_dim, 1),
                                 initializer=self.init,
                                 trainable=True)
        super(AttLayer, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        # The time axis is summed away in call(), so the incoming per-step
        # mask no longer matches the output and must not be propagated.
        return None

    def call(self, x, mask=None):
        # x: [batch_size, seq_len, features]
        # W: [features, attention_dim], b: [attention_dim], u: [attention_dim, 1]
        # uit = tanh(xW + b) -> [batch_size, seq_len, attention_dim]
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
        # One unnormalised score per step -> [batch_size, seq_len]
        ait = K.dot(uit, self.u)
        ait = K.squeeze(ait, -1)

        ait = K.exp(ait)

        if mask is not None:
            # Zero the scores of masked steps; the cast avoids float64
            # upcasting in theano.
            ait *= K.cast(mask, K.floatx())
        # Normalise over the time axis; epsilon guards against division by
        # zero when every step is masked.
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        ait = K.expand_dims(ait)
        weighted_input = x * ait
        output = K.sum(weighted_input, axis=1)

        return output

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(AttLayer, self).get_config()
        config['attention_dim'] = self.attention_dim
        return config


In [9]:
# Sentence encoder: word ids -> embeddings -> bidirectional GRU -> attention
# pooling, giving one vector per sentence.
sentence_input = Input(dtype='int32', shape=(MAX_SENT_LENGTH,))
embedded_sequences = embedding_layer(sentence_input)
l_lstm = Bidirectional(
    layer=GRU(units=100, return_sequences=True)
)(embedded_sequences)
l_att = AttLayer(attention_dim=100)(l_lstm)
sentEncoder = Model(inputs=sentence_input, outputs=l_att)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [0]:
# Review-level model: run the sentence encoder over every sentence slot, then
# bidirectional GRU -> attention pooling -> a single sigmoid output.
review_input = Input(dtype='int32', shape=(MAX_SENTS, MAX_SENT_LENGTH))
review_encoder = TimeDistributed(layer=sentEncoder)(review_input)
l_lstm_sent = Bidirectional(
    layer=GRU(units=100, return_sequences=True)
)(review_encoder)
l_att_sent = AttLayer(attention_dim=100)(l_lstm_sent)
preds = Dense(units=1, activation='sigmoid')(l_att_sent)
model = Model(inputs=review_input, outputs=preds)

In [14]:
# Binary cross-entropy with Adam, reporting accuracy; then print the layer table.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 15, 100)           0         
_________________________________________________________________
time_distributed_2 (TimeDist (None, 15, 200)           27539800  
_________________________________________________________________
bidirectional_3 (Bidirection (None, 15, 200)           180600    
_________________________________________________________________
att_layer_3 (AttLayer)       (None, 200)               20200     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 201       
Total params: 27,740,801
Trainable params: 461,801
Non-trainable params: 27,279,000
_________________________________________________________________


In [15]:
# Train for 20 epochs in batches of 64, evaluating on the held-out split
# after each epoch.
print("model fitting - Hierachical attention network")
model.fit(
    x=x_train,
    y=np.array(y_train),
    batch_size=64,
    epochs=20,
    validation_data=(x_val, np.array(y_val)),
)

model fitting - Hierachical attention network
Instructions for updating:
Use tf.cast instead.
Train on 128747 samples, validate on 32186 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f3694ca2cc0>