First, install the following dependencies:

conda install theano pygpu

Replace the contents of $HOME/.theanorc with the following:

[global]

floatX = float32

device = gpu0

[lib]

gpuarray.preallocate=1


In [1]:
import os

# Both environment variables are set before `import theano` below so that the
# import sees them: THEANO_FLAGS selects float32 on the first GPU, and the CUDA
# 8.0 bin directory is appended to PATH.
os.environ['THEANO_FLAGS'] = 'floatX=float32,device=gpu0'
os.environ['PATH'] += ':/usr/local/cuda-8.0/bin'

import theano

# Show which device Theano was configured with
print(theano.config.device)

 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5110)


gpu0


In [2]:
import numpy as np
import pandas as pd
from collections import defaultdict
import os 
os.environ['KERAS_BACKEND'] = 'theano'
import subprocess
import time

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.optimizers import SGD

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model

from keras.utils import get_file
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers, regularizers, optimizers
from keras.callbacks import History, CSVLogger

Using Theano backend.


In [3]:
# Shape limits of the encoded corpus
MAX_SENTS = 30            # sentences kept per review
MAX_SENT_LENGTH = 100     # word tokens kept per sentence

# Vocabulary and embedding sizes
MAX_NB_WORDS = 20000      # tokenizer indices at or above this are encoded as 'UNK'
MAX_VOCAB_SIZE = 50000    # passed to word2vec as max_vocab_size
EMBEDDING_DIM = 200       # width of each word vector

In [35]:
# Training CSV inside the current user's Keras dataset cache (~/.keras/datasets).
# Built from the home directory instead of a hard-coded /home/<user> path so the
# notebook runs unchanged for other users.
trainFile = os.path.join(os.path.expanduser('~'), '.keras', 'datasets', 'amazon_reviews_train.csv')

# read: the file has no header row, so the three column names are supplied here
train_data = pd.read_csv(trainFile, header=None, names=['rating', 'title', 'text'])

In [36]:
import nltk

# sent_tokenize needs the 'punkt' sentence-splitting model
nltk.download('punkt')

# Pull the two columns out once instead of doing a label-based scalar lookup
# (train_data['text'][idx]) per row: that is slow on a large frame and only
# correct when the frame carries the default 0..n-1 index.
texts = train_data['text'].tolist()      # raw review bodies, in file order
labels = train_data['rating'].tolist()   # raw ratings, aligned with texts

# reviews[i] is the list of sentences of texts[i]
reviews = [nltk.tokenize.sent_tokenize(text) for text in texts]

[nltk_data] Downloading package punkt to /home/anargyri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [34]:
# Fit a word -> integer index on the raw review texts.
# The cells below read tokenizer.word_index directly and apply the MAX_NB_WORDS
# cut-off themselves (word_index is not truncated: its full size is printed later).
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)

In [45]:
# Integer-encode every review into a fixed (MAX_SENTS, MAX_SENT_LENGTH) grid.
# Unused cells keep the initial value 0.
data = np.zeros((len(texts), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
# One list of (possibly 'UNK'-substituted) words per encoded sentence
doc_lst = []

# Words whose tokenizer index is below MAX_NB_WORDS keep that index; every other
# word is written as MAX_NB_WORDS and recorded as 'UNK'.
# Only the first MAX_SENTS sentences per doc and the first MAX_SENT_LENGTH words
# per sentence are kept.
vocab = tokenizer.word_index

for i, sentences in enumerate(reviews):
    for j, sent in enumerate(sentences[:MAX_SENTS]):
        kept_words = []
        for k, word in enumerate(text_to_word_sequence(sent)[:MAX_SENT_LENGTH]):
            word_id = vocab.get(word)
            if word_id is None or word_id >= MAX_NB_WORDS:
                data[i, j, k] = MAX_NB_WORDS
                kept_words.append('UNK')
            else:
                data[i, j, k] = word_id
                kept_words.append(word)
        doc_lst.append(kept_words)

In [59]:
word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))

# One-hot encode into a NEW name. Rebinding `labels` to its own encoding made
# this cell non-idempotent: running it a second time would one-hot encode the
# already one-hot matrix and silently produce a wrong y_train.
labels_onehot = to_categorical(np.asarray(labels))
x_train = data
# Drop column 0 of the one-hot matrix.
# NOTE(review): presumably ratings start at 1, so column 0 is always empty - confirm.
y_train = labels_onehot[:, 1:]

print('Shape of data tensor:', x_train.shape)
print('Shape of label tensor:', y_train.shape)

Total 951656 unique tokens.
Shape of data tensor: (3000000, 30, 100)
Shape of label tensor: (3000000, 5)


In [8]:
# Number of target classes = width of the one-hot label matrix
n_classes = y_train.shape[1]

# Column sums of the one-hot labels give the number of reviews per class
class_counts = y_train.sum(axis=0)
print('Number of reviews by class in training set')
print(class_counts)

Number of reviews by class in training set
[ 600000.  600000.  600000.  600000.  600000.]


In [43]:
# Persist the encoded corpus so later sessions can skip the preprocessing above.
np.save('/data/tmp/x_train', x_train)
np.save('/data/tmp/y_train', y_train)
# `reviews` is a list of variable-length sentence lists. Recent NumPy releases
# refuse to build such a ragged array implicitly, so request dtype=object
# explicitly; older releases produce the same object array either way.
# NOTE(review): reading it back presumably needs np.load(..., allow_pickle=True) - confirm.
np.save('/data/tmp/reviews', np.asarray(reviews, dtype=object))

In [62]:
# train word2vec on the sentences to initialize the word embedding 
import gensim, logging

In [63]:
# Emit timestamped INFO-level log lines while word2vec trains
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# use skip-gram (sg=1); trained on the 'UNK'-substituted sentences in doc_lst
word2vec_model = gensim.models.Word2Vec(
    doc_lst,
    size=EMBEDDING_DIM,
    min_count=6,
    max_vocab_size=MAX_VOCAB_SIZE,
    sg=1,
    workers=os.cpu_count(),
)

2017-08-20 12:50:59,261 : INFO : collecting all words and their counts
2017-08-20 12:50:59,263 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-08-20 12:50:59,309 : INFO : PROGRESS: at sentence #10000, processed 164375 words, keeping 9743 word types
2017-08-20 12:50:59,346 : INFO : PROGRESS: at sentence #20000, processed 329372 words, keeping 12860 word types
2017-08-20 12:50:59,383 : INFO : PROGRESS: at sentence #30000, processed 493998 words, keeping 14628 word types
2017-08-20 12:50:59,424 : INFO : PROGRESS: at sentence #40000, processed 658315 words, keeping 15832 word types
2017-08-20 12:50:59,461 : INFO : PROGRESS: at sentence #50000, processed 821159 words, keeping 16584 word types
2017-08-20 12:50:59,500 : INFO : PROGRESS: at sentence #60000, processed 983885 words, keeping 17225 word types
2017-08-20 12:50:59,538 : INFO : PROGRESS: at sentence #70000, processed 1150118 words, keeping 17756 word types
2017-08-20 12:50:59,576 : INFO : PROGRESS: at 

In [78]:
# Map every word in the word2vec vocabulary to its float32 vector
embeddings_index = {
    token: np.asarray(word2vec_model.wv[token], dtype='float32')
    for token in word2vec_model.wv.vocab
}

print('Total %s word vectors.' % len(embeddings_index))

Total 20000 word vectors.


In [86]:
# Initial embedding: row i holds the word2vec vector of the word whose tokenizer
# index is i. Rows without a vector (including row 0) stay all-zero.
embedding_matrix = np.zeros((MAX_NB_WORDS + 1, EMBEDDING_DIM))

for word, i in word_index.items():
    # Words with index >= MAX_NB_WORDS were encoded as 'UNK' in data, so they
    # have no row of their own.
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# index MAX_NB_WORDS in data corresponds to 'UNK'.
# Set this row explicitly: previously it was only filled as a side effect of
# some word happening to have index == MAX_NB_WORDS, and the lookup raised
# KeyError when word2vec had dropped 'UNK'. If 'UNK' has no vector the row is
# left at zero.
unk_vector = embeddings_index.get('UNK')
if unk_vector is not None:
    embedding_matrix[MAX_NB_WORDS] = unk_vector

In [98]:
# Persist the initial embedding matrix alongside the encoded training arrays
np.save('/data/tmp/embedding_matrix', embedding_matrix)

In [5]:
# building Hierarchical Attention network

# L2 penalty strength; the l2_reg object is shared by the layers defined below
REG_PARAM = 1e-10
l2_reg = regularizers.l2(REG_PARAM)

# Word-embedding layer initialised from embedding_matrix and left trainable.
# mask_zero=True treats id 0 (the fill value of unused cells in data) as padding.
embedding_layer = Embedding(MAX_NB_WORDS + 1,
                            EMBEDDING_DIM,
                            input_length=MAX_SENT_LENGTH,
                            trainable=True,
                            mask_zero=True,
                            embeddings_regularizer=l2_reg,
                            weights=[embedding_matrix])

In [6]:
CONTEXT_DIM = 100


class AttLayer(Layer):
    """Attention pooling over axis 1 of a 3-D input.

    Each timestep vector is projected through tanh(x.W + b) and scored against
    a learned context vector u. The scores are turned into weights that sum to
    1 over the unmasked timesteps, and the layer returns the weighted sum of
    the inputs, so a (batch, steps, features) input becomes (batch, features).

    Arguments:
        regularizer: optional regularizer (instance, config dict or None)
            applied to W, b and u.
        context_dim: size of the hidden projection / context vector. Defaults
            to the module-level CONTEXT_DIM when None.
    """

    def __init__(self, regularizer=None, context_dim=None, **kwargs):
        super(AttLayer, self).__init__(**kwargs)
        # Set after the base constructor so the base constructor cannot reset it.
        self.supports_masking = True
        # regularizers.get accepts an instance, a serialized dict or None, which
        # lets the layer be rebuilt from get_config().
        self.regularizer = regularizers.get(regularizer)
        self.context_dim = CONTEXT_DIM if context_dim is None else context_dim

    def build(self, input_shape):
        assert len(input_shape) == 3
        self.W = self.add_weight(name='W', shape=(input_shape[-1], self.context_dim), initializer='normal',
                                 trainable=True, regularizer=self.regularizer)
        self.b = self.add_weight(name='b', shape=(self.context_dim,), initializer='normal', trainable=True,
                                 regularizer=self.regularizer)
        self.u = self.add_weight(name='u', shape=(self.context_dim,), initializer='normal', trainable=True,
                                 regularizer=self.regularizer)
        super(AttLayer, self).build(input_shape)  # be sure you call this somewhere!

    def call(self, x, mask=None):
        # Unnormalised attention score per timestep: (batch, steps)
        eij = K.dot(K.tanh(K.dot(x, self.W) + self.b), self.u)
        ai = K.exp(eij)
        if mask is not None:
            # Zero the masked timesteps BEFORE normalising so the remaining
            # weights sum to 1; masking after the division left them summing
            # to less than 1 for padded sequences.
            ai = ai * K.cast(mask, K.floatx())
        # epsilon keeps a fully masked sequence at 0 instead of 0/0 = NaN
        alphas = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
        weighted_input = x * K.expand_dims(alphas, axis=-1)
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        # Include the constructor arguments so a saved model can rebuild the layer.
        config = {'regularizer': regularizers.serialize(self.regularizer),
                  'context_dim': self.context_dim}
        base_config = super(AttLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def compute_mask(self, inputs, mask=None):
        # The time axis is summed away in call(), so no mask is passed on.
        return None

In [9]:
GPU_IMPL = 2          # for more efficient RNN implementation on GPU
GRU_UNITS = 50        # dimensionality of GRU output


def _bidirectional_gru():
    """Return a new bidirectional GRU that emits one output per timestep."""
    return Bidirectional(
        GRU(GRU_UNITS,
            return_sequences=True,
            kernel_regularizer=l2_reg,
            implementation=GPU_IMPL))


# Sentence encoder: word ids -> embeddings -> Bi-GRU -> attention pooling
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_lstm = _bidirectional_gru()(embedded_sequences)
l_att = AttLayer(regularizer=l2_reg)(l_lstm)
sentEncoder = Model(sentence_input, l_att)

# Document model: the sentence encoder is applied to each of the MAX_SENTS
# sentences, followed by a second Bi-GRU + attention and a softmax classifier
review_input = Input(shape=(MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)
l_lstm_sent = _bidirectional_gru()(review_encoder)
l_att_sent = AttLayer(regularizer=l2_reg)(l_lstm_sent)
preds = Dense(n_classes, activation='softmax', kernel_regularizer=l2_reg)(l_att_sent)
model = Model(review_input, preds)

In [10]:
# Plain SGD with momentum; accuracy is tracked next to the cross-entropy loss
sgd = optimizers.SGD(lr=0.01, momentum=0.9)
model.compile(optimizer=sgd,
              loss='categorical_crossentropy',
              metrics=['acc'])

In [11]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the document-level model
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 30, 100)           0         
_________________________________________________________________
time_distributed_2 (TimeDist (None, 30, 100)           4085700   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 30, 100)           45300     
_________________________________________________________________
att_layer_4 (AttLayer)       (None, 100)               10200     
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 505       
Total params: 4,141,705
Trainable params: 4,141,705
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Callbacks passed to model.fit: `history` is written to a text file after
# training, and csv_logger appends to a log file whose name includes REG_PARAM.
history = History()
csv_logger = CSVLogger('./hatt_model_' + str(REG_PARAM) + '_log',
                       separator=',',
                       append=True)

In [13]:
# Order training data by the number of sentences in document (as suggested in the [Yang et al.] paper).
# The lengths are taken from the untruncated sentence lists, so they can exceed MAX_SENTS.
doc_lengths = list(map(len, reviews))
# Row indices that sort the training set from shortest to longest document
ind = np.argsort(doc_lengths)

In [None]:
# Wall-clock timing of the whole training run; t2 is only assigned if fit() returns
t1 = time.time()

print("model fitting - Hierachical attention network")
# Rows are fed in the order given by `ind`; shuffle=False keeps that order within each epoch.
# NOTE(review): indexing with the integer array `ind` presumably materialises full copies of
# x_train and y_train - confirm there is enough memory headroom.
model.fit(x_train[ind,:,:], y_train[ind,:], epochs=10, batch_size=64, shuffle=False, 
          callbacks=[history, csv_logger], verbose=2)

t2 = time.time()

model fitting - Hierachical attention network
Epoch 1/10


In [22]:
# save model to an HDF5 file whose name includes REG_PARAM
# NOTE(review): reloading presumably needs custom_objects={'AttLayer': AttLayer} - confirm
model.save('./hatt_model_{}.h5'.format(REG_PARAM))

In [23]:
# Record the regularisation strength together with the training time in hours
elapsed_hours = (t2 - t1) / 3600
np.savetxt('./hatt_model_{}_time.txt'.format(REG_PARAM), [REG_PARAM, elapsed_hours])

# Dump the per-epoch metrics collected by the History callback as plain text
with open('./hatt_model_{}_history.txt'.format(REG_PARAM), "w") as res_file:
    res_file.write(str(history.history))