In [3]:
#IMPORT LIBRARIES, CAN PROBABLY BE MORE CLEAN

import re
from numpy import array
from numpy import asarray
from numpy import zeros
import tensorflow as tf
import keras
import keras.backend as K
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Dense, merge, Dropout, Input
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.regularizers import L2
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /home/yss/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
#HELPER FUNCTION FOR READING DATA

def extract(s):
    """Strip parentheses and collapse whitespace runs in a sentence string.

    Every '(' and ')' is deleted, each run of two or more whitespace
    characters becomes a single space, and the result is returned with
    leading and trailing whitespace removed.
    """
    text = s
    # Substitutions are applied in this order: parentheses first, so the gaps
    # they leave behind are collapsed by the final whitespace rule.
    for pattern, replacement in (('\\(', ''), ('\\)', ''), ('\\s{2,}', ' ')):
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Integer class id for each gold-label string. Rows whose first column is not
# one of these keys are dropped by the `row[0] in labels` filters applied when
# the train and test files are read.
labels = {'entailment': 0, 'contradiction': 1, 'neutral': 2}

In [5]:
#READING TRAIN DATA

fileName = 'snli_1.0_train.txt'

# The first line of the file is a header and is skipped. The `with` block
# closes the file on exit, so no explicit close() is needed afterwards.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale.
with open(fileName, 'r', encoding='utf-8') as f:
    next(f, None)
    trainRows = [row.split('\t') for row in f]

# Keep only rows whose first column is a known label; filter once instead of
# repeating the same test for each derived list.
trainLabelled = [row for row in trainRows if row[0] in labels]

# Columns 1 and 2 are cleaned with extract() and used as premise / hypothesis.
trainPremises = [extract(row[1]) for row in trainLabelled]
trainHypotheses = [extract(row[2]) for row in trainLabelled]
trainLabels = [labels[row[0]] for row in trainLabelled]

trainData = [trainPremises, trainHypotheses, trainLabels]

In [6]:
#READING TEST DATA

fileName = 'snli_1.0_test.txt'

# The first line of the file is a header and is skipped. The `with` block
# closes the file on exit, so no explicit close() is needed afterwards.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale.
with open(fileName, 'r', encoding='utf-8') as f:
    next(f, None)
    testRows = [row.split('\t') for row in f]

# Keep only rows whose first column is a known label; filter once instead of
# repeating the same test for each derived list.
testLabelled = [row for row in testRows if row[0] in labels]

# Columns 1 and 2 are cleaned with extract() and used as premise / hypothesis.
testPremises = [extract(row[1]) for row in testLabelled]
testHypotheses = [extract(row[2]) for row in testLabelled]
testLabels = [labels[row[0]] for row in testLabelled]

testData = [testPremises, testHypotheses, testLabels]

In [7]:
# HYPERPARAMETERS

# Number of token ids per sentence after padding/truncation; passed to
# pad_sequences and used as the shape of both model inputs.
maxLen = 50
# Number of passes over the training data given to model.fit.
epochs = 10
# Mini-batch size given to model.fit.
batchSize = 256
# Width of each word vector: the Embedding layer's output size. Must match the
# dimensionality of the GloVe file that is loaded (glove.6B.100d).
gloveDimension = 100
# Factor handed to the L2 kernel regularizer of the hidden Dense layers.
regularization = 4e-6

In [8]:
# TOKENIZER TO CREATE VOCABULARY

# Fit on the raw sentence lists rather than on trainData[0] / trainData[1]:
# those slots are later overwritten with padded integer arrays, so reading
# them here would break this cell whenever it is re-run after that point.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(trainPremises + trainHypotheses)

# Tokenizer word indices start at 1 (0 is reserved), hence the +1 so that the
# largest index is a valid row of the embedding matrix.
vocabSize = len(tokenizer.word_index) + 1

In [9]:
#CONVERT TRAIN DATA TO SEQUENCES AS PER VOCABULARY
#PAD OR TRIM ALL SENTENCES TO SAME LENGTH
#CONVERT LABELS TO ONE HOT ENCODING

# Build the model inputs from the raw lists (trainPremises, trainHypotheses,
# trainLabels) so this cell produces the same result every time it is run.
# Reading the text back out of trainData would, on a second run, try to
# tokenize arrays that are already padded integer sequences.
trainData[0] = pad_sequences(tokenizer.texts_to_sequences(trainPremises), maxLen, padding='post')
trainData[1] = pad_sequences(tokenizer.texts_to_sequences(trainHypotheses), maxLen, padding='post')

# One-hot encode the class ids; the class count follows the labels mapping.
trainData[2] = tf.keras.utils.to_categorical(trainLabels, num_classes=len(labels))

In [10]:
#CONVERT TEST DATA TO SEQUENCES AS PER VOCABULARY
#PAD OR TRIM ALL SENTENCES TO SAME LENGTH
#CONVERT LABELS TO ONE HOT ENCODING

# Build the model inputs from the raw lists (testPremises, testHypotheses,
# testLabels) so this cell produces the same result every time it is run.
# Reading the text back out of testData would, on a second run, try to
# tokenize arrays that are already padded integer sequences.
testData[0] = pad_sequences(tokenizer.texts_to_sequences(testPremises), maxLen, padding='post')
testData[1] = pad_sequences(tokenizer.texts_to_sequences(testHypotheses), maxLen, padding='post')

# One-hot encode the class ids; the class count follows the labels mapping.
testData[2] = tf.keras.utils.to_categorical(testLabels, num_classes=len(labels))

In [11]:
#LOAD GLOVE FILE AND STORE ALL EMBEDDINGS
#NEEDS TO BE RUN ONLY ONCE
#MAKE SURE TO ADD THE CORRECT FILE NAME FOR GLOVE

# Location of the GloVe vectors; adjust to wherever the file lives locally.
gloveFile = "/home/yss/Documents/sem6/nlp/glove.6B.100d.txt"

embeddingsDict = dict()

# The context manager closes the file even if a line fails to parse, and the
# explicit encoding keeps the read independent of the platform's locale.
with open(gloveFile, encoding='utf-8') as glove:
    for line in glove:
        records = line.split()
        if not records:
            # Skip blank lines instead of failing on records[0].
            continue
        # First field is the word, the remaining fields are its vector.
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddingsDict[word] = vector_dimensions

In [12]:
#FROM THE EMBEDDINGS, STORE ALL THE ONES THAT ARE IN OUR VOCABULARY

# One row per vocabulary index, sized by the gloveDimension hyperparameter
# rather than a literal so the matrix always agrees with the Embedding layer.
embeddingsMat = zeros((vocabSize, gloveDimension))
for word, index in tokenizer.word_index.items():
    vec = embeddingsDict.get(word)
    # Words without a pre-trained vector keep their all-zero row.
    if vec is not None:
        embeddingsMat[index] = vec

In [14]:
# The model is not sequential: it takes two separate inputs, the premise and
# the hypothesis, each a sequence of maxLen word indices. Both are passed
# through one shared embedding layer that is initialised from embeddingsMat
# and kept frozen (trainable=False).

premise = Input(shape=(maxLen,), dtype='int32')
hypothesis = Input(shape=(maxLen,), dtype='int32')

embed = Embedding(
    input_dim=vocabSize,
    output_dim=gloveDimension,
    weights=[embeddingsMat],
    input_length=maxLen,
    trainable=False,
)

premInput = embed(premise)
hypoInput = embed(hypothesis)

In [15]:
# First technique from the paper: sum-of-words sentence encoding.
# Each embedded sentence is a maxLen x gloveDimension matrix; summing along
# the maxLen axis (axis=1, after the batch axis) yields one vector of length
# gloveDimension per sentence. Despite its name, `rnn` contains no recurrence.
#
# output_shape is given as a tuple (without the batch dimension), which is the
# form the Lambda layer documents, and the layer is taken from the public
# keras.layers namespace rather than the internal keras.layers.core module.
rnn = keras.layers.Lambda(lambda x: K.sum(x, axis=1), output_shape=(gloveDimension,))

In [16]:
# Collapse each embedded sentence to a single vector with the shared summing
# layer, then pass each result through its own BatchNormalization layer.
# Batch normalization was taken from a reference implementation; it was not
# seen in the paper.

premSummed, hypoSummed = rnn(premInput), rnn(hypoInput)

premInput = tf.keras.layers.BatchNormalization()(premSummed)
hypoInput = tf.keras.layers.BatchNormalization()(hypoSummed)

In [17]:
# `joint` starts as the concatenation of the summed premise and hypothesis
# vectors. It then goes through three tanh Dense layers with L2 kernel
# regularization, as in the paper; each is followed by dropout and batch
# normalization (dropout is not mentioned in the paper). The final decision
# is a 3-way softmax.

hiddenLayers = 3
hiddenUnits = 200
dropoutRate = 0.2

joint = Dropout(dropoutRate)(keras.layers.concatenate([premInput, hypoInput]))

for layerIndex in range(hiddenLayers):
    joint = Dense(
        hiddenUnits,
        activation='tanh',
        kernel_regularizer=L2(regularization),
    )(joint)
    joint = Dropout(dropoutRate)(joint)
    joint = tf.keras.layers.BatchNormalization()(joint)

pred = Dense(3, activation='softmax')(joint)

In [18]:
#DEFINING MODEL INPUT AND OUTPUT AND COMPILATION

# Two-input model: (premise, hypothesis) -> softmax over the three classes.
model = Model(
    inputs=[premise, hypothesis],
    outputs=pred,
)

# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [19]:
#FITTING THE MODEL TO TRAIN DATA

# Premise and hypothesis sequences are the two model inputs; the one-hot
# labels are the targets.
trainInputs = [array(trainData[0]), array(trainData[1])]
trainTargets = array(trainData[2])

model.fit(trainInputs, trainTargets, batch_size=batchSize, epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f2915af3e50>

In [20]:
# Evaluate on the held-out test split. The batch size comes from the
# batchSize hyperparameter instead of a repeated literal, so training and
# evaluation stay in step when the hyperparameter is changed.
testInputs = [array(testData[0]), array(testData[1])]
testTargets = array(testData[2])

loss, acc = model.evaluate(testInputs, testTargets, batch_size=batchSize)
print('Loss = ', loss)
print('Acc = ', acc)

Loss =  0.7859595417976379
Acc =  0.6607288122177124
