### [SICK Training data](http://www.site.uottawa.ca/~diana/csi5386/A2_2019/SICK_train.txt)

##### max length

In [2]:
import numpy as np
import csv
file_name="data/training.txt"
with open(file_name,"r") as data:
    train = csv.DictReader(data , delimiter='\t')
    max_evi, max_hyp = 0, 0 
    count = 1
    for row in train:
        hyp = len(row["sentence_A"].split())
        if hyp > max_hyp:
            max_hyp = hyp
        evi = len(row["sentence_B"].split())
        if evi > max_evi:
            max_evi = evi
    print("Max_hyp %s" % str(max_hyp))        
    print("Max_evi %s" % str(max_evi))            

Max_hyp 28
Max_evi 32


##### Embeddings

In [18]:
glove_zip_file = "data/glove.6B.zip"
glove_vectors_file = "data/glove.6B.300d.txt"

In [15]:
import zipfile, urllib.request, shutil, os
    
#large file - 862 MB
if (not os.path.isfile(glove_zip_file) and
    not os.path.isfile(glove_vectors_file)):
    with urllib.request.urlopen("http://nlp.stanford.edu/data/glove.6B.zip") as response, open(glove_zip_file, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)

In [19]:
import zipfile
def unzip_single_file(zip_file_name, output_file_name):
    """
        If the outFile is already created, don't recreate
        If the outFile does not exist, create it from the zipFile
    """
    if not os.path.isfile(output_file_name):
        with open(output_file_name, 'wb') as out_file:
            with zipfile.ZipFile(zip_file_name) as zipped:
                for info in zipped.infolist():
                    if output_file_name in info.filename:
                        with zipped.open(info) as requested_file:
                            out_file.write(requested_file.read())
                            return

unzip_single_file(glove_zip_file, glove_vectors_file)

In [20]:
glove_wordmap = {}
with open(glove_vectors_file, "r", encoding="utf8") as glove:
    for line in glove:
        name, vector = tuple(line.split(" ", 1))
        glove_wordmap[name] = np.fromstring(vector, sep=" ")

### Imports

In [1]:
import numpy as np

np.random.seed(1337)  # for reproducibility
import os
from keras.regularizers import l2
from keras.callbacks import *
# from visualizer import *
from keras.models import *
from keras.optimizers import *
from keras import utils
from keras.utils.np_utils import *
from keras.layers.core import *
from keras.layers import Input, Embedding, LSTM, Dense, merge, TimeDistributed
from keras.preprocessing.sequence import pad_sequences
# from keras.utils.visualize_util import plot  # THIS IS BAD
# from data_reader import *
import logging
from datetime import datetime

Using TensorFlow backend.


### Callback

In [2]:
class AccCallBack(Callback):
    def __init__(self, xtrain, ytrain, xdev, ydev, xtest, ytest, vocab, opts):
        self.xtrain = xtrain
        self.ytrain = ytrain
        self.xdev = xdev
        self.ydev = ydev
        self.xtest = xtest
        self.ytest = ytest
        self.vocab = vocab
        self.opts = opts

    def on_epoch_end(self, epoch, logs={}):
        train_acc = compute_acc(self.xtrain, self.ytrain, self.vocab, self.model, self.opts)
        dev_acc = compute_acc(self.xdev, self.ydev, self.vocab, self.model, self.opts)
        test_acc = compute_acc(self.xtest, self.ytest, self.vocab, self.model, self.opts)
        logging.info('----------------------------------')
        logging.info('Epoch ' + str(epoch) + ' train loss:' + str(logs.get('loss')) + ' - Validation loss: ' + str(
            logs.get('val_loss')) + ' train acc: ' + str(train_acc[0]) + '/' + str(train_acc[1]) + ' dev acc: ' + str(
            dev_acc[0]) + '/' + str(dev_acc[1]) + ' test acc: ' + str(test_acc[0]) + '/' + str(test_acc[1]))
        logging.info('----------------------------------')

##### acc


In [34]:
def compute_acc(X, Y, vocab, model, opts):
    scores = model.predict(X, batch_size=opts['batch_size'])
    prediction = np.zeros(scores.shape)
    for i in range(scores.shape[0]):
        l = np.argmax(scores[i])
        prediction[i][l] = 1.0
    assert np.array_equal(np.ones(prediction.shape[0]), np.sum(prediction, axis=1))
    plabels = np.argmax(prediction, axis=1)
    tlabels = np.argmax(Y, axis=1)
    acc = sum([1 if x==y else 0 for x,y in list(zip(tlabels, plabels))])/len(tlabels)
    return acc, acc

### Model

In [3]:
def get_H_n(X):
    ans = X[:, -1, :]  # get last element from time dim
    return ans


def get_Y(X, xmaxlen):
    return X[:, :xmaxlen, :]  # get first xmaxlen elem from time dim


def get_R(X):
    Y, alpha = X[0], X[1]
    ans = K.T.batched_dot(Y, alpha)
    return ans


In [14]:
def build_model(opts):
    k = 2 * opts['lstm_units']  # 200
    L = opts['xmaxlen']  # 35
    N = opts['xmaxlen'] + opts['ymaxlen']
    
    main_input = Input(shape=(N,), dtype='int32', name='main_input') #(N,70)
    x = Embedding(output_dim=opts['emb'], input_dim=len(VOCABULARY.keys())+1, input_length=N, name='x')(main_input)
    drop_out = Dropout(0.1, name='dropout')(x) # 70,50
    lstm_fwd = LSTM(opts['lstm_units'], return_sequences=True, name='lstm_fwd')(drop_out)
    lstm_bwd = LSTM(opts['lstm_units'], return_sequences=True, go_backwards=True, name='lstm_bwd')(drop_out)
    #70,100
    bilstm = merge([lstm_fwd, lstm_bwd], name='bilstm', mode='concat')
    #70,200
    drop_out = Dropout(0.1, name="d_bilstm")(bilstm)
    h_n = Lambda(get_H_n, output_shape=(k,), name="h_n")(drop_out)
    #200
    
    
    Y = Lambda(get_Y, arguments={"xmaxlen": L}, name="Y", output_shape=(L, k))(drop_out)
    #35,200
    Whn = Dense(k, W_regularizer=l2(0.01), name="Wh_n")(h_n) #200
    Whn_x_e = RepeatVector(L, name="Wh_n_x_e")(Whn)#35,200
    

    
    WY = TimeDistributed(Dense(k, W_regularizer=l2(0.01)), name="WY")(Y)#35,200
    merged = merge([Whn_x_e, WY], name="merged", mode='sum')
    M = Activation('tanh', name="M")(merged)
    #35,200

    alpha_ = TimeDistributed(Dense(k, activation='linear'), name="alpha_")(M)

    flat_alpha = Flatten(name="flat_alpha")(alpha_)
    alpha = Dense(L, activation='softmax', name="alpha")(flat_alpha) #35,200 Dense_33
    
    alpha = RepeatVector(k, name="alpha_rep")(alpha)
    
    
    Y_trans = Permute((2, 1), name="y_trans")(Y)  # of shape (None,200,35)
    
    r = merge([Y_trans, alpha], output_shape=(k, 1), name="r_")#, mode=get_R)
    #200,35

#     r = Reshape((k,), name="r")(r_)
    Wr = Dense(L, W_regularizer=l2(0.01), name="Dense_Wr")(r) #200,35
    Wh = Dense(k, W_regularizer=l2(0.01), name="Dense_Wh")(Whn_x_e)
    Wh = Permute((2, 1), name="Wh_trans")(Wh)
    
    merged = merge([Wr, Wh], mode='sum')
    h_star = Activation('tanh')(merged)
    
    flat_h_star = Flatten(name="flat_h_star")(h_star)
    out = Dense(3, activation='softmax')(flat_h_star)
    
    
    output = out
    model = Model(input=[main_input], output=output)
    model.summary()
    # plot(model, 'model.png')
    # # model.compile(loss={'output':'binary_crossentropy'}, optimizer=Adam())
    # model.compile(loss={'output':'categorical_crossentropy'}, optimizer=Adam(options.lr))
    model.compile(loss='categorical_crossentropy',optimizer=Adam(opts['lr']))
    return model

# build_model(opts)

### Model - Load & save

In [6]:
def save_model(model, wtpath, archpath, mode='yaml'):
    if mode == 'yaml':
        yaml_string = model.to_yaml()
        open(archpath, 'w').write(yaml_string)
    else:
        with open(archpath, 'w') as f:
            f.write(model.to_json())
    model.save_weights(wtpath)


def load_model(wtpath, archpath, mode='yaml'):
    if mode == 'yaml':
        model = model_from_yaml(open(archpath).read())  # ,custom_objects={"MyEmbedding": MyEmbedding})
    else:
        with open(archpath) as f:
            model = model_from_json(f.read())  # , custom_objects={"MyEmbedding": MyEmbedding})
    model.load_weights(wtpath)
    return model

### Load data

In [7]:
# one hot encoding
def score_setup(row):
    convert_dict = {
      'ENTAILMENT': 0,
      'NEUTRAL': 1,
      'CONTRADICTION': 2
    }
    score = np.zeros((3,))
    tag = row["entailment_judgment"]
    score[convert_dict[tag]] += 1
    return score
#     return convert_dict[row["entailment_judgment"]]

VOCABULARY = {'unk':0}

from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()


def split_data_into_scores(max_hyp, max_evi, file_name="data/training.txt"):

    global VOCABULARY, tokenizer
    import csv
    with open(file_name,"r") as data:
        train = csv.DictReader(data , delimiter='\t')
        evi_sentences = np.empty((0,max_evi))
        hyp_sentences = np.empty((0,max_hyp))
        labels = []
        scores = []
        count = 1
        for row in train:
            hyp = row["sentence_A"].lower()
            evi = row["sentence_B"].lower()
            tokenizer.fit_on_texts([hyp])
            tokenizer.fit_on_texts([evi])
            hyp_seq = np.array(tokenizer.texts_to_sequences([hyp])[0])
            
            padded_hyp = np.pad(hyp_seq,
                                (max_hyp-np.shape(hyp_seq)[0],0),
                                       'constant',
                                       constant_values=(0,))
            hyp_sentences = np.append(hyp_sentences, [padded_hyp], axis=0)
            count += 1
            
            evi_seq = np.array(tokenizer.texts_to_sequences([evi])[0])
            padded_evi = np.pad(evi_seq,
                               (max_evi-np.shape(evi_seq)[0],0),
                                'constant',
                                constant_values=(0,))
            evi_sentences = np.append(evi_sentences, [padded_evi], axis=0)
            labels.append(row["entailment_judgment"])
            scores.append(score_setup(row))
        print("Vocabulary size: %s" % str(len(tokenizer.word_counts.keys())+1))
        VOCABULARY = tokenizer.word_index
        VOCABULARY['unk'] = 0
        print(np.shape(hyp_sentences))
        print(np.shape(evi_sentences))
        return hyp_sentences, evi_sentences, np.array(scores)

#### tokenizer scratch

In [8]:
# from keras.preprocessing.text import Tokenizer
# t = Tokenizer()
# fit_text = ["The earth is an awesome place live"]
# t.fit_on_texts(fit_text)
# print(t.word_index)
# fit_text = ["Ana has apples"]
# t.fit_on_texts(fit_text)
# print(t.texts_to_sequences(["Ana is an awesome apple"]))
# # 
# print(len(t.word_index.keys()))
# # from keras.utils.np_utils import to_categorical
# # print(to_categorical([1], num_classes=3))

In [9]:
# # from keras.preprocessing.sequence import TimeseriesGenerator, pad_sequences
# import numpy as np
# a=np.empty((0,))
# for i in range(0,10):
#     a = np.append(a, [1,2,3], axis=0)
# print(a)

# print(np.pad(a, (0,0), 'constant', constant_values=(0,)))

### Main

#### Model + options

In [41]:
opts = {
    'lstm_units': 150,
    'xmaxlen': 35,
    'ymaxlen': 35,
    'emb': 100, #dimension of the embedding
    'max_features': len(VOCABULARY.keys())+1, #vocabulary dim+1
    'batch_size': 128,
    'lr': 0.005,
    'epochs': 100
}

#### Model

In [15]:
model = build_model(opts)

  if sys.path[0] == '':


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
main_input (InputLayer)          (None, 70)            0                                            
____________________________________________________________________________________________________
x (Embedding)                    (None, 70, 50)        116350      main_input[0][0]                 
____________________________________________________________________________________________________
dropout (Dropout)                (None, 70, 50)        0           x[0][0]                          
____________________________________________________________________________________________________
lstm_fwd (LSTM)                  (None, 70, 100)       60400       dropout[0][0]                    
___________________________________________________________________________________________



#### Data manipulation

In [12]:
X_train,Y_train,Z_train=split_data_into_scores(opts['xmaxlen'], opts['ymaxlen'], "data/training.txt")
print(np.shape(X_train))
print(X_train[:1])

print("out")
print(X_train[0])
xy_train = np.concatenate((X_train, Y_train), axis=1)
print("***")
print(np.shape(xy_train))
print("***")

X_dev,Y_dev,Z_dev=split_data_into_scores(opts['xmaxlen'], opts['ymaxlen'], "data/dev.txt")
xy_dev = np.concatenate((X_dev, Y_dev), axis=1)
print("***")
print(np.shape(xy_dev))
print("***")

X_test,Y_test,Z_test=split_data_into_scores(opts['xmaxlen'], opts['ymaxlen'], "data/test_labeled.txt")
xy_test = np.concatenate((X_test, Y_test), axis=1)

train_dict = {'input': xy_train, 'output': Z_train}
dev_dict = {'input': xy_dev, 'output': Z_dev}

Vocabulary size: 2185
(4500, 35)
(4500, 35)
(4500, 35)
[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.
   4.  5. 13.  2.  6.  3.  1.  7.  8. 14. 15.  9.  2. 10.  3. 11. 12.]]
out
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.
  4.  5. 13.  2.  6.  3.  1.  7.  8. 14. 15.  9.  2. 10.  3. 11. 12.]
***
(4500, 70)
***
Vocabulary size: 2217
(500, 35)
(500, 35)
***
(500, 70)
***
Vocabulary size: 2326
(4927, 35)
(4927, 35)


#### Train

In [16]:
# history = model.fit(xy_train[:2],Z_train[:2])#,
#                             batch_size=opts['batch_size'])
#                             epochs=opts['epochs'])

Epoch 1/1


In [42]:
history = model.fit(xy_train,Z_train,
                            batch_size=opts['batch_size'],
                            epochs=opts['epochs'],
                            validation_data=(xy_dev, Z_dev),
                            callbacks=[
                                AccCallBack(xy_train, Z_train, xy_dev, Z_dev, xy_test, Z_test, VOCABULARY, opts),
                                EarlyStopping()
                            ]
                            )

train_acc = compute_acc(xy_train, Z_train, VOCABULARY, model, opts)
dev_acc = compute_acc(xy_dev, Z_dev, VOCABULARY, model, opts)
test_acc = compute_acc(xy_test, Z_test, VOCABULARY, model, opts)
print(train_acc)
print(dev_acc)
print(test_acc)

opts_name = "opts-1"
save_model(model, 'model_weights-%s-%s.weights' % (str(opts_name), str(test_acc[0])),
           'model_arch_att-%s-%s.yaml' % (str(opts_name), str(test_acc[0])))
with open(opts_name, "w") as f:
    f.write(str(opts))

Train on 4500 samples, validate on 500 samples
Epoch 1/1
(0.5635555555555556, 0.5635555555555556)
(0.564, 0.564)
(0.5668763953724376, 0.5668763953724376)


In [37]:
# load_model('model_weights.weights', 'model_arch_att.yaml')
train_acc = compute_acc(xy_train, Z_train, VOCABULARY, model, opts)
dev_acc = compute_acc(xy_dev, Z_dev, VOCABULARY, model, opts)
test_acc = compute_acc(xy_test, Z_test, VOCABULARY, model, opts)

print(train_acc)
print(dev_acc)
print(test_acc)

(0.5635555555555556, 0.5635555555555556)
(0.564, 0.564)
(0.5668763953724376, 0.5668763953724376)
