# Imports

In [None]:
import time
import os
import numpy as np
import google.colab as colab
import random
import json
%matplotlib inline
import matplotlib.pyplot as plt
from multiprocessing import Pool
import shutil
from pprint import pprint
import pickle
from random import randint
import pandas as pd

import inspect
import re

import torch
import torch.nn as nn
from torch import optim
from torch.autograd import Variable

import nltk
nltk.download('punkt')

import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2
%matplotlib inline

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Mount Google Drive

In [None]:
def mount_google_drive():
	'''
	# Functionality
		Mount Google Drive inside the Colab runtime so that files stored in Drive can be read and written directly.
	# Arguments
		Nothing
	# Returns
		drive_root: the mount point joined with the first non-hidden entry that os.listdir reports under it
	'''
	mount_directory = "/content/gdrive"
	colab.drive.mount(mount_directory, force_remount=True)
	# Entries whose name starts with '.' are skipped; the first remaining one is used as the root.
	visible_entries = [entry for entry in os.listdir(mount_directory) if entry[0] != '.']
	return mount_directory + "/" + visible_entries[0]

In [None]:
# Mount Drive once and derive every data / model / checkpoint location from its root.
# Adjust the relative paths below to match the layout of your own Drive.
ROOT_DIR = mount_google_drive()

# CSV with the question pairs (read with pd.read_csv further down)
DATASET_PATH = "{0}/toy-dataset/quora.csv".format(ROOT_DIR)

# Folder holding models.py, data.py and mutils.py
NLI_NET_DIR = "{0}/models/NliNetUtils/".format(ROOT_DIR)

# Folder used as config_nli_model['outputdir'] for saved checkpoints
CHECKPOINT_DIR = "{0}/checkpoints/e2e_SNLI/".format(ROOT_DIR)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [None]:
# Migrate utils from Drive to the current directory so that the folder does not have to be
# uploaded from a local machine every session.
# Any existing local copy is removed first (errors, e.g. "no such directory", are ignored)
# because copytree is called with a destination that must not already exist.
shutil.rmtree('utils/', ignore_errors=True)
# The returned destination path is bound to `_` so the cell does not echo it.
_ = shutil.copytree(ROOT_DIR +"/utils/", "utils/")

In [None]:
# Load customizable utils here
from utils.file_utils import *
from utils.image_utils import *
from utils.generator_utils import *
from utils.tqdm_utils import *
from utils.keras_utils import *

Using TensorFlow backend.


In [None]:
# Load the InferSent model related files.
# The three helper modules are copied next to the notebook so they can be imported by name.
# shutil.copy overwrites an existing destination file, so no cleanup is needed beforehand
# (shutil.rmtree only removes directories; calling it on these files was a silent no-op).
for module_file in ("models.py", "data.py", "mutils.py"):
    shutil.copy(NLI_NET_DIR + module_file, module_file)

# copytree needs a destination that does not exist yet, so drop any stale copy first.
shutil.rmtree('fastText/', ignore_errors=True)
shutil.copytree(ROOT_DIR + "/toy-dataset/fastText/", "fastText/")


'fastText/'

In [None]:
from data import get_nli, get_batch, build_vocab
from mutils import get_optimizer
from models import NLINet

In [None]:
def get_optimizer(s):
    """
    Parse an optimizer specification string.

    Input should be of the form:
        - "sgd,lr=0.01"
        - "adagrad,lr=0.1,lr_decay=0.05"
        - "adam" (no parameters)

    Numeric values may be plain decimals or use scientific notation ("lr=1e-3").

    NOTE: this definition shadows the `get_optimizer` imported from `mutils`
    earlier in the notebook.

    Returns:
        (optim_fn, optim_params): the torch.optim optimizer class and a dict
        mapping parameter names to float values.

    Raises:
        AssertionError: on a malformed "name=value" item, a non-numeric value,
            or "sgd" without an explicit "lr".
        Exception: on an unknown optimizer name or a parameter the chosen
            optimizer's constructor does not accept.
    """
    method, separator, raw_params = s.partition(',')
    optim_params = {}
    if separator:
        for item in raw_params.split(','):
            pieces = item.split('=')
            assert len(pieces) == 2
            # raw string: "\d" inside a normal string literal is an invalid escape sequence
            assert re.match(r"^[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$", pieces[1]) is not None
            optim_params[pieces[0]] = float(pieces[1])

    known_methods = {
        'adadelta': optim.Adadelta,
        'adagrad': optim.Adagrad,
        'adam': optim.Adam,
        'adamax': optim.Adamax,
        'asgd': optim.ASGD,
        'rmsprop': optim.RMSprop,
        'rprop': optim.Rprop,
        'sgd': optim.SGD,
    }
    if method not in known_methods:
        raise Exception('Unknown optimization method: "%s"' % method)
    optim_fn = known_methods[method]
    if method == 'sgd':
        # plain SGD must be given an explicit learning rate
        assert 'lr' in optim_params

    # check that we give good parameters to the optimizer
    # (inspect.getargspec was removed in Python 3.11; getfullargspec also
    # reports keyword-only arguments, which are accepted here as well)
    spec = inspect.getfullargspec(optim_fn.__init__)
    assert spec.args[:2] == ['self', 'params']
    accepted_args = spec.args[2:] + list(spec.kwonlyargs)
    if not all(k in accepted_args for k in optim_params.keys()):
        raise Exception('Unexpected parameters: expected "%s", got "%s"' % (
            str(accepted_args), str(optim_params.keys())))

    return optim_fn, optim_params

# Look At Your Data First

In [None]:
def preprocess_df(quora_df, balance=True):
  """
  Extract question pairs and labels from the Quora DataFrame.

  quora_df: DataFrame with the columns 'question1', 'question2' and 'is_duplicate'.
            It is NOT modified; the string conversion is done on a copy.
  balance:  if True, return the same number of duplicate (label 1) and
            non-duplicate (label 0) pairs by downsampling the larger class
            with a fixed random_state of 517.

  return: (premise_sents, hypothesis_sents, labels) - three lists of equal length.
          With balance=True the duplicate pairs come first, followed by the
          non-duplicate pairs (shuffling is left to train_test_split).
  """
  # Work on a copy of the needed columns so the caller's frame keeps its dtypes.
  pairs = quora_df[['question1', 'question2', 'is_duplicate']].copy()
  pairs['question1'] = pairs['question1'].astype(str)
  pairs['question2'] = pairs['question2'].astype(str)

  if not balance:
    return (pairs['question1'].tolist(),
            pairs['question2'].tolist(),
            pairs['is_duplicate'].tolist())

  diff_df = pairs.loc[pairs['is_duplicate'] == 0]
  simi_df = pairs.loc[pairs['is_duplicate'] == 1]
  balance_size = min(diff_df.shape[0], simi_df.shape[0])

  diff_df = diff_df.sample(n=balance_size, random_state=517)
  # Only touch the duplicate class when it is the larger one, so the row order
  # is unchanged in the common case where non-duplicates are the majority.
  if simi_df.shape[0] > balance_size:
    simi_df = simi_df.sample(n=balance_size, random_state=517)

  # No need to shuffle here because train_test_split will take care of it
  premise_sents = simi_df['question1'].tolist() + diff_df['question1'].tolist()
  hypothesis_sents = simi_df['question2'].tolist() + diff_df['question2'].tolist()
  labels = simi_df['is_duplicate'].tolist() + diff_df['is_duplicate'].tolist()

  return premise_sents, hypothesis_sents, labels

In [None]:
# Read the question-pair CSV from Drive and preview its first ten rows.
quora_df = pd.read_csv(DATASET_PATH)
quora_df.head(10)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
6,6,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,0
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
9,9,19,20,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0


In [None]:
# Build the balanced pair lists and make sure the three lists stay aligned.
premise_sents, hypothesis_sents, labels = preprocess_df(quora_df, balance=True)
assert len(premise_sents) == len(hypothesis_sents) == len(labels)
print("Total number of sentences in the dataset is", len(labels))

Total number of sentences in the dataset is 298612


# Data Preprocessing

In [None]:
import re
# Raw strings: "\[", "\]" and "\|" are invalid escape sequences in a normal string literal.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')

def text_prepare(text):
    """
        Normalise a piece of text.

        text: a string (any other object is converted with str())

        return: the lowercased text with the characters matched by
                REPLACE_BY_SPACE_RE turned into spaces, every character other
                than 0-9, a-z, space, '#', '+' and '_' removed, and runs of
                spaces collapsed to one space (leading/trailing spaces are kept)
    """
    text = str(text).lower()                 # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(" ", text)  # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub("", text)        # delete symbols which are in BAD_SYMBOLS_RE from text
    text = re.sub(' +', ' ', text)             # collapse repeated spaces
    return text

In [None]:
from sklearn.model_selection import train_test_split

# Normalise both sides of every pair with text_prepare.
premise_prep = [text_prepare(sentence) for sentence in premise_sents]
hypothesis_prep = [text_prepare(sentence) for sentence in hypothesis_sents]

# One sample per label: a (premise, hypothesis) tuple at the same position as its label.
X = [(premise_prep[position], hypothesis_prep[position]) for position in range(len(labels))]

y = labels
# 80% train; the held-out 20% is then halved into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=46)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=46)

In [None]:
# Check whether the test set is balanced: count the label-0 pairs, the rest are label-1.
counter = sum(1 for label_value in y_test if label_value == 0)

print(counter)
print(len(y_test) - counter)

14944
14918


In [None]:
def get_data(X, y):
  """
  Split a list of sentence pairs into parallel columns.

  X: iterable of (left_sentence, right_sentence) 2-tuples
  y: sequence of labels aligned with X (0 = different, 1 = similar)

  return: a list [left, right, labels] where left and right are Python lists
          and labels is a NumPy array built from y
  """
  left = [first for first, _ in X]
  right = [second for _, second in X]
  return [left, right, np.array(y)]

In [None]:
# Turn each split into the [left sentences, right sentences, label array] layout returned by get_data.
training = get_data(X_train, y_train)
validation = get_data(X_val, y_val)
testing = get_data(X_test, y_test)

In [None]:
# Peek at the first five right-hand (hypothesis) sentences of the test split.
testing[1][:5]

['would banning notes of denominations 500 and 1000 help to curb the black money in system',
 'which country would be the best for setting up a natural cancer clinic bearing in mind the setup cost governmental regulations and the degree of red tape etc',
 'is the damage to prefrontal cortex reversible',
 'how much percentile should i aim for to get into an iim in cat 2016',
 'what are the best ways to build a vacuole model']

In [None]:
# Build the word lookup over every sentence of all three splits.
# build_vocab comes from data.py (not shown here); the second argument is the path of the
# vector file on Drive. The result is later used for `word in word_vec` membership tests
# and is passed to get_batch - presumably a word -> embedding mapping (TODO confirm).
word_vec = build_vocab(training[0] + training[1] +
                       validation[0] + validation[1] +
                       testing[0] + testing[1], ROOT_DIR + "/toy-dataset/glove.840B.300d.txt")

Found 58735(/79667) words with glove vectors
Vocab size : 58735


In [None]:
def preprocess(sentences):
  """
  Tokenise sentences for the encoder.

  Each sentence is split on whitespace, words that are not in the global
  word_vec are dropped, and the result is wrapped in '<s>' ... '</s>'.

  sentences: iterable of strings

  return: 1-D NumPy object array with one token list per sentence. The array
          is filled element by element so that the result is 1-D even when
          all sentences have the same length, and so that sentences of
          different lengths do not make np.array raise on recent NumPy
          versions.
  """
  token_lists = [['<s>'] +
                 [word for word in sent.split() if word in word_vec] +
                 ['</s>']
                 for sent in sentences]
  tokenized = np.empty(len(token_lists), dtype=object)
  for position, tokens in enumerate(token_lists):
    tokenized[position] = tokens
  return tokenized

# Tokenise the left (index 0) and right (index 1) sentence columns of every split in place.
# NOTE: this cell is not idempotent - after it has run, the columns hold token lists,
# and preprocess calls .split() on each element, so running it a second time fails.
for index in [0, 1]:
  training[index] = preprocess(training[index])
  validation[index] = preprocess(validation[index])
  testing[index] = preprocess(testing[index])

In [None]:
# Inspect the first tokenised left-hand sentence of the training split.
training[0][0]

['<s>',
 'harvard',
 'college',
 'courses',
 'what',
 'is',
 'general',
 'shopping',
 'advice',
 'for',
 'german',
 'classes',
 '</s>']

In [None]:
# Single configuration dict: passed whole to NLINet (models.py, not shown) and also read
# directly by the training / evaluation cells below. Notes on keys that only NLINet reads
# are assumptions to be confirmed against models.py.
config_nli_model = {
    'n_words'        :  len(word_vec)         , # Number of distinct words in the wordvec
    'word_emb_dim'   :  300                   , # Dimension of word embeddings (also passed to get_batch)
    'dpout_model'    :  0.                    , # Dropout
    'enc_lstm_dim'   :  2048                  , # presumably the encoder LSTM hidden size (printed model shows LSTM(300, 2048))
    'dpout_fc'       :  0.5                   , # presumably classifier dropout - TODO confirm in models.py
    'fc_dim'         :  512                   , # presumably classifier hidden size - TODO confirm in models.py
    'bsize'          :  64                    , # batch size used to slice the data in trainepoch / evaluate
    'n_classes'      :  2                     , # length of the loss weight vector; labels are 0 / 1
    'pool_type'      :  'max'                 ,
    'nonlinear_fc'   :  0                     ,
    'encoder_type'   :  'InferSent'           , # see list of encoders
    'use_cuda'       :  True                  , # NOTE(review): the notebook cells call .cuda() unconditionally and never read this key
    'optimizer'      :  "adam"         , # spec string parsed by get_optimizer, e.g. "sgd,lr=0.1"
    'decay'          :  0.99                  , # sgd only: lr is multiplied by this at the start of every epoch after the first
    'max_norm'       :  5.                    , # gradient-norm threshold above which the step's lr is scaled down
    'minlr'          :  1e-5                  , # sgd only: training stops once lr drops below this
    'outputdir'      :  CHECKPOINT_DIR        , # directory where evaluate saves the best state_dict
    'outputmodelname':  'e2dmodel.pickle'     , # checkpoint file name inside outputdir
    'lrshrink'       :  5                     , # sgd only: lr is divided by this when validation accuracy does not improve
    'n_epochs'       :  30                      # maximum number of training epochs
}


# Model Architecture


In [None]:
# Build the model described by config_nli_model and print its layers.
nli_net = NLINet(config_nli_model)
print(nli_net)


# loss
# Uniform class weights. The loss is SUMMED over the batch: trainepoch divides the
# gradients by the actual batch size itself. The reduction mode is fixed by the
# constructor arguments, so assigning `loss_fn.size_average = False` afterwards (as was
# done before) left the loss averaged and the gradients were normalised twice.
# Side effect: the logged loss is now the per-batch sum rather than the per-sample mean.
weight = torch.FloatTensor(config_nli_model['n_classes']).fill_(1)
loss_fn = nn.CrossEntropyLoss(weight=weight, reduction='sum')

# cuda by default
# The model is moved to the GPU before the optimizer is constructed, as the torch.optim
# documentation recommends.
nli_net.cuda()
loss_fn.cuda()

# optimizer
optim_fn, optim_parameters = get_optimizer(config_nli_model['optimizer'])
optimizer = optim_fn(nli_net.parameters(), **optim_parameters)


# Global training state read and updated by evaluate().
val_acc_best = -1e10   # best validation accuracy seen so far
adam_stop = False      # set after the first non-improving validation epoch (adam early stopping)
stop_training = False  # when True the training loop ends
lr = optim_parameters['lr'] if 'sgd' in config_nli_model['optimizer'] else None

NLINet(
  (encoder): InferSent(
    (enc_lstm): LSTM(300, 2048, bidirectional=True)
  )
  (classifier): Sequential(
    (0): Linear(in_features=16384, out_features=2, bias=True)
  )
)


In [None]:
# some name changes...
train = training
valid = validation
test = testing

In [None]:
 # Label of the second training pair (index 2 of a split is the label array).
 train[2][1]

0

# Training

In [None]:
def trainepoch(epoch):
    """
    Run one training epoch over the (reshuffled) training split.

    Relies on the notebook-level globals nli_net, optimizer, loss_fn, train,
    word_vec and config_nli_model, and requires a CUDA device.

    epoch: 1-based epoch number; used for logging and, for sgd optimizers,
           to apply the per-epoch learning-rate decay from the second epoch on.

    return: training accuracy of this epoch in percent, rounded to 2 decimals
            (0. if the training split is empty).
    """
    print('\nTRAINING : Epoch ' + str(epoch))
    nli_net.train()
    bsize = config_nli_model['bsize']
    emb_dim = config_nli_model['word_emb_dim']

    all_costs = []
    logs = []
    words_count = 0

    last_time = time.time()
    # plain Python counter; every batch contributes an int via .item()
    correct = 0
    # shuffle the data
    permutation = np.random.permutation(len(train[0]))

    s1 = train[0][permutation]
    s2 = train[1][permutation]
    target = train[2][permutation]

    # per-epoch decay applies to sgd only, and not before the second epoch
    if epoch > 1 and 'sgd' in config_nli_model['optimizer']:
        optimizer.param_groups[0]['lr'] = optimizer.param_groups[0]['lr'] * config_nli_model['decay']
    print('Learning rate : {0}'.format(optimizer.param_groups[0]['lr']))

    for stidx in range(0, len(s1), bsize):
        # prepare batch
        s1_batch, s1_len = get_batch(s1[stidx:stidx + bsize], word_vec, emb_dim)
        s2_batch, s2_len = get_batch(s2[stidx:stidx + bsize], word_vec, emb_dim)
        s1_batch, s2_batch = Variable(s1_batch.cuda()), Variable(s2_batch.cuda())
        tgt_batch = Variable(torch.LongTensor(target[stidx:stidx + bsize])).cuda()
        k = s1_batch.size(1)  # actual batch size

        # model forward
        output = nli_net((s1_batch, s1_len), (s2_batch, s2_len))

        pred = output.data.max(1)[1]
        correct += pred.long().eq(tgt_batch.data.long()).sum().item()
        assert len(pred) == len(s1[stidx:stidx + bsize])

        # loss
        loss = loss_fn(output, tgt_batch)
        all_costs.append(loss.item())
        words_count += (s1_batch.nelement() + s2_batch.nelement()) / emb_dim

        # backward
        optimizer.zero_grad()
        loss.backward()

        # gradient clipping: implemented by shrinking the lr for this step only
        shrink_factor = 1
        squared_norm = 0.

        for p in nli_net.parameters():
            # parameters that took no part in the forward pass have no gradient
            if p.requires_grad and p.grad is not None:
                # divide by the actual batch size
                # (assumes loss_fn sums over the batch - TODO confirm where loss_fn is built)
                p.grad.data.div_(k)
                squared_norm += p.grad.data.norm().item() ** 2
        total_norm = squared_norm ** 0.5

        if total_norm > config_nli_model['max_norm']:
            shrink_factor = config_nli_model['max_norm'] / total_norm
        current_lr = optimizer.param_groups[0]['lr'] # current lr (no external "lr", for adam)
        optimizer.param_groups[0]['lr'] = current_lr * shrink_factor # just for update

        # optimizer step
        optimizer.step()
        optimizer.param_groups[0]['lr'] = current_lr  # restore the un-shrunk lr

        # report and reset the running statistics every 1000 batches
        if len(all_costs) == 1000:
            logs.append('{0} ; loss {1} ; sentence/s {2} ; words/s {3} ; accuracy train : {4}'.format(
                            stidx,
                            round(np.mean(all_costs), 5),
                            int(len(all_costs) * bsize / (time.time() - last_time)),
                            int(words_count * 1.0 / (time.time() - last_time)),
                            round(100. * correct / (stidx + k), 2)))
            print(logs[-1])
            last_time = time.time()
            words_count = 0
            all_costs = []
    train_acc = round(100 * correct / len(s1), 2) if len(s1) else 0.
    print('results : epoch {0} ; mean accuracy train : {1}'
          .format(epoch, train_acc))
    return train_acc

In [None]:
def evaluate(epoch, eval_type='valid', final_eval=False):
    """
    Compute accuracy on the validation or test split.

    Relies on the notebook-level globals nli_net, optimizer, valid, test,
    word_vec and config_nli_model, and requires a CUDA device.

    epoch:      epoch number used for logging. Checkpointing and the
                lr-shrink / early-stopping logic only run when
                eval_type == 'valid' and epoch <= config_nli_model['n_epochs'].
    eval_type:  'valid' selects the validation split, anything else the test split.
    final_eval: only changes the format of the printed result line.

    Side effects (validation within n_epochs only): saves the model state_dict
    when the accuracy beats val_acc_best; otherwise shrinks the lr (sgd) or
    advances the two-step early-stopping flags adam_stop / stop_training (adam).

    return: accuracy in percent, rounded to 3 decimals (0. for an empty split).
    """
    global val_acc_best, lr, stop_training, adam_stop
    nli_net.eval()
    bsize = config_nli_model['bsize']
    emb_dim = config_nli_model['word_emb_dim']
    correct = 0

    if eval_type == 'valid':
        print('\nVALIDATION : Epoch {0}'.format(epoch))

    split = valid if eval_type == 'valid' else test
    s1, s2, target = split[0], split[1], split[2]

    # no gradients are needed for scoring; skipping them saves memory and time
    with torch.no_grad():
        for i in range(0, len(s1), bsize):
            # prepare batch
            s1_batch, s1_len = get_batch(s1[i:i + bsize], word_vec, emb_dim)
            s2_batch, s2_len = get_batch(s2[i:i + bsize], word_vec, emb_dim)
            s1_batch, s2_batch = Variable(s1_batch.cuda()), Variable(s2_batch.cuda())
            tgt_batch = Variable(torch.LongTensor(target[i:i + bsize])).cuda()

            # model forward
            output = nli_net((s1_batch, s1_len), (s2_batch, s2_len))

            pred = output.data.max(1)[1]
            correct += pred.long().eq(tgt_batch.data.long()).sum().item()

    eval_acc = round(100 * correct / len(s1), 3) if len(s1) else 0.
    if final_eval:
        print('finalgrep : accuracy {0} : {1}'.format(eval_type, eval_acc))
    else:
        print('togrep : results : epoch {0} ; mean accuracy {1} :\
              {2}'.format(epoch, eval_type, eval_acc))

    # save model / adapt the schedule
    if eval_type == 'valid' and epoch <= config_nli_model['n_epochs']:
        if eval_acc > val_acc_best:
            print('saving model at epoch {0}'.format(epoch))
            if not os.path.exists(config_nli_model['outputdir']):
                os.makedirs(config_nli_model['outputdir'])
            torch.save(nli_net.state_dict(), os.path.join(config_nli_model['outputdir'],
                       config_nli_model['outputmodelname']))
            val_acc_best = eval_acc
        else:
            if 'sgd' in config_nli_model['optimizer']:
                optimizer.param_groups[0]['lr'] = optimizer.param_groups[0]['lr'] / config_nli_model['lrshrink']
                print('Shrinking lr by : {0}. New lr = {1}'
                      .format(config_nli_model['lrshrink'],
                              optimizer.param_groups[0]['lr']))
                if optimizer.param_groups[0]['lr'] < config_nli_model['minlr']:
                    stop_training = True
            if 'adam' in config_nli_model['optimizer']:
                # early stopping (at 2nd decrease in accuracy):
                # the first miss only arms adam_stop, the second one stops training
                stop_training = adam_stop
                adam_stop = True
    return eval_acc


In [None]:
"""
Train model on Natural Language Inference task
"""
# NOTE: in this notebook the model is trained on the Quora pairs loaded above, with
# is_duplicate as the two-class label.
epoch = 1

# Per-epoch accuracies, plotted in the next cell.
train_history = []
val_history = []

# evaluate() updates the global stop_training flag (lr below minlr for sgd, second
# non-improving validation epoch for adam), which ends the loop before n_epochs.
while not stop_training and epoch <= config_nli_model['n_epochs']:
    train_acc = trainepoch(epoch)
    eval_acc = evaluate(epoch, 'valid')
    epoch += 1
    train_history.append(train_acc)
    val_history.append(eval_acc)

# Run best model on test set.
# The checkpoint is the state_dict that evaluate() saved at the best validation accuracy.
nli_net.load_state_dict(torch.load(os.path.join(config_nli_model['outputdir'], config_nli_model['outputmodelname'])))

print('\nTEST : Epoch {0}'.format(epoch))
# epoch=1e6 is larger than n_epochs, so evaluate() skips its checkpoint / early-stopping branch.
evaluate(1e6, 'valid', True)
# eval_type 'test' never reaches that branch either.
evaluate(0, 'test', True)

# Save encoder instead of full model
# torch.save(nli_net.encoder.state_dict(), os.path.join(config_nli_model['outputdir'], config_nli_model['outputmodelname'] + '.encoder.pkl'))



TRAINING : Epoch 1
Learning rate : 0.001
63936 ; loss 0.53297 ; sentence/s 115 ; words/s 7921 ; accuracy train : 76.74
127936 ; loss 0.50052 ; sentence/s 115 ; words/s 7821 ; accuracy train : 78.53
191936 ; loss 0.49187 ; sentence/s 115 ; words/s 7825 ; accuracy train : 79.43
results : epoch 1 ; mean accuracy train : 79.86

VALIDATION : Epoch 1
togrep : results : epoch 1 ; mean accuracy valid :              81.95
saving model at epoch 1

TRAINING : Epoch 2
Learning rate : 0.001
63936 ; loss 0.46337 ; sentence/s 115 ; words/s 7808 ; accuracy train : 84.38
127936 ; loss 0.46496 ; sentence/s 116 ; words/s 7897 ; accuracy train : 84.28
191936 ; loss 0.46118 ; sentence/s 115 ; words/s 7726 ; accuracy train : 84.38
results : epoch 2 ; mean accuracy train : 84.4

VALIDATION : Epoch 2
togrep : results : epoch 2 ; mean accuracy valid :              83.139
saving model at epoch 2

TRAINING : Epoch 3
Learning rate : 0.001
63936 ; loss 0.43428 ; sentence/s 118 ; words/s 7904 ; accuracy train : 87

KeyboardInterrupt: ignored

In [None]:
# Summarize the accuracy history collected by the training loop.
# The second curve is val_history (validation accuracy), so it is labelled accordingly;
# the test split is only scored once, after training.
plt.plot(train_history, label='train')
plt.plot(val_history, label='validation')
plt.title('Quora e2e accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(loc='upper left')
plt.show()

In [None]:
def inference(s1, s2, in_set=False):
  """
  Predict the class of the FIRST (premise, hypothesis) pair.

  s1: LIST of premise
  s2: LIST of hypothesis
  in_set: True when s1 / s2 are already tokenised (e.g. slices of train /
          valid / test). When False they are raw strings and get the same
          text_prepare + preprocess treatment as the training data.

  Only element 0 of each list is scored; the rest is ignored.
  Relies on the globals nli_net, word_vec and config_nli_model, requires a
  CUDA device, and leaves nli_net in eval mode.

  return: predicted class index as a Python int
  """
  if not in_set:
    # Apply the same normalisation as the training data; without it, words that
    # text_prepare would have changed (e.g. capitalised ones) do not match the
    # vocabulary built from normalised text and are dropped by preprocess.
    s1 = preprocess([text_prepare(sentence) for sentence in s1])
    s2 = preprocess([text_prepare(sentence) for sentence in s2])

  s1_batch, s1_len = get_batch(s1[0:1], word_vec, config_nli_model['word_emb_dim'])
  s2_batch, s2_len = get_batch(s2[0:1], word_vec, config_nli_model['word_emb_dim'])
  s1_batch, s2_batch = Variable(s1_batch.cuda()), Variable(s2_batch.cuda())

  # model forward: eval mode, and no gradient tracking for a pure prediction
  nli_net.eval()
  with torch.no_grad():
    output = nli_net((s1_batch, s1_len), (s2_batch, s2_len))
  return output.data.max(1)[1].item()

In [None]:
# Spot-check a single training pair against its gold label.
index = 32

# inference() scores only the first pair of the slices it is given.
sentence1 = train[0][index: index+2]
sentence2 = train[1][index: index+2]
label = train[2][index]


print(sentence1[0])
print(sentence2[0])
print("Prediction is", inference(sentence1, sentence2, in_set=True))
print("Label is", label)

['<s>', 'what', 'should', 'a', 'developer', 'do', 'to', 'become', 'a', 'top', 'developer', 'on', 'google', 'play', '</s>']
['<s>', 'who', 'qualifies', 'to', 'be', 'a', 'top', 'developer', 'on', 'google', 'play', 'store', '</s>']
Prediction is 0
Label is 1
