In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.legacy.data import Field, BucketIterator, TabularDataset

import spacy
import numpy as np

import random
import math
import time
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [3]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [4]:
"/content/drive/MyDrive/DLNLP project/socialnetwork.paraphrases.train.examples"
"/content/drive/MyDrive/DLNLP project/socialnetwork.paraphrases.test.examples"

with open('/content/drive/MyDrive/DLNLP project/publications.paraphrases.train.examples','r') as f:
  examples = f.readlines()
  lines = [e.strip().split('\n') for e in examples]


utterance = []
original = []

for line in lines:
  if line[0].find('(utterance') == 0:
    utterance.append(line[0][12:-2])
  if line[0].find('(original') == 0:
    original.append(line[0][11:-2])


import pandas as pd
pd.DataFrame({'utterance':utterance, 'original':original}).to_csv('train_data.csv',index=False)

with open('/content/drive/MyDrive/DLNLP project/publications.paraphrases.test.examples','r') as f:
  examples = f.readlines()
  lines = [e.strip().split('\n') for e in examples]


utterance = []
original = []

for line in lines:
  if line[0].find('(utterance') == 0:
    utterance.append(line[0][12:-2])
  if line[0].find('(original') == 0:
    original.append(line[0][11:-2])

pd.DataFrame({'utterance':utterance, 'original':original}).to_csv('test_data.csv',index=False)

In [5]:
train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")

In [6]:
train_df.tail()

Unnamed: 0,utterance,original
635,what article cites the fewest articles,article that cites the least number of article
636,what is an article that does not cite multivar...,article that multivariate data analysis not cites
637,find an article with no more than two venues,article that has at most two venue
638,name an article found in two venues,article that has two venue
639,what 2004 article was cited by multivariate da...,article whose publication date is 2004 and tha...


In [7]:
len(train_df)

640

In [8]:
test_df.tail()

Unnamed: 0,utterance,original
156,who is the author of an article cited by multi...,person that is author of article that multivar...
157,which article won an award and has its venue a...,article that won an award and whose venue is a...
158,name an article about multivariate data analys...,article that cites multivariate data analysis ...
159,articles with two or more authors,article that has at least two author
160,articles that cite multivariate data analysis,article that cites article that cites multivar...


In [9]:
len(test_df)

161

In [10]:
spacy_en = spacy.load('en_core_web_sm')

def tokenizer(text):
  # return text.split()
  return [tok.text for tok in spacy_en.tokenizer(text)]

In [11]:
UTTERANCE = Field(sequential=True,
                  tokenize = tokenizer,
                  use_vocab = True, 
                  init_token = '<sos>', 
                  eos_token = '<eos>', 
                  lower = True)

ORIGINAL = Field(sequential=True,
                tokenize = tokenizer, 
                use_vocab = True,                 
                init_token = '<sos>', 
                eos_token = '<eos>', 
                lower = True)


fields = [('utterance', UTTERANCE), ('original', ORIGINAL)] # ('token_type', TTYPE), ('start',START), ('end',END)]

In [12]:
train_data, test_data = TabularDataset.splits(
                                        path = '/content/',
                                        train = 'train_data.csv',
                                        test = 'test_data.csv',
                                        format = 'csv',
                                        fields = fields,
                                        skip_header = True)
print(type(train_data))

<class 'torchtext.legacy.data.dataset.TabularDataset'>


In [13]:
UTTERANCE.build_vocab(train_data,test_data) # train_data must be of type <class 'torchtext.legacy.data.dataset.TabularDataset'>
ORIGINAL.build_vocab(train_data,test_data)  # train_data must be of type <class 'torchtext.legacy.data.dataset.TabularDataset'

In [14]:
len(ORIGINAL.vocab),len(UTTERANCE.vocab)

(46, 205)

In [15]:
UTTERANCE.vocab.stoi['<sos>'],ORIGINAL.vocab.stoi['<sos>']

(2, 2)

In [16]:
print(test_data[0].__dict__.keys())
print(test_data[2].__dict__.values())
# print(train_data[1].start,train_data[1].end)

dict_keys(['utterance', 'original'])
dict_values([['what', 'person', 'is', 'not', 'the', 'author', 'of', 'multivariate', 'data', 'analysis'], ['person', 'that', 'is', 'not', 'author', 'of', 'multivariate', 'data', 'analysis']])


In [17]:
BATCH_SIZE = 4

train_iterator, test_iterator = BucketIterator.splits(
                                                      (train_data, test_data), 
                                                      batch_size = BATCH_SIZE,
                                                      sort_key = lambda x : len(x.original),
                                                      shuffle = False,
                                                      device = device)

In [18]:
# for batch in test_iterator:
#   for i in batch.utterance:
#     print([UTTERANCE.vocab.itos[x] for x in i.detach().cpu().numpy().tolist()])
    

In [19]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)
        
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        
        #src = [src len, batch size]
        
        embedded = self.dropout(self.embedding(src))
        
        #embedded = [src len, batch size, emb dim]
        
        outputs, hidden = self.rnn(embedded)
                
        #outputs = [src len, batch size, hid dim * num directions]
        #hidden = [n layers * num directions, batch size, hid dim]
        
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))
        
        #outputs = [src len, batch size, enc hid dim * 2]
        #hidden = [batch size, dec hid dim]
        
        return outputs, hidden

In [20]:
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)
        
    def forward(self, hidden, encoder_outputs):
        
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src len, batch size, enc hid dim * 2]
        
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        
        #repeat decoder hidden state src_len times
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        #hidden = [batch size, src len, dec hid dim]
        #encoder_outputs = [batch size, src len, enc hid dim * 2]
        
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2))) 
        
        #energy = [batch size, src len, dec hid dim]

        attention = self.v(energy).squeeze(2)
        
        #attention= [batch size, src len]
        
        return F.softmax(attention, dim=1)

In [21]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention, encoder):
        super().__init__()

        self.output_dim = output_dim
        self.attention = attention
        
        self.embedding = nn.Embedding(output_dim,emb_dim)
        
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        
        self.fc_out = nn.Linear( dec_hid_dim , output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, encoder_outputs):
             
        #input = [batch size]
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src len, batch size, enc hid dim * 2]
        
        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
        
        a = self.attention(hidden, encoder_outputs)
                
        #a = [batch size, src len]
        
        a = a.unsqueeze(1)
        
        #a = [batch size, 1, src len]
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        #encoder_outputs = [batch size, src len, enc hid dim * 2]
        
        weighted = torch.bmm(a, encoder_outputs)
        
        #weighted = [batch size, 1, enc hid dim * 2]
        
        weighted = weighted.permute(1, 0, 2)
        
        #weighted = [1, batch size, enc hid dim * 2]
        
        rnn_input = torch.cat((embedded, weighted), dim = 2)
        
        #rnn_input = [1, batch size, (enc hid dim * 2) + emb dim]
            
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        
        #output = [seq len, batch size, dec hid dim * n directions]
        #hidden = [n layers * n directions, batch size, dec hid dim]
        
        #output = [1, batch size, dec hid dim]
        #hidden = [1, batch size, dec hid dim]
        assert (output == hidden).all()
        
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        
        # prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1))
        prediction = self.fc_out(output)
        
        #prediction = [batch size, output dim]
        
        return prediction, hidden.squeeze(0)

In [22]:

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        encoder_outputs, hidden = self.encoder(src)
                
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            
            top1 = output.argmax(1) 
            input = trg[t] if teacher_force else top1

        return outputs

In [23]:
INPUT_DIM = len(UTTERANCE.vocab)
OUTPUT_DIM = len(ORIGINAL.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

attn = Attention(ENC_HID_DIM, DEC_HID_DIM).to(device)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT).to(device)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn, enc).to(device)

model = Seq2Seq(enc, dec, device).to(device)

In [24]:
def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)  #  initialise bias with 0
            
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(205, 256)
    (rnn): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(46, 256)
    (rnn): GRU(1280, 512)
    (fc_out): Linear(in_features=512, out_features=46, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [25]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 6,521,134 trainable parameters


In [26]:
optimizer = optim.Adam(model.parameters(), lr = 3e-4)

TRG_PAD_IDX = ORIGINAL.vocab.stoi[ORIGINAL.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [27]:

def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(train_iterator):
        
        src = batch.utterance
        trg = batch.original

        # print(src.shape,trg.shape)
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [28]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.utterance
            trg = batch.original

            output = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            #trg = [(trg len - 1) * batch size] , excludes '<sos>'
            #output = [(trg len - 1) * batch size, output dim],  excludes '<sos>'

            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)



In [29]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [31]:
N_EPOCHS = 12
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    # valid_loss = evaluate(model, test_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if train_loss < best_train_loss:
        best_train_loss = train_loss
        torch.save(model.state_dict(), 'seq2seq.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Val. Loss: {valid_loss:.3f}')

Epoch: 01 | Time: 0m 9s
	Train Loss: 2.105 | Val. Loss: 2.447
Epoch: 02 | Time: 0m 8s
	Train Loss: 1.856 | Val. Loss: 2.462
Epoch: 03 | Time: 0m 8s
	Train Loss: 1.618 | Val. Loss: 2.478
Epoch: 04 | Time: 0m 8s
	Train Loss: 1.443 | Val. Loss: 2.287
Epoch: 05 | Time: 0m 8s
	Train Loss: 1.238 | Val. Loss: 2.364
Epoch: 06 | Time: 0m 8s
	Train Loss: 1.138 | Val. Loss: 2.226
Epoch: 07 | Time: 0m 8s
	Train Loss: 0.987 | Val. Loss: 2.175
Epoch: 08 | Time: 0m 8s
	Train Loss: 0.925 | Val. Loss: 1.958
Epoch: 09 | Time: 0m 8s
	Train Loss: 0.811 | Val. Loss: 1.923
Epoch: 10 | Time: 0m 8s
	Train Loss: 0.767 | Val. Loss: 1.956
Epoch: 11 | Time: 0m 8s
	Train Loss: 0.726 | Val. Loss: 1.863
Epoch: 12 | Time: 0m 8s
	Train Loss: 0.642 | Val. Loss: 1.800


In [34]:
model = Seq2Seq(enc, dec, device).to(device)
# PATH = 
# model.load_state_dict(torch.load(PATH, map_location=device))

In [32]:
def match(model, iterator):
    
    model.eval()
    
    epoch_loss = 0
    exact_match = 0
    op_list = []
    gold_list = []
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.utterance
            trg = batch.original
            # print(src)

            # for batch in iterator:
            #   for i in batch.original:
            #     print([ORIGINAL.vocab.itos[x] for x in i.detach().cpu().numpy().tolist()])

            output = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            assert (trg.shape[0],trg.shape[1]) == (output.shape[0],output.shape[1])

            output_dim = output.shape[-1]
            
            output = output.argmax(dim=-1).permute(1,0)
            output[:,0] = 2
            trg = trg.permute(1,0)


            exact_match += torch.sum(torch.all(output == trg, dim=1))

            for b in range(trg.shape[0]):
              temp = []
              gemp = []
              for i in range(trg.shape[1]):
                temp.append(ORIGINAL.vocab.itos[output[b,i]])
                gemp.append(ORIGINAL.vocab.itos[trg[b,i]])
              op_list.append(temp)
              gold_list.append(gemp)


    return exact_match, op_list, gold_list


In [42]:
model.load_state_dict(torch.load('/content/seq2seq.pt',map_location=device))

exact_match, op_list, gold_list = match(model, test_iterator)

print(f'Exact match : {exact_match}')

Exact match : 42


In [43]:
len(test_df)

161