In [None]:
!pip install transformers

In [2]:
from transformers import BertTokenizer, BertModel
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.legacy.data import Field, BucketIterator, TabularDataset

import spacy
import numpy as np

import random
import math
import time
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
bert = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = False,
                                  )

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [4]:
cls_token_idx = bert_tokenizer.cls_token_id
sep_token_idx = bert_tokenizer.sep_token_id
pad_token_idx = bert_tokenizer.pad_token_id
unk_token_idx = bert_tokenizer.unk_token_id
print(cls_token_idx, sep_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [5]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [6]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [7]:
"/content/drive/MyDrive/DLNLP project/socialnetwork.paraphrases.train.examples"
"/content/drive/MyDrive/DLNLP project/socialnetwork.paraphrases.test.examples"

with open('/content/drive/MyDrive/DLNLP project/publications.paraphrases.train.examples','r') as f:
  examples = f.readlines()
  lines = [e.strip().split('\n') for e in examples]


utterance = []
original = []

for line in lines:
  if line[0].find('(utterance') == 0:
    utterance.append(line[0][12:-2])
  if line[0].find('(original') == 0:
    original.append(line[0][11:-2])


import pandas as pd

df = pd.DataFrame({'utterance':utterance, 'original':original})

df['utterance'] = df['utterance'].apply(lambda x : ' '.join(['[CLS]'] + x.split() + ['[SEP]']))
# df['original'] = df['original'].apply(lambda x : ' '.join(['[CLS]'] + x.split() + ['[SEP]']))
df['utterance_mask'] = df['utterance'].apply(lambda x : ' '.join(['1']*len(bert_tokenizer.tokenize(x))))
df['original_mask'] = df['original'].apply(lambda x : ' '.join(['1']*len(x.split())))
df.to_csv('train_data.csv',index=False)

with open('/content/drive/MyDrive/DLNLP project/publications.paraphrases.test.examples','r') as f:
  examples = f.readlines()
  lines = [e.strip().split('\n') for e in examples]


utterance = []
original = []

for line in lines:
  if line[0].find('(utterance') == 0:
    utterance.append(line[0][12:-2])
  if line[0].find('(original') == 0:
    original.append(line[0][11:-2])

df = pd.DataFrame({'utterance':utterance, 'original':original})

df['utterance'] = df['utterance'].apply(lambda x : ' '.join(['[CLS]'] + x.split() + ['[SEP]']))
# df['original'] = df['original'].apply(lambda x : ' '.join(['[CLS]'] + x.split() + ['[SEP]']))
df['utterance_mask'] = df['utterance'].apply(lambda x : ' '.join(['1']*len(bert_tokenizer.tokenize(x))))
df['original_mask'] = df['original'].apply(lambda x : ' '.join(['1']*len(x.split())))
df.to_csv('test_data.csv',index=False)

In [8]:
train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")

In [9]:
train_df.tail()

Unnamed: 0,utterance,original,utterance_mask,original_mask
635,[CLS] what article cites the fewest articles [...,article that cites the least number of article,1 1 1 1 1 1 1 1 1,1 1 1 1 1 1 1 1
636,[CLS] what is an article that does not cite mu...,article that multivariate data analysis not cites,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1,1 1 1 1 1 1 1
637,[CLS] find an article with no more than two ve...,article that has at most two venue,1 1 1 1 1 1 1 1 1 1 1,1 1 1 1 1 1 1
638,[CLS] name an article found in two venues [SEP],article that has two venue,1 1 1 1 1 1 1 1 1,1 1 1 1 1
639,[CLS] what 2004 article was cited by multivari...,article whose publication date is 2004 and tha...,1 1 1 1 1 1 1 1 1 1 1 1 1,1 1 1 1 1 1 1 1 1 1 1 1


In [10]:
test_df.tail()

Unnamed: 0,utterance,original,utterance_mask,original_mask
156,[CLS] who is the author of an article cited by...,person that is author of article that multivar...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1,1 1 1 1 1 1 1 1 1 1 1
157,[CLS] which article won an award and has its v...,article that won an award and whose venue is a...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1,1 1 1 1 1 1 1 1 1 1 1 1
158,[CLS] name an article about multivariate data ...,article that cites multivariate data analysis ...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1,1 1 1 1 1 1 1 1 1 1 1 1 1
159,[CLS] articles with two or more authors [SEP],article that has at least two author,1 1 1 1 1 1 1 1,1 1 1 1 1 1 1
160,[CLS] articles that cite multivariate data ana...,article that cites article that cites multivar...,1 1 1 1 1 1 1 1 1 1,1 1 1 1 1 1 1 1 1


In [11]:
len(train_df['utterance'][2].split()),len(train_df['utterance_mask'][2].split())

(10, 12)

In [12]:
def bert_text_preparation(text, tokenizer):
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1]*len(indexed_tokens)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    return tokenized_text, tokens_tensor, segments_tensors
    
def get_bert_embeddings(tokens_tensor, segments_tensors, model):

    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        # Removing the first hidden state
        # The first state is the input state
        hidden_states = outputs[2]

    # Getting embeddings from the final BERT layer
    token_embeddings = hidden_states[-1]
    # Collapsing the tensor into 1-dimension
    token_embeddings = torch.squeeze(token_embeddings, dim=0)
    # Converting torchtensors to lists
    list_token_embeddings = [token_embed.tolist() for token_embed in token_embeddings]

    return list_token_embeddings

In [13]:
# spacy_en = spacy.load('en_core_web_sm')

def tokenizer(text):
  return bert_tokenizer.tokenize(text)

def normal_tokenizer(text):
  return text.split()

In [14]:
UTTERANCE = Field(sequential=True,
                  tokenize = tokenizer,
                  use_vocab = False,
                  preprocessing = bert_tokenizer.convert_tokens_to_ids, 
                  pad_token = pad_token_idx,
                  lower = True)

ORIGINAL = Field(sequential=True,
                tokenize = normal_tokenizer, 
                use_vocab = True,            
                init_token = '<sos>',     
                eos_token = '<eos>',
                pad_token = pad_token_idx,
                lower = True)

UTTERANCE_MASK = Field(sequential=True,
                  tokenize = lambda x : x.split(),
                  use_vocab = False, 
                  preprocessing = lambda x : [int(i) for i in x],
                  pad_token = 0
                  )

ORIGINAL_MASK = Field(sequential=True,
                tokenize = lambda x : x.split(), 
                use_vocab = False,        
                preprocessing = lambda x : [int(i) for i in x],         
                pad_token = 0
                )


fields = [('utterance', UTTERANCE), ('original', ORIGINAL),('utterance_mask', UTTERANCE_MASK), ('original_mask', ORIGINAL_MASK)] # ('token_type', TTYPE), ('start',START), ('end',END)]

In [15]:
train_data, test_data = TabularDataset.splits(
                                        path = '/content/',
                                        train = 'train_data.csv',
                                        test = 'test_data.csv',
                                        format = 'csv',
                                        fields = fields,
                                        skip_header = True)
print(type(train_data))

<class 'torchtext.legacy.data.dataset.TabularDataset'>


In [16]:
UTTERANCE.build_vocab(train_data,test_data) # train_data must be of type <class 'torchtext.legacy.data.dataset.TabularDataset'>
ORIGINAL.build_vocab(train_data,test_data)  # train_data must be of type <class 'torchtext.legacy.data.dataset.TabularDataset'
UTTERANCE_MASK.build_vocab(train_data,test_data) # train_data must be of type <class 'torchtext.legacy.data.dataset.TabularDataset'>
ORIGINAL_MASK.build_vocab(train_data,test_data)  # train_data must be of type <class 'torchtext.legacy.data.dataset.TabularDataset'>

In [17]:
len(ORIGINAL.vocab),len(UTTERANCE.vocab),len(ORIGINAL_MASK.vocab),len(UTTERANCE_MASK.vocab)

(46, 224, 3, 3)

In [18]:
BATCH_SIZE = 4

train_iterator, test_iterator = BucketIterator.splits(
                                                      (train_data, test_data), 
                                                      batch_size = BATCH_SIZE,
                                                      sort_key = lambda x : len(x.original),
                                                      shuffle = False,
                                                      device = device)

In [51]:
class Encoder(nn.Module):
    
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, bert):
        super().__init__()
        
        self.embedding = bert
        
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True, batch_first = False)
        self.pre_fc = nn.Linear(768,emb_dim)        
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src, src_mask):
        
        #src = [src len, batch size]
        
        embedded = self.dropout(self.pre_fc(self.embedding(src,src_mask)[0]))
        
        #embedded = [src len, batch size, emb dim]
        
        outputs, hidden = self.rnn(embedded)
                
        #outputs = [src len, batch size, hid dim * num directions]
        #hidden = [n layers * num directions, batch size, hid dim]
        
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))
        
        #outputs = [src len, batch size, enc hid dim * 2]
        #hidden = [batch size, dec hid dim]
        
        return outputs, hidden

In [52]:
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)
        
    def forward(self, hidden, encoder_outputs):
        
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src len, batch size, enc hid dim * 2]
        
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]

        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1) # [b_z, src_len, dec hid dim]
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        #hidden = [batch size, src len, dec hid dim]
        #encoder_outputs = [batch size, src len, enc hid dim * 2]
        
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2))) 
        
        #energy = [batch size, src len, dec hid dim]

        attention = self.v(energy).squeeze(-1)
        
        #attention= [batch size, src len]
        
        return F.softmax(attention, dim=-1)

In [53]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()

        self.output_dim = output_dim
        self.attention = attention
        
        self.embedding = nn.Embedding(output_dim,emb_dim)
        
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        
        self.fc_out = nn.Linear( dec_hid_dim , output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, encoder_outputs):
             
        #input = [batch size]
        #hidden = [batch size, dec hid dim]
        #encoder_outputs = [src len, batch size, enc hid dim * 2]
        
        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
        
        a = self.attention(hidden, encoder_outputs)
                
        #a = [batch size, src len]
        
        a = a.unsqueeze(1)
        
        #a = [batch size, 1, src len]
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        
        #encoder_outputs = [batch size, src len, enc hid dim * 2]
        weighted = torch.bmm(a, encoder_outputs)
        
        #weighted = [batch size, 1, enc hid dim * 2]
        
        weighted = weighted.permute(1, 0, 2)
        
        #weighted = [1, batch size, enc hid dim * 2]
        
        rnn_input = torch.cat((embedded, weighted), dim = 2)
        
        #rnn_input = [1, batch size, (enc hid dim * 2) + emb dim]
            
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        
        #output = [seq len, batch size, dec hid dim * n directions]
        #hidden = [n layers * n directions, batch size, dec hid dim]
        
        #output = [1, batch size, dec hid dim]
        #hidden = [1, batch size, dec hid dim]
        # assert (output == hidden).all()
        
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        
        # prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1))
        prediction = self.fc_out(output)
        
        #prediction = [batch size, output dim]
        
        return prediction, hidden.squeeze(0)

In [54]:

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, src_mask, trg, teacher_forcing_ratio = 0.5):
        
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(src, src_mask)
        
        # <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):

            output, hidden = self.decoder(input, hidden, encoder_outputs)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            
            top1 = output.argmax(1) 
            
            input = trg[t] if teacher_force else top1

        return outputs

In [55]:
INPUT_DIM = len(UTTERANCE.vocab)
OUTPUT_DIM = len(ORIGINAL.vocab)
ENC_EMB_DIM = 128  # from BERT
DEC_EMB_DIM = 128  # from BERT
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

attn = Attention(ENC_HID_DIM, DEC_HID_DIM).to(device)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT, bert).to(device)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn).to(device)

model = Seq2Seq(enc, dec, device).to(device)

In [None]:
def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)  #  initialise bias with 0
            
model.apply(init_weights)

In [57]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 115,453,614 trainable parameters


In [58]:
optimizer = optim.Adam(model.parameters(), lr = 7e-6)

TRG_PAD_IDX = ORIGINAL.vocab.stoi[ORIGINAL.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [59]:

def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(train_iterator):
        
        src = batch.utterance
        src_mask = batch.utterance_mask
        trg = batch.original

        # print(src.shape,src_mask.shape, trg.shape)
        
        optimizer.zero_grad()
        output = model(src, src_mask, trg)
        
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [60]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.utterance
            src_mask = batch.utterance_mask
            trg = batch.original

            output = model(src, src_mask, trg, 0) 

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            #trg = [(trg len - 1) * batch size] , excludes '<sos>'
            #output = [(trg len - 1) * batch size, output dim],  excludes '<sos>'

            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)



In [61]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [70]:
N_EPOCHS = 12
CLIP = 1

# best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, test_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'seq2seq+bert-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} |  Val. Loss: {valid_loss:.3f}')


Epoch: 01 | Time: 0m 30s
	Train Loss: 2.231 |  Val. Loss: 2.349
Epoch: 02 | Time: 0m 30s
	Train Loss: 2.229 |  Val. Loss: 2.335
Epoch: 03 | Time: 0m 30s
	Train Loss: 2.216 |  Val. Loss: 2.337
Epoch: 04 | Time: 0m 30s
	Train Loss: 2.216 |  Val. Loss: 2.337
Epoch: 05 | Time: 0m 30s
	Train Loss: 2.213 |  Val. Loss: 2.333
Epoch: 06 | Time: 0m 30s
	Train Loss: 2.208 |  Val. Loss: 2.327
Epoch: 07 | Time: 0m 30s
	Train Loss: 2.201 |  Val. Loss: 2.323
Epoch: 08 | Time: 0m 30s
	Train Loss: 2.199 |  Val. Loss: 2.323
Epoch: 09 | Time: 0m 30s
	Train Loss: 2.191 |  Val. Loss: 2.322
Epoch: 10 | Time: 0m 30s
	Train Loss: 2.187 |  Val. Loss: 2.314
Epoch: 11 | Time: 0m 30s
	Train Loss: 2.181 |  Val. Loss: 2.316
Epoch: 12 | Time: 0m 30s
	Train Loss: 2.174 |  Val. Loss: 2.317


In [71]:
model = Seq2Seq(enc, dec, device).to(device)
PATH = '/content/seq2seq+bert-model.pt'
model.load_state_dict(torch.load(PATH, map_location=device))

<All keys matched successfully>

In [72]:
def match(model, iterator):
    
    model.eval()
    
    epoch_loss = 0
    exact_match = 0
    op_list = []
    gold_list = []
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.utterance
            src_mask = batch.utterance_mask
            trg = batch.original
            # print(src)

            # for batch in iterator:
            #   for i in batch.original:
            #     print([ORIGINAL.vocab.itos[x] for x in i.detach().cpu().numpy().tolist()])

            output = model(src, src_mask, trg, 0) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            assert (trg.shape[0],trg.shape[1]) == (output.shape[0],output.shape[1])

            output_dim = output.shape[-1]
            
            output = output.argmax(dim=-1).permute(1,0)
            output[:,0] = 2
            trg = trg.permute(1,0)


            exact_match += torch.sum(torch.all(output == trg, dim=1))

            for b in range(trg.shape[0]):
              temp = []
              gemp = []
              for i in range(trg.shape[1]):
                temp.append(ORIGINAL.vocab.itos[output[b,i]])
                gemp.append(ORIGINAL.vocab.itos[trg[b,i]])
              op_list.append(temp)
              gold_list.append(gemp)


    return exact_match, op_list, gold_list


In [74]:
model.load_state_dict(torch.load('/content/seq2seq+bert-model.pt',map_location=device))

exact_match, op_list, gold_list = match(model, test_iterator)

print(f'Exact match : {exact_match}')

Exact match : 33


In [34]:
len(test_df)

161