In [1]:
import torch
import torch.nn as nn

import torch.nn.functional as F

from torch.cuda.amp import autocast, GradScaler

from torch.utils.tensorboard import SummaryWriter

In [2]:
from torchtext.legacy.data import BucketIterator, Field, TabularDataset

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
import copy
import json
import random
import time

In [5]:
import spacy
import pkbar

In [6]:
SEED = 42069

# Seed PyTorch (CPU, then the current CUDA device) and Python's own RNG
# with the same value so runs are repeatable.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
random.seed(SEED)

# Ask cuDNN to pick deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [7]:
# Run on the GPU whenever PyTorch can see one, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

In [8]:
# !python -m spacy download en_core_web_sm
# !python -m spacy download es_core_news_sm

In [9]:
# Load the small Spanish and English spaCy pipelines; the tokenize_* helpers
# below use their tokenizers.
spacy_esp = spacy.load('es_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

In [10]:
def tokenize_esp(text):
    """Tokenize Spanish text with spaCy and return the token strings reversed.

    The last token of ``text`` comes first in the returned list;
    ``generate_translation`` flips its output back to reading order.
    """
    tokens = [token.text for token in spacy_esp.tokenizer(text)]
    tokens.reverse()
    return tokens

def tokenize_en(text):
    """Tokenize English text with spaCy and return the token strings in order."""
    tokens = []
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens

In [11]:
# English (source) field: spaCy tokens, lower-cased, wrapped in <sos>/<eos>,
# and batched with the batch dimension first.
SRC = Field(
    sequential=True,
    tokenize=tokenize_en,
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
    batch_first=True,
)

In [12]:
# !python -m spacy download es_core_news_lg

In [13]:
# Spanish (target) field: same settings as the source field, but tokenize_esp
# hands back each sentence's tokens in reverse order.
TRG = Field(
    sequential=True,
    tokenize=tokenize_esp,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
    batch_first=True,
)

In [14]:
# (name, Field) pairs for the dataset: examples expose `.eng` (SRC) and `.spa` (TRG).
datasetfields = [("eng", SRC),("spa", TRG)]

In [15]:
# Read the parallel corpus from ./eng-spa.csv, skipping its header row.
data =  TabularDataset('./eng-spa.csv', format='csv', fields=datasetfields, skip_header=True)

In [16]:
# Split the corpus into train / validation / test parts with ratios 0.8 / 0.1 / 0.1.
# NOTE(review): torchtext documents the ratio list order as [train, test, valid];
# harmless here because both are 0.1 — confirm before changing the ratios.
train_data, val_data, test_data = data.split([0.8, 0.1, 0.1])

In [None]:
# Build both vocabularies from the training split only; min_freq=1 is the
# lowest frequency threshold, so no training token is dropped for rarity.
SRC.build_vocab(train_data, min_freq=1)
TRG.build_vocab(train_data, min_freq=1)

In [None]:
# Short aliases for the two vocabularies; used to build the model and at inference.
eng_vocab = SRC.vocab
spa_vocab = TRG.vocab

In [None]:
# Batch size passed to BucketIterator.splits for all three splits.
batch_size = 256

In [None]:
# One iterator per split; the sort key is the number of English tokens.
# NOTE(review): BucketIterator presumably uses it to batch sentences of similar length — confirm.
train_iter, val_iter, test_iter = BucketIterator.splits((train_data, val_data, test_data), batch_size=batch_size, sort_key=lambda x: len(x.eng))

In [None]:
class Encoder(nn.Module):
    """Bidirectional multi-layer GRU encoder over embedded source tokens.

    Args:
        vocab: object exposing ``itos``; its length sets the embedding table size.
        hidden_dim: GRU hidden size per direction.
        encoder_embedding_dim: size of each token embedding.
        num_layers: number of stacked GRU layers.
        dropout: dropout probability applied to the embeddings and between GRU layers.
    """

    def __init__(self, vocab, hidden_dim, encoder_embedding_dim, num_layers, dropout):
        super().__init__()

        self.vocab_size = len(vocab.itos)
        self.embedding_dim = encoder_embedding_dim
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim

        # Index 1 is registered as the padding index of the embedding table.
        self.emb = nn.Embedding(self.vocab_size, self.embedding_dim, padding_idx=1)

        self.GRU = nn.GRU(
            input_size=self.embedding_dim,
            hidden_size=hidden_dim,
            num_layers=self.num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True,
        )

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, sentence, hidden_state):
        """Embed ``sentence``, apply dropout and run the GRU from ``hidden_state``.

        Returns:
            The ``(output, final_hidden_state)`` pair produced by the GRU.
        """
        embedded = self.emb(sentence)
        embedded = self.dropout(embedded)
        return self.GRU(embedded, hidden_state)

    def initHidden(self, BATCH_SIZE):
        """Return an all-zero hidden state of shape (num_layers * 2, BATCH_SIZE, hidden_dim)."""
        directions = 2  # the GRU is bidirectional
        return torch.zeros((self.num_layers * directions, BATCH_SIZE, self.hidden_dim))


In [None]:
class Decoder(nn.Module):
    """Unidirectional multi-layer GRU decoder over embedded target tokens.

    Args:
        vocab: object exposing ``itos``; its length sets the embedding table size.
        hidden_dim: GRU hidden size.
        embedding_dim: size of each token embedding.
        num_layers: number of stacked GRU layers.
        dropout: dropout probability applied to the embeddings and between GRU layers.
    """

    def __init__(self, vocab, hidden_dim, embedding_dim, num_layers, dropout):
        super().__init__()

        self.vocab_size = len(vocab.itos)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.embedding_dim = embedding_dim

        self.GRU = nn.GRU(
            input_size=self.embedding_dim,
            hidden_size=hidden_dim,
            num_layers=self.num_layers,
            dropout=dropout,
            batch_first=True,
        )

        # Index 1 is registered as the padding index of the embedding table.
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim, padding_idx=1)

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, y, hidden_state, context_vector):
        """Embed ``y``, apply dropout and run the GRU from ``hidden_state``.

        ``context_vector`` is accepted for interface compatibility but is not
        used in the computation.

        Returns:
            The ``(output, final_hidden_state)`` pair produced by the GRU.
        """
        embedded = self.embedding(y)
        embedded = self.dropout(embedded)
        return self.GRU(embedded, hidden_state)

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder translation model with a dot-product attention helper.

    Args:
        hidden_dim: GRU hidden size (per direction in the encoder).
        embedding_dim: embedding size used by both encoder and decoder.
        hidden_layers: number of encoder GRU layers; the decoder gets twice as many.
        english_vocab: source vocabulary (must expose ``itos``).
        spanish_vocab: target vocabulary (must expose ``itos``).
        dropout: dropout probability forwarded to encoder and decoder.
    """

    def __init__(self, hidden_dim, embedding_dim, hidden_layers, english_vocab, spanish_vocab, dropout):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.hidden_layers = hidden_layers

        self.eng_vocab = english_vocab
        self.spa_vocab = spanish_vocab

        self.encoder = Encoder(self.eng_vocab, hidden_dim, embedding_dim, hidden_layers, dropout)
        # The bidirectional encoder yields hidden_layers * 2 hidden states, so the
        # unidirectional decoder is built with that many layers and can be seeded
        # directly with the encoder's final hidden state.
        self.decoder = Decoder(self.spa_vocab, hidden_dim, embedding_dim, hidden_layers * 2, dropout)

        # Projects decoder GRU outputs onto the target vocabulary.
        self.AttentionFC = nn.Linear(hidden_dim, self.decoder.vocab_size, bias=False)

    def DotProductAttention(self, encoder_outputs, decoder_hidden_state):
        """Compute dot-product attention of the decoder hidden state over encoder outputs.

        The encoder outputs are viewed as (batch, seq, 2, hidden_dim) and the two
        slices of the third axis are concatenated along the sequence axis, giving
        (batch, 2 * seq, hidden_dim). ``decoder_hidden_state`` is permuted to
        batch-first before the batched matrix products.

        Returns:
            ``(context_vector, attention_weights)``.
        """
        current_b_size = encoder_outputs.shape[0]

        encoder_outputs = encoder_outputs.view(current_b_size, -1, 2, self.hidden_dim)
        encoder_outputs = torch.cat((encoder_outputs[:, :, 0, :], encoder_outputs[:, :, 1, :]), 1)

        score = torch.bmm(decoder_hidden_state.permute(1, 0, 2), encoder_outputs.permute(0, 2, 1))
        attention_weights = F.softmax(score, dim=-1)
        context_vector = torch.bmm(attention_weights, encoder_outputs)
        return context_vector, attention_weights

    def forward(self, x, y, teacher_forcing = 0):
        """Encode ``x`` and decode one token per step for every position of ``y`` but the last.

        Args:
            x: batch-first tensor of source token ids.
            y: batch-first tensor of target token ids; column 0 is the start token.
            teacher_forcing: per-step probability of feeding the ground-truth token
                ``y[:, i]`` instead of the model's previous prediction.

        Returns:
            Tensor of logits with shape (target_len - 1, batch, target_vocab_size).
        """
        current_batch_size, max_seq_len = y.shape
        # Follow the inputs' device rather than relying on a notebook global.
        run_device = x.device

        encoder_hidden = self.encoder.initHidden(current_batch_size).to(run_device)
        encoder_outputs, decoder_hidden = self.encoder(x, encoder_hidden)

        outputs = torch.zeros(
            max_seq_len - 1, current_batch_size, self.decoder.vocab_size, device=run_device
        )

        # Start free-running decoding from the start-of-sentence column of y
        # (not from index 0), so the first step sees the same input whether or
        # not teacher forcing is drawn.
        prev_word = y[:, 0]

        for i in range(max_seq_len - 1):
            context_vector, _ = self.DotProductAttention(encoder_outputs, decoder_hidden)

            use_ground_truth = random.random() < teacher_forcing
            decoder_input = y[:, i] if use_ground_truth else prev_word

            rnn_out, decoder_hidden = self.decoder(decoder_input.unsqueeze(1), decoder_hidden, context_vector)

            logits = self.AttentionFC(rnn_out)

            # Greedy choice, fed back on the next step when teacher forcing is not drawn.
            prev_word = logits.argmax(dim=-1).squeeze(1)

            outputs[i] = logits.squeeze(1)

        return outputs

In [None]:
hidden_dim = 1024  # GRU hidden size (per direction in the encoder)
hidden_layers = 2  # encoder GRU layers; Seq2Seq gives the decoder twice as many
embedding_dim = 512  # embedding size shared by encoder and decoder

dropout = 0.7  # dropout probability for embeddings and between GRU layers

In [None]:
# Build the translation model from the hyperparameters above and move it to the target device.
model = Seq2Seq(
    hidden_dim=hidden_dim,
    embedding_dim=embedding_dim,
    hidden_layers=hidden_layers,
    english_vocab=eng_vocab,
    spanish_vocab=spa_vocab,
    dropout=dropout,
)
model = model.to(device)

In [None]:
def init_weights(model, bound=0.08):
    """Re-initialize every parameter of ``model`` uniformly in [-bound, bound].

    Embedding rows registered as ``padding_idx`` are reset to zero afterwards,
    so padding tokens keep contributing a zero vector.

    Args:
        model: module whose parameters (including those of submodules) are re-initialized.
        bound: half-width of the uniform range; defaults to 0.08.
    """
    for _, parameter in model.named_parameters():
        nn.init.uniform_(parameter, -bound, bound)

    # The blanket uniform init above also overwrote the padding rows; zero them again.
    for module in model.modules():
        if isinstance(module, nn.Embedding) and module.padding_idx is not None:
            with torch.no_grad():
                module.weight[module.padding_idx].zero_()
        
# Module.apply calls init_weights on every submodule and then on `model` itself.
model.apply(init_weights)

In [None]:
# outputs = model.forward(x, y, teacher_forcing)

In [None]:
def configure_optimizers(model, lr=1e-3):
    """Create an Adam optimizer over all parameters of ``model`` with learning rate ``lr``."""
    optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
    return optimizer

In [None]:
def configure_sgd(model, lr=0.01):
    """Create a plain SGD optimizer over all parameters of ``model`` with learning rate ``lr``."""
    return torch.optim.SGD(params=model.parameters(), lr=lr)

In [None]:
# outputs = model.forward(x, y, teacher_forcing)

In [None]:
# Training starts with Adam at configure_optimizers' default learning rate (1e-3).
optimizer = configure_optimizers(model)

# lr = 0.01 # learning rate
# optimizer = torch.optim.SGD(model.parameters(), lr=lr)
#scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)


In [None]:
# Cross-entropy over target tokens; index 1 (the padding index used by the
# embedding layers) does not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=1)

In [None]:
# Gradient scaler shared by every train_step call (mixed-precision training).
scaler = GradScaler()

In [None]:
def train_step(model, optimizer, x, y, teacher_forcing = 0.5, clip=1.0, fp16=False):
    """Run one optimization step on a single batch and return its loss.

    Uses the module-level ``criterion`` and ``scaler``.

    Args:
        model: the sequence-to-sequence model to train.
        optimizer: optimizer holding ``model``'s parameters.
        x: batch-first tensor of source token ids.
        y: batch-first tensor of target token ids; ``y[:, 1:]`` is the prediction target.
        teacher_forcing: teacher-forcing probability forwarded to the model.
        clip: maximum gradient norm.
        fp16: whether the forward pass runs under autocast.

    Returns:
        The batch loss as a detached scalar tensor, so callers can log or
        accumulate it without holding on to the autograd graph.
    """
    model.train()
    optimizer.zero_grad(set_to_none=True)

    with autocast(enabled=fp16):
        outputs = model(x, y, teacher_forcing)
        # outputs is (steps, batch, vocab); the criterion is given (batch, vocab, steps).
        loss = criterion(outputs.permute(1, 2, 0), y[:, 1:])

    scaler.scale(loss).backward()

    # Gradients must be unscaled before clipping so `clip` applies to their true norm.
    scaler.unscale_(optimizer)
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

    scaler.step(optimizer)
    scaler.update()

    # Release gradient memory until the next step.
    optimizer.zero_grad(set_to_none=True)

    return loss.detach()

In [None]:
def evaluate(model, iterator):
    """Return the mean per-batch loss of ``model`` over ``iterator``.

    The model is put in eval mode, gradients are disabled and decoding runs
    with ``teacher_forcing=0``. Uses the module-level ``criterion`` and ``device``.
    """
    model.eval()

    batch_losses = []

    with torch.no_grad():
        for batch in iterator:
            x = batch.eng.to(device)
            y = batch.spa.to(device)

            outputs = model.forward(x, y, teacher_forcing=0)
            batch_loss = criterion(outputs.permute(1, 2, 0), y[:, 1:])

            batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(iterator)

In [None]:
epochs = 200 # For 10 hours

fp16=True  # run the training forward pass under autocast

gradient_clip = 6.0  # maximum gradient norm used when clipping in train_step
teacher_forcing = 0.3  # per-step probability of feeding the ground-truth token while training

In [None]:
# One TensorBoard run directory per launch, named after the current wall-clock time.
logdir = "attention-logs/{}".format(time.ctime())

experiment_description = "Attention Version: 0.2.1: With Reversed"

# NOTE(review): SummaryWriter documents `comment` as a suffix for the default
# log_dir; with an explicit log_dir it presumably has no effect — confirm.
writer = SummaryWriter(log_dir=logdir, comment=experiment_description)

# Store the run description as a text entry.
writer.add_text("Experiment", experiment_description)

# Hyperparameters of this run; 'lr' is read from the last param group of the
# optimizer that exists when this cell runs.
hparams = {'batch_size': batch_size, 
                    'hidden_dim': hidden_dim, 
                    'RNN hidden_layers': hidden_layers,
                    'embedding_dim': embedding_dim,
                    'dropout': dropout,
                    'lr': optimizer.param_groups[-1]['lr'],
                    'mixed precision': fp16,
                    'gradient clipping': gradient_clip,
                    'teacher forcing': teacher_forcing}

# Logged as a JSON string.
writer.add_text("hparams", json.dumps(hparams))

In [None]:
# Main training loop: train for `epochs` epochs, log losses to TensorBoard and
# keep a snapshot of the model with the lowest validation loss in `best_model`.
best_val_loss = float("inf")

best_model = None

n_iter = 0  # global batch counter, used as the x-axis of the training-loss curve

train_per_epoch = len(train_iter)

sgd_start_epoch = 6  # first (0-based) epoch trained with SGD instead of Adam

for epoch in range(epochs):

    epoch_loss = 0.0

    print("Epoch: {}, Started: {}".format(epoch+1, time.ctime()))
    print("---------------------------------------------------------")

    kbar = pkbar.Kbar(target=train_per_epoch, epoch = epoch, num_epochs = epochs, width = 8, always_stateful = False)

    # Switch optimizers once, instead of rebuilding the SGD optimizer every epoch.
    if epoch == sgd_start_epoch: ##Use SGD later in training
        optimizer = configure_sgd(model)

    for batch_IDX, batch in enumerate(train_iter):

        x, y = batch.eng, batch.spa

        x = x.to(device)
        y = y.to(device)

        # Convert to a Python float right away so logging and accumulation
        # never keep tensors (or their autograd history) alive.
        batch_loss = float(train_step(model, optimizer, x, y, teacher_forcing, gradient_clip, fp16))

        writer.add_scalar('Loss/Train', batch_loss, n_iter)
        epoch_loss += batch_loss

        kbar.update(batch_IDX, values = [('loss', batch_loss)])
        n_iter += 1

    val_loss = evaluate(model, val_iter)

    writer.add_scalar('Loss/Validation', val_loss, epoch)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # Snapshot the weights: a plain `best_model = model` would only alias the
        # live model, which keeps training, so "best" would always be the last epoch.
        best_model = copy.deepcopy(model)

    kbar.add(1, values= [('val_loss', val_loss)])

    print("---------------------------------------------------------")
    print("\n")
    


In [None]:
def generate_translation(model, sentence, max_len=10):
    """Greedily translate one English sentence and return the Spanish tokens.

    Uses the module-level ``SRC``, ``TRG``, ``eng_vocab``, ``spa_vocab`` and ``device``.

    Args:
        model: trained Seq2Seq model.
        sentence: raw English text.
        max_len: maximum number of target tokens to generate.

    Returns:
        List of target token strings in reading order, without the
        end-of-sentence marker.
    """
    model.eval()

    # Mirror the training-time input pipeline of the SRC field: tokenize and
    # lower-case via preprocess(), then wrap in the start/end markers.
    tokens = [SRC.init_token] + SRC.preprocess(sentence) + [SRC.eos_token]
    token_ids = [[eng_vocab.stoi.get(token, eng_vocab.unk_index) for token in tokens]]
    source = torch.LongTensor(token_ids).to(device)

    translation = []

    with torch.no_grad():
        encoder_hidden = model.encoder.initHidden(1).to(device)
        encoder_outputs, decoder_hidden = model.encoder(source, encoder_hidden)

        # Decoding starts from the target start-of-sentence token.
        word = torch.LongTensor([[spa_vocab.stoi[TRG.init_token]]]).to(device)

        for _step in range(max_len):
            # DotProductAttention returns (context, weights); only the context is passed on.
            context_vector, _weights = model.DotProductAttention(encoder_outputs, decoder_hidden)

            decoder_out, decoder_hidden = model.decoder(word, decoder_hidden, context_vector)

            logits = model.AttentionFC(decoder_out)

            word = logits.argmax(dim=-1)
            word_str = spa_vocab.itos[word.item()]

            # Stop at the end marker and keep it out of the returned tokens.
            if word_str == TRG.eos_token:
                break

            translation.append(word_str)

    # Target sentences are tokenized in reverse order (see tokenize_esp), so
    # flip the generated tokens back to reading order.
    return translation[::-1]

In [None]:
# Translate one sample passage with the model selected during training.
# generate_translation is called with its default max_len=10, so at most
# 10 target tokens are generated for this long input.
with torch.no_grad():
    sentence = generate_translation(best_model, "There are four main causes of alcohol-related death. Injury from car accidents or violence is one. Diseases like cirrhosis of the liver, cancer, heart and blood system diseases are the others.")
    print(' '.join(sentence))

In [None]:
# Mean per-batch loss of the selected model on the held-out test split.
evaluate(best_model, test_iter)