In [None]:
import copy

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import autocast, GradScaler
from torch.utils.tensorboard import SummaryWriter

In [None]:
from torchtext.legacy.data import BucketIterator, Field, TabularDataset

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import random
import pkbar
import time

In [None]:
import spacy

In [None]:
SEED = 42069

# Seed every random number generator this notebook draws from so that a
# Restart & Run All reproduces the same data split, initial weights and
# teacher-forcing decisions.
random.seed(SEED)
# np.random.seed(SEED)  # numpy is not imported here; enable if it is ever used
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # covers every visible GPU, not only the current one

# Deterministic cuDNN kernels only help if the auto-tuner is also disabled;
# with benchmark mode on, cuDNN may pick a different algorithm from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
scaler = GradScaler()

In [None]:
spacy_esp = spacy.load('es_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

In [None]:
def tokenize_esp(text):
    """Tokenize ``text`` with the ``spacy_esp`` tokenizer and return the tokens in reverse order.

    The token strings are produced left-to-right and the list is then flipped,
    so the last token of the sentence comes first.

    NOTE(review): the reversal trick in the seq2seq paper is described for the
    *source* sentence; in this notebook this tokenizer is attached to the
    target field (TRG) -- confirm that is intended.
    """
    tokens = [tok.text for tok in spacy_esp.tokenizer(text)]
    tokens.reverse()
    return tokens

def tokenize_en(text):
    """Tokenize ``text`` with the ``spacy_en`` tokenizer and return the token strings in order."""
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]

In [None]:
# Source-side field (bound to the "eng" column further down): text is split by
# tokenize_en, lower-cased, and wrapped in <sos> ... <eos>. batch_first=True
# makes batches come out as (batch, sequence) tensors.
SRC = Field(sequential = True, 
            tokenize = tokenize_en, 
            init_token = "<sos>",
            eos_token = "<eos>",
            lower=True,
            batch_first=True)

In [None]:
# !python -m spacy download es_core_news_lg

In [None]:
# Target-side field (bound to the "spa" column further down): text is split by
# tokenize_esp -- which returns the tokens in REVERSED order -- then lower-cased
# and wrapped in <sos> ... <eos>. batch_first=True makes batches come out as
# (batch, sequence) tensors.
TRG = Field(sequential = True,
           tokenize=tokenize_esp,
           init_token='<sos>',
           eos_token='<eos>',
           lower=True,
           batch_first=True)

In [None]:
datasetfields = [("eng", SRC),("spa", TRG)]

In [None]:
data =  TabularDataset('./eng-spa.csv', format='csv', fields=datasetfields, skip_header=True)

In [None]:
train_data, val_data, test_data = data.split([0.8, 0.1, 0.1])

In [None]:
SRC.build_vocab(train_data, min_freq=1)
TRG.build_vocab(train_data, min_freq=1)

In [None]:
eng_vocab = SRC.vocab
spa_vocab = TRG.vocab

In [None]:
batch_size = 128

In [None]:
# One iterator per split, all using the same batch size. The sort key is the
# number of English tokens in an example, i.e. bucketing is driven by source length.
# NOTE(review): no device is passed here; batches are moved with .to(device) by the consumers.
train_iter, val_iter, test_iter = BucketIterator.splits((train_data, val_data, test_data), batch_size=batch_size, sort_key=lambda x: len(x.eng))

In [None]:
class Encoder(nn.Module):
    """Embeds source token ids and encodes them with a stacked, batch-first GRU."""

    def __init__(self, vocab, hidden_dim, encoder_embedding_dim, num_layers, dropout):
        """Build the embedding table, the GRU stack and the dropout layer.

        Args:
            vocab: object exposing ``itos``; its length sets the embedding table size.
            hidden_dim: GRU hidden size.
            encoder_embedding_dim: size of each token embedding.
            num_layers: number of stacked GRU layers.
            dropout: probability used both between GRU layers and on the embeddings.
        """
        super().__init__()

        self.vocab_size = len(vocab.itos)
        self.embedding_dim = encoder_embedding_dim
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim

        # Index 1 is treated as padding by the embedding table.
        self.emb = nn.Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim=self.embedding_dim,
            padding_idx=1,
        )
        self.GRU = nn.GRU(
            self.embedding_dim,
            hidden_dim,
            num_layers=self.num_layers,
            dropout=dropout,
            batch_first=True,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, sentence):
        """Encode a batch of token ids.

        Args:
            sentence: integer tensor of token ids, batch dimension first.

        Returns:
            The ``(output, hidden_state)`` pair produced by the GRU.
        """
        token_vectors = self.emb(sentence)
        token_vectors = self.dropout(token_vectors)
        out, hidden_state = self.GRU(token_vectors)
        return out, hidden_state

    def initHidden(self, BATCH_SIZE):
        """Return an all-zero tensor of shape ``(num_layers, BATCH_SIZE, hidden_dim)``."""
        shape = (self.num_layers, BATCH_SIZE, self.hidden_dim)
        return torch.zeros(*shape)


In [None]:
class Decoder(nn.Module):
    """Embeds target token ids, advances a stacked GRU and projects to vocabulary logits."""

    def __init__(self, vocab, hidden_dim, embedding_dim, num_layers, dropout):
        """Build the GRU stack, embedding table, output projection and dropout layer.

        Args:
            vocab: object exposing ``itos``; its length sets the vocabulary size.
            hidden_dim: GRU hidden size (also the input size of the projection).
            embedding_dim: size of each token embedding.
            num_layers: number of stacked GRU layers.
            dropout: probability used both between GRU layers and on the embeddings.
        """
        super().__init__()

        self.vocab_size = len(vocab.itos)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.embedding_dim = embedding_dim

        self.GRU = nn.GRU(
            self.embedding_dim,
            hidden_dim,
            num_layers=self.num_layers,
            batch_first=True,
            dropout=dropout,
        )
        # Index 1 is treated as padding by the embedding table.
        self.embedding = nn.Embedding(
            self.vocab_size,
            embedding_dim=self.embedding_dim,
            padding_idx=1,
        )
        self.fc = nn.Linear(hidden_dim, self.vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, y, hidden_state):
        """Run the decoder over ``y`` starting from ``hidden_state``.

        Args:
            y: integer tensor of token ids, batch dimension first.
            hidden_state: initial GRU hidden state.

        Returns:
            ``(logits, decoder_hidden_state)`` where ``logits`` is the linear
            projection of the GRU outputs onto the vocabulary.
        """
        embedded = self.embedding(y)
        embedded = self.dropout(embedded)
        gru_out, decoder_hidden_state = self.GRU(embedded, hidden_state)
        return self.fc(gru_out), decoder_hidden_state

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper: encodes ``x`` once, then decodes one target step at a time."""

    def __init__(self, hidden_dim, embedding_dim, hidden_layers, english_vocab, spanish_vocab, dropout):
        """Build the encoder over ``english_vocab`` and the decoder over ``spanish_vocab``.

        Both halves share ``hidden_dim``, ``embedding_dim``, ``hidden_layers`` and
        ``dropout`` so the encoder's final hidden state can seed the decoder directly.
        """
        super().__init__()

        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.hidden_layers = hidden_layers

        self.eng_vocab = english_vocab
        self.spa_vocab = spanish_vocab

        self.encoder = Encoder(self.eng_vocab, hidden_dim, embedding_dim, hidden_layers, dropout)
        self.decoder = Decoder(self.spa_vocab, hidden_dim, embedding_dim, hidden_layers, dropout)

    def forward(self, x, y, teacher_forcing = 0):
        """Decode ``y.shape[1] - 1`` steps and return the logits of every step.

        Args:
            x: source token ids, shape ``(batch, src_len)``.
            y: target token ids, shape ``(batch, trg_len)``; column 0 is the
                start-of-sequence token and is used as the first decoder input.
            teacher_forcing: probability, drawn independently at every step, of
                feeding the ground-truth token ``y[:, i]`` instead of the model's
                own previous prediction.

        Returns:
            Tensor of shape ``(trg_len - 1, batch, vocab_size)``; ``outputs[i]``
            holds the logits that should predict ``y[:, i + 1]``.
        """
        # The train/eval mode is deliberately NOT touched here: the caller decides
        # it via model.train() / model.eval(). Forcing train mode in forward would
        # keep dropout active during validation and testing.
        current_batch_size, max_seq_len = y.shape

        # Only the final hidden state is needed; it becomes the decoder's initial state.
        _, decoder_hidden = self.encoder(x)

        outputs = torch.zeros(
            max_seq_len - 1,
            current_batch_size,
            self.decoder.vocab_size,
            device=x.device,
        )

        # Decoding always starts from the start-of-sequence column of y, exactly
        # like inference does, whether or not the first step is teacher-forced.
        prev_word = y[:, 0]

        for i in range(max_seq_len - 1):
            use_ground_truth = random.random() < teacher_forcing
            decoder_input = y[:, i] if use_ground_truth else prev_word

            logits, decoder_hidden = self.decoder(decoder_input.unsqueeze(1), decoder_hidden)

            # Greedy pick, kept for the next step in case it is not teacher-forced.
            prev_word = logits.argmax(dim=-1).squeeze(1)
            outputs[i] = logits.squeeze(1)

        return outputs

In [None]:
# Model hyper-parameters, passed to Seq2Seq when the model is built below.
hidden_dim = 1024     # GRU hidden size
hidden_layers = 2     # number of stacked GRU layers
embedding_dim = 512   # token embedding size

# Dropout probability handed to the model.
# NOTE(review): 0.7 is an aggressive value -- confirm it is intentional.
dropout = 0.7

In [None]:
model = Seq2Seq(hidden_dim=hidden_dim, embedding_dim=embedding_dim, hidden_layers=hidden_layers, english_vocab=eng_vocab, spanish_vocab=spa_vocab, dropout=dropout).to(device)

In [None]:
def init_weights(model):
    """Overwrite every parameter reachable from ``model`` with samples from U(-0.08, 0.08).

    The parameters are modified in place; nothing is returned.
    """
    low, high = -0.08, 0.08
    for _, weight in model.named_parameters():
        nn.init.uniform_(weight.data, low, high)
        
# init_weights already walks every parameter of the module it is given, so one
# direct call is enough. Routing it through Module.apply would invoke it again
# for each submodule and re-sample the same parameters several times.
init_weights(model)
model

In [None]:
def configure_optimizers(model, lr=1e-5, weight_decay=0):
    """Create an Adam optimizer over all parameters of ``model``.

    Args:
        model: module whose ``parameters()`` are optimized.
        lr: learning rate.
        weight_decay: weight-decay coefficient forwarded to Adam.

    Returns:
        A ``torch.optim.Adam`` instance.
    """
    trainable = model.parameters()
    adam = torch.optim.Adam(trainable, lr=lr, weight_decay=weight_decay)
    return adam

In [None]:
optimizer = configure_optimizers(model)

# lr = 2.0 # learning rate
# optimizer = torch.optim.SGD(model.parameters(), lr=lr)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)


In [None]:
# Token-level cross-entropy; targets equal to 1 are excluded from the loss.
# NOTE(review): 1 is presumably the <pad> index of the torchtext vocab (it matches
# padding_idx=1 used by the embedding layers) -- confirm against TRG.vocab.stoi.
criterion = nn.CrossEntropyLoss(ignore_index=1)

In [None]:
logdir = "logs/seq2seqvanilla/{}".format(time.ctime())
writer = SummaryWriter(log_dir=logdir)

In [None]:
def train_step(model, optimizer, x, y, teacher_forcing = 0.8, clip=5.0, fp16=True):
    """Run one optimisation step on a single batch and return its loss.

    Args:
        model: seq2seq module called as ``model(x, y, teacher_forcing)``.
        optimizer: optimizer that owns ``model``'s parameters.
        x: source batch of token ids.
        y: target batch of token ids, batch first; ``y[:, 1:]`` is the prediction target.
        teacher_forcing: probability forwarded to the model's forward pass.
        clip: maximum gradient norm applied before the optimizer step.
        fp16: whether the forward pass and loss run under autocast.

    Returns:
        The batch loss as a 0-dim tensor detached from the autograd graph, so
        callers can log or accumulate it without keeping the graph alive.
    """
    model.train()

    optimizer.zero_grad(set_to_none=True)

    with autocast(enabled=fp16):
        outputs = model(x, y, teacher_forcing)
        # The model output is indexed (step, batch, vocab); the loss wants the
        # class dimension second, hence the permute to (batch, vocab, step).
        loss = criterion(outputs.permute(1, 2, 0), y[:, 1:])

    scaler.scale(loss).backward()

    # Gradients must be unscaled before clipping, otherwise the threshold
    # would be compared against scaled gradient norms.
    scaler.unscale_(optimizer)
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

    scaler.step(optimizer)
    scaler.update()

    # Release gradient memory right away instead of holding it until the next step.
    optimizer.zero_grad(set_to_none=True)

    return loss.detach()

In [None]:
def evaluate(model, iterator):
    """Return the mean per-batch loss of ``model`` over ``iterator``.

    The model is switched with ``model.eval()``, gradients are disabled, and
    every batch is decoded with ``teacher_forcing=0``. Each batch must expose
    ``eng`` (source) and ``spa`` (target) tensors. The CUDA cache is emptied
    before returning.
    """
    model.eval()

    total_loss = 0

    with torch.no_grad():
        for batch in iterator:
            source = batch.eng.to(device)
            target = batch.spa.to(device)

            predictions = model.forward(source, target, teacher_forcing=0)
            batch_loss = criterion(predictions.permute(1, 2, 0), target[:, 1:])

            total_loss += batch_loss.item()

    torch.cuda.empty_cache()
    return total_loss / len(iterator)

In [None]:
epochs = 100 # For 10 hours
fp16=True

In [None]:
# Training driver: one pass over train_iter per epoch, followed by a validation
# pass; the weights with the lowest validation loss are snapshotted in best_model.
best_val_loss = float("inf")

best_model = None

n_iter = 0  # global step counter used as the x-axis of the training-loss curve

train_per_epoch = len(train_iter)

for epoch in range(epochs):
    torch.cuda.empty_cache()
    epoch_loss = 0.0

    print("Epoch: {}, Started: {}".format(epoch+1, time.ctime()))
    print("---------------------------------------------------------")

    kbar = pkbar.Kbar(target=train_per_epoch, epoch = epoch, num_epochs = epochs, width = 8, always_stateful = False)

    for batch_IDX, batch in enumerate(train_iter):

        x = batch.eng.to(device)
        y = batch.spa.to(device)

        # fp16 has to be passed by keyword: the fifth positional parameter of
        # train_step is teacher_forcing, so a positional True would silently
        # turn into "always teacher-force" and leave the 0.8 default unused.
        # float() turns the returned loss into a plain number so nothing
        # autograd-related is accumulated or handed to the loggers.
        batch_loss = float(train_step(model, optimizer, x, y, fp16=fp16))

        writer.add_scalar('Loss/Train', batch_loss, n_iter)

        epoch_loss += batch_loss

        kbar.update(batch_IDX, values = [('loss', batch_loss)])

        n_iter += 1

    val_loss = evaluate(model, val_iter)

    writer.add_scalar('Loss/Validation', val_loss, epoch)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # A deep copy is required: a plain assignment would only alias `model`,
        # which keeps training, so best_model would always be the last epoch.
        best_model = copy.deepcopy(model)

    kbar.add(1, values= [('val_loss', val_loss)])

    print("\n")
    print("---------------------------------------------------------")
    print("\n")
    
    

In [None]:
def generate_translation(encoder, decoder, sentence, max_len=30):
    """Greedily translate one English sentence and return the Spanish tokens.

    Args:
        encoder: trained encoder module.
        decoder: trained decoder module.
        sentence: raw English text.
        max_len: maximum number of decoder steps.

    Returns:
        List of target-language token strings in natural reading order, without
        the end-of-sequence marker. If ``<eos>`` is never produced, all
        ``max_len`` generated tokens are returned.
    """
    encoder.eval()
    decoder.eval()

    # Mirror what the SRC field does to training examples: tokenize + lower-case
    # (preprocess) and wrap in the init/eos markers. Feeding raw-cased text
    # without the markers would not match what the encoder saw in training.
    tokens = [SRC.init_token] + SRC.preprocess(sentence) + [SRC.eos_token]
    token_ids = [[eng_vocab.stoi.get(token, eng_vocab.unk_index) for token in tokens]]
    source = torch.LongTensor(token_ids).to(device)

    # Look the start token up instead of hard-coding its index.
    word = torch.LongTensor([[spa_vocab.stoi['<sos>']]]).to(device)

    translation = []

    with torch.no_grad():
        _, decoder_hidden = encoder(source)

        for _ in range(max_len):
            decoder_out, decoder_hidden = decoder(word, decoder_hidden)

            word = decoder_out.argmax(dim=-1)
            word_str = spa_vocab.itos[word.item()]
            if word_str == '<eos>':
                break
            translation.append(word_str)

    # The target side is tokenized in reverse order, so the decoder emits the
    # sentence back to front; flip it for reading. <eos> is never appended
    # above, so no real token is lost when decoding stops at max_len.
    return translation[::-1]

In [None]:
with torch.no_grad():
    sentence = generate_translation(best_model.encoder, best_model.decoder, "fuck you")
    print(' '.join(sentence))

In [None]:
# Report the held-out loss for the checkpoint selected on validation loss,
# not for whatever weights the final epoch happened to end on.
evaluate(best_model, test_iter)