In [1]:
import torch
import torch.nn as nn

import torch.nn.functional as F

from torch.cuda.amp import autocast

In [2]:
from torchtext.data import BucketIterator, Field, TabularDataset
from torchtext.datasets import Multi30k

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
import random

import time

In [5]:
import spacy

In [6]:
SEED = 42069

random.seed(SEED)
# np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [8]:
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

In [9]:
def tokenize_de(text):
    #return [tok.text for tok in spacy_esp.tokenizer(text)]
    return [tok.text for tok in spacy_de.tokenizer(text)][::-1] ##Reversing as stated in seq2seq paper

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [10]:
SRC = Field(sequential = True, 
            tokenize = tokenize_en, 
            init_token = "<sos>",
            eos_token = "<eos>",
            lower=True,
            batch_first=True)

In [11]:
# !python -m spacy download es_core_news_lg

In [12]:
TRG = Field(sequential = True,
           tokenize=tokenize_de,
           init_token='<sos>',
           eos_token='<eos>',
           lower=True,
           batch_first=True)

In [13]:
datasetfields = [("eng", SRC),("de", TRG)]

In [14]:
train_data, val_data, test_data = Multi30k.splits(exts = ('.en', '.de'), 
                                                    fields = (SRC, TRG))


In [15]:
SRC.build_vocab(train_data, min_freq=1)
TRG.build_vocab(train_data, min_freq=1)

In [16]:
eng_vocab = SRC.vocab
de_vocab = TRG.vocab

In [17]:
batch_size = 32

In [18]:
train_iter, val_iter, test_iter = BucketIterator.splits((train_data, val_data, test_data), batch_size=batch_size, sort_key=lambda x: len(x.src))

In [19]:
class Encoder(nn.Module):
    
    def __init__(self, vocab, hidden_dim, encoder_embedding_dim, num_layers, dropout):
        
        super(Encoder, self).__init__() 
        
        self.vocab_size = len(vocab.itos)
        self.embedding_dim = encoder_embedding_dim
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        
        self.emb = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.embedding_dim, padding_idx=1)
        
        self.GRU = nn.GRU(self.embedding_dim, hidden_dim, num_layers=self.num_layers, dropout=dropout, batch_first=True)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, sentence):
        
        embedded = self.dropout(self.emb(sentence))
        
        out, hidden_state = self.GRU(embedded)
        
        return out, hidden_state
    
    def initHidden(self, BATCH_SIZE):
        return torch.zeros(self.num_layers, BATCH_SIZE ,self.hidden_dim)


In [20]:
class Decoder(nn.Module):
    
    def __init__(self, vocab, hidden_dim, embedding_dim, num_layers, dropout):
        
        super(Decoder, self).__init__()
        
        self.vocab_size = len(vocab.itos)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.embedding_dim = embedding_dim
        
        self.GRU = nn.GRU(self.embedding_dim, hidden_dim, num_layers=self.num_layers, batch_first=True, dropout=dropout)
        
        self.embedding = nn.Embedding(self.vocab_size, embedding_dim=self.embedding_dim, padding_idx=1)
        
        self.fc = nn.Linear(hidden_dim, self.vocab_size)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, y, hidden_state):
        
        y = self.dropout(self.embedding(y))
        
        lstm_out, decoder_hidden_state = self.GRU(y, hidden_state)
        
        logits = self.fc(lstm_out)
                
        return logits, decoder_hidden_state

In [21]:
class Seq2Seq(nn.Module):
    def __init__(self, hidden_dim, embedding_dim, hidden_layers, english_vocab, german_vocab, dropout):
        super().__init__()
        
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.hidden_layers = hidden_layers
        
        self.eng_vocab = english_vocab
        self.spa_vocab = german_vocab
        
        self.encoder = Encoder(self.eng_vocab, hidden_dim, embedding_dim, hidden_layers, dropout)
        self.decoder = Decoder(self.spa_vocab, hidden_dim, embedding_dim, hidden_layers, dropout)
        
    
    def forward(self, x, y, teacher_forcing = 0):
        self.encoder.train()
        self.decoder.train()
        
        current_batch_size, max_seq_len = y.shape

        #encoder_hidden = self.encoder.initHidden(current_batch_size).to(device)
        #encoder_output, encoder_hidden = self.encoder.forward(x, encoder_hidden)
        
        encoder_output, encoder_hidden = self.encoder.forward(x)
        decoder_hidden = encoder_hidden
        
        del encoder_hidden
        
        outputs = torch.zeros(size=(max_seq_len - 1, current_batch_size, self.decoder.vocab_size)).to(device)

        prev_word = torch.zeros_like(y[:, 0])

        for i in range(max_seq_len - 1):

            if random.random() < teacher_forcing: #Teacher forcing
                logits, decoder_hidden = self.decoder.forward(y[:, i].unsqueeze(1), decoder_hidden)
            else:
                logits, decoder_hidden = self.decoder.forward(prev_word.unsqueeze(1), decoder_hidden)  #Teacher forcing: Get random then pass i from y if > proba else pass previous scores: TODO

            prev_word = logits.argmax(dim=-1).squeeze(1)

            outputs[i] = logits.squeeze(1)
        
        return outputs

In [22]:
hidden_dim = 2048
hidden_layers = 4
embedding_dim = 2048

dropout = 0.7

In [23]:
model = Seq2Seq(hidden_dim=hidden_dim, embedding_dim=embedding_dim, hidden_layers=hidden_layers, english_vocab=eng_vocab, german_vocab=de_vocab, dropout=dropout).to(device)

In [24]:
def configure_optimizers(model, lr=1e-5, weight_decay=0):
    return torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

In [25]:
#optimizer = configure_optimizers(model)

lr = 2.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)


In [26]:
criterion = nn.CrossEntropyLoss(ignore_index=1)

In [27]:
def train_step(model, optimizer, x, y, teacher_forcing = 0.5, clip=5.0, fp16=True):
    
    #torch.cuda.empty_cache()
    
    model.train()
    
    optimizer.zero_grad()
    
    with autocast(fp16):
        outputs = model.forward(x, y, teacher_forcing)
        loss = criterion(outputs.permute(1, 2, 0), y[:, 1:])
        
    loss.backward()
    
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

    optimizer.step()
        
    return loss

In [28]:
def evaluate(model, iterator):
    
    model.eval()
    
    eval_loss = 0
    
    with torch.no_grad():
        
        for _, batch in enumerate(iterator):
            
            x, y = batch.src, batch.trg

            x = x.to(device)
            y = y.to(device)

            outputs = model.forward(x, y, teacher_forcing=0)
            loss = criterion(outputs.permute(1, 2, 0), y[:, 1:]) 
            
            eval_loss += loss.item()
            
    return eval_loss / len(iterator)

In [29]:
epochs = 50 # For 10 hours
print_every = 100

fp16=False

In [30]:
best_val_loss = float("inf")

best_model = None


for epoch in range(epochs):
    
    epoch_loss = 0
    
    print("Epoch: {}, Started: {}".format(epoch+1, time.ctime()))
    print("---------------------------------------------------------")

    
    for batch_IDX, batch in enumerate(train_iter):    
        
        x, y = batch.src, batch.trg
        
        x = x.to(device)
        y = y.to(device)

        batch_loss = train_step(model, optimizer, x, y, fp16)

        epoch_loss += batch_loss
        
        #if epoch == 0 or batch_IDX < 3:
        #    torch.cuda.empty_cache()
        
        if batch_IDX % print_every == 0 and batch_IDX != 0:
            print("Epoch: {}, Batch: {},   Batch Loss: {:.4f}".format(epoch+1, batch_IDX, batch_loss))
    
    
    val_loss = evaluate(model, val_iter)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()
    
    print("\n")
    print("Epoch: {}, Mean Epoch Loss: {:.4f}, Valid Loss: {:.4f}, Learning Rate: {:.4f} ".format(epoch+1, epoch_loss / len(train_iter), val_loss, scheduler.get_lr()[0]))
    print("---------------------------------------------------------")
    print("\n")
    
    

Epoch: 1, Started: Mon Dec 21 07:24:41 2020
---------------------------------------------------------
Epoch: 1, Batch: 300,   Batch Loss: 26.7632
Epoch: 1, Batch: 600,   Batch Loss: 16.7126
Epoch: 1, Batch: 900,   Batch Loss: 9.7947


Epoch: 1, Mean Epoch Loss: 24.5180, Valid Loss: 10.2377, Learning Rate: 1.8050 
---------------------------------------------------------


Epoch: 2, Started: Mon Dec 21 07:28:36 2020
---------------------------------------------------------
Epoch: 2, Batch: 300,   Batch Loss: 8.2637
Epoch: 2, Batch: 600,   Batch Loss: 7.6791
Epoch: 2, Batch: 900,   Batch Loss: 6.9834


Epoch: 2, Mean Epoch Loss: 8.0557, Valid Loss: 7.6885, Learning Rate: 1.7147 
---------------------------------------------------------


Epoch: 3, Started: Mon Dec 21 07:32:27 2020
---------------------------------------------------------
Epoch: 3, Batch: 300,   Batch Loss: 6.7800
Epoch: 3, Batch: 600,   Batch Loss: 5.9203
Epoch: 3, Batch: 900,   Batch Loss: 6.1028


Epoch: 3, Mean Epoch 

KeyboardInterrupt: 

In [None]:
def generate_translation(encoder, decoder, sentence, max_len=30):
    
    encoder.eval()
    decoder.eval()
    
    sentence = SRC.tokenize(sentence)
    sentence = [[eng_vocab.stoi.get(token, eng_vocab.unk_index) for token in sentence]]
    sentence = torch.LongTensor(sentence).to(device)
    
    #encoder_hidden = encoder.initHidden(1).to(device)
    #_, encoder_hidden = encoder(sentence, encoder_hidden)
    _, encoder_hidden = encoder(sentence)

    decoder_hidden = encoder_hidden
    
    del encoder_hidden
    
    word = [[2]]
    word = torch.LongTensor(word).to(device)
    
    translation = []
    
    i = 0
    word_str = None
    while i < max_len and word_str != '<eos>':
        
        decoder_out, decoder_hidden = decoder(word, decoder_hidden)
        
        word = decoder_out.argmax(dim=-1)
        word_str = spa_vocab.itos[word.item()]
        translation.append(word_str)
        i += 1
    
    return translation[::-1][1:]

In [None]:
with torch.no_grad():
    sentence = generate_translation(best_model.encoder, best_model.decoder, "please")
    print(' '.join(sentence))

In [None]:
evaluate(model, test_iter)