In [1]:
import torch 
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer

from torch.nn.utils.rnn import pad_sequence

from torch.cuda.amp import autocast

In [2]:
import time
import random
import os

In [3]:
import pandas as pd
import numpy as np

In [4]:
SEED = 42069

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [5]:
sos = '|<sos>|'
eos = '|<eos>|'
pad = '|<pad>|'
oov = '|<oov>|'

In [6]:
PAD_IDX = 2

In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [8]:
class Vocab(object):
    
    def __init__(self, VocabList, language):
        
        self.sos = sos
        self.eos = eos
        self.pad = pad
        self.oov = oov
        self.char2idx = {self.sos: 0, self.eos: 1, self.pad: 2, self.oov: 3}
        self.idx2char = [self.sos, self.eos, self.pad, self.oov]
        self.vocab_count = 4
        
        self.language = language
        self.build_tokenizer()
        self.build_vocab(VocabList)
        
        
    def build_tokenizer(self):
        if self.language == "english":
            self.tokenizer = get_tokenizer('spacy', language='en')
        elif self.language == "spanish":
            self.tokenizer = get_tokenizer('spacy', language='es_core_news_lg')   
    
    def tokenize(self, sentence):
        sentence = sentence.lower()
        return self.tokenizer(sentence)

    
    def add_word(self, word):
        if word not in self.char2idx:
            self.char2idx[word] = self.vocab_count
            self.idx2char.append(word)
            self.vocab_count +=1 
    
    def build_vocab(self, vocabList):
        for sentenceList in vocabList:
            sentenceTokens = self.tokenize(sentenceList)
            for token in sentenceTokens:
                self.add_word(token)

In [9]:
class TranslationDataset(Dataset):
    def __init__(self, datapath):
        
        """
        Source: English
        Target: Spanish
        """
        
        self.dataframe = pd.read_csv(datapath)
        self.english = self.dataframe['eng']
        self.spanish = self.dataframe['spa']
        
        self.englishVocab = Vocab(self.english.to_list(), language="english")
        self.spanishVocab = Vocab(self.spanish.to_list(), language="spanish")
        
    def __len__(self):
        return len(self.dataframe)
    
    
    def __getitem__(self, idx):
        
        eng = self.english[idx]
        spa = self.spanish[idx]
        
        eng = self.englishVocab.tokenize(eng)
        spa = self.spanishVocab.tokenize(spa) 
        #spa.reverse() ##Reversing word distance as stated in paper
        
        eng = [self.englishVocab.sos] + eng + [self.englishVocab.eos]
        spa = [self.spanishVocab.sos] + spa + [self.spanishVocab.eos]
        
        eng = torch.LongTensor([self.englishVocab.char2idx[token] for token in eng])
        
        spa = torch.LongTensor([self.spanishVocab.char2idx[token] for token in spa])
                
        return  eng, spa

In [10]:
%%time
dataset = TranslationDataset('./eng-spa.csv')

CPU times: user 7.86 s, sys: 267 ms, total: 8.12 s
Wall time: 8.32 s


In [11]:
def collate(data):
    eng = []
    spa = []
    
    for dat in data:
        eng.append(dat[0])
        spa.append(dat[1])
        
    eng = pad_sequence(eng, padding_value = PAD_IDX, batch_first=True)
    spa = pad_sequence(spa, padding_value = PAD_IDX, batch_first=True)
    
    return eng, spa

In [12]:
BATCH_SIZE = 128

In [13]:
dataloader = DataLoader(dataset, num_workers=18, batch_size=BATCH_SIZE, pin_memory=True, collate_fn=collate, shuffle=True)

In [14]:
class Encoder(nn.Module):
    
    def __init__(self, vocab, hidden_dim, encoder_embedding_dim, num_layers):
        
        super(Encoder, self).__init__() 
        
        self.vocab_size = vocab.vocab_count
        self.embedding_dim = encoder_embedding_dim
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        
        self.emb = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.embedding_dim)
        
        self.GRU = nn.GRU(self.embedding_dim, hidden_dim, num_layers=self.num_layers, dropout=0.5, batch_first=True)
        
        
    def forward(self, sentence, encoder_hidden_state):
        
        embedded = self.emb(sentence)
        
        out, hidden_state = self.GRU(embedded, encoder_hidden_state)
        
        return out, hidden_state
    
    def initHidden(self, BATCH_SIZE):
        return torch.zeros(self.num_layers, BATCH_SIZE ,self.hidden_dim)


In [15]:
class Decoder(nn.Module):
    
    def __init__(self, vocab, hidden_dim, embedding_dim, num_layers):
        
        super(Decoder, self).__init__()
        
        self.vocab_size = vocab.vocab_count
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.embedding_dim = embedding_dim
        
        self.GRU = nn.GRU(self.embedding_dim, hidden_dim, num_layers=self.num_layers, batch_first=True, dropout=0.5)
        
        self.embedding = nn.Embedding(self.vocab_size, embedding_dim=self.embedding_dim)
        
        self.fc = nn.Linear(hidden_dim, self.vocab_size)
        
    def forward(self, y, hidden_state):
        
        y = self.embedding(y)
        
        lstm_out, decoder_hidden_state = self.GRU(y, hidden_state)
        
        logits = self.fc(lstm_out)
                
        return logits, decoder_hidden_state

In [16]:
class Seq2Seq(nn.Module):
    def __init__(self, hidden_dim, embedding_dim, hidden_layers, english_vocab, spanish_vocab):
        super().__init__()
        
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.hidden_layers = hidden_layers
        
        self.eng_vocab = english_vocab
        self.spa_vocab = spanish_vocab
        
        self.encoder = Encoder(self.eng_vocab, hidden_dim, embedding_dim, hidden_layers)
        self.decoder = Decoder(self.spa_vocab, hidden_dim, embedding_dim, hidden_layers)
        
    
    def forward(self, x, y, teacher_forcing = 0):
        self.encoder.train()
        self.decoder.train()
        
        current_batch_size, max_seq_len = y.shape

        encoder_hidden = self.encoder.initHidden(current_batch_size).to(device)
        encoder_output, encoder_hidden = self.encoder.forward(x, encoder_hidden)

        decoder_hidden = encoder_hidden
        
        del encoder_hidden
        
        outputs = torch.zeros(size=(max_seq_len - 1, current_batch_size, self.decoder.vocab_size)).to(device)

        prev_word = torch.zeros_like(y[:, 0])

        for i in range(max_seq_len - 1):

            if random.random() < teacher_forcing: #Teacher forcing
                logits, decoder_hidden = self.decoder.forward(y[:, i].unsqueeze(1), decoder_hidden)
            else:
                logits, decoder_hidden = self.decoder.forward(prev_word.unsqueeze(1), decoder_hidden)  #Teacher forcing: Get random then pass i from y if > proba else pass previous scores: TODO

            prev_word = logits.argmax(dim=-1).squeeze(1)

            outputs[i] = logits.squeeze(1)
        
        return outputs

In [17]:
def configure_optimizers(model, lr=1e-5, weight_decay=0):
    return torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

In [18]:
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [19]:
model = Seq2Seq(hidden_dim=1024, embedding_dim=1024, hidden_layers=16, english_vocab=dataset.englishVocab, spanish_vocab=dataset.spanishVocab).to(device)

In [20]:
#optimizer = configure_optimizers(model)

lr = 5.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

In [21]:
def train_step(model, optimizer, x, y, teacher_forcing = 0.5, clip=5):
    
    model.train()
    
    optimizer.zero_grad()
    
    with autocast():
        outputs = model.forward(x, y, teacher_forcing)
        loss = criterion(outputs.permute(1, 2, 0), y[:, 1:])
        
    loss.backward()
    
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

    optimizer.step()
        
    return loss

In [22]:
epochs = 5
print_every = 100

In [23]:
for epoch in range(epochs):
    
    epoch_loss = 0
    
    print("Epoch: {}, Started: {}".format(epoch+1, time.ctime()))
    print("---------------------------------------------------------")

    
    for batch_IDX, batch in enumerate(dataloader):    
        
        x, y = batch

        x = x.to(device)
        y = y.to(device)

        batch_loss = train_step(model, optimizer, x, y)

        epoch_loss += batch_loss

        if batch_IDX % print_every == 0:
            print("Epoch: {}, Batch: {},   Batch Loss: {:.4f} ".format(epoch+1, batch_IDX, batch_loss))
    
    scheduler.step()
    
    print("\n")
    print("Epoch: {}, Mean Epoch Loss: {:.4f}".format(epoch+1, epoch_loss / len(dataloader)))
    print("---------------------------------------------------------")
    print("\n")
    


Epoch: 1, Started: Thu Dec 17 07:43:23 2020
---------------------------------------------------------
Epoch: 1, Batch: 0,   Batch Loss: 10.1989 
Epoch: 1, Batch: 100,   Batch Loss: 15.1768 
Epoch: 1, Batch: 200,   Batch Loss: 34.8389 
Epoch: 1, Batch: 300,   Batch Loss: 24.1492 
Epoch: 1, Batch: 400,   Batch Loss: 19.7926 
Epoch: 1, Batch: 500,   Batch Loss: 27.5946 
Epoch: 1, Batch: 600,   Batch Loss: 23.1040 
Epoch: 1, Batch: 700,   Batch Loss: 20.4024 
Epoch: 1, Batch: 800,   Batch Loss: 34.9231 
Epoch: 1, Batch: 900,   Batch Loss: 19.4727 


Epoch: 1, Mean Epoch Loss: 24.3179
---------------------------------------------------------


Epoch: 2, Started: Thu Dec 17 07:49:23 2020
---------------------------------------------------------
Epoch: 2, Batch: 0,   Batch Loss: 20.7104 
Epoch: 2, Batch: 100,   Batch Loss: 17.1363 
Epoch: 2, Batch: 200,   Batch Loss: 13.0389 


KeyboardInterrupt: 

In [26]:
def generate_translation(encoder, decoder, sentence, max_len=50):
    
    sentence = dataset.englishVocab.tokenize(sentence)
    sentence = [[dataset.englishVocab.char2idx.get(token, dataset.englishVocab.char2idx[oov]) for token in sentence]]
    sentence = torch.LongTensor(sentence).to(device)
    
    encoder_hidden = encoder.initHidden(1).to(device)
    _, encoder_hidden = encoder(sentence, encoder_hidden)
    
    decoder_hidden = encoder_hidden
    
    word = [[dataset.spanishVocab.char2idx[sos]]]
    word = torch.LongTensor(word).to(device)
    
    translation = []
    
    i = 0
    word_str = None
    while i < max_len and word_str != eos:
        
        decoder_out, decoder_hidden = decoder(word, decoder_hidden)
        
        word = decoder_out.argmax(dim=-1)
        print(word)
        word_str = dataset.spanishVocab.idx2char[decoder_out.argmax().item()]
        translation.append(word_str)
        i += 1
    
    return translation

In [29]:
with torch.autograd.no_grad():
    print(" ".join(generate_translation(model.encoder, model.decoder, "do you have something to say?")))

tensor([[229]], device='cuda:0')
tensor([[128]], device='cuda:0')
tensor([[121]], device='cuda:0')
tensor([[51]], device='cuda:0')
tensor([[118]], device='cuda:0')
tensor([[1]], device='cuda:0')
el qué la , en |<eos>|
