In [None]:
import torch.nn as nn
import numpy as np
import torch
import unicodedata
import string
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm_notebook
import re

# Prefer the GPU when one is available, but fall back to the CPU so the
# notebook can still be executed on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
class SelfAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The embedding axis is split into ``heads`` chunks of ``heads_dim``
    features. The same three bias-free ``heads_dim -> heads_dim`` projections
    (values, keys, queries) are applied to every chunk; the per-head results
    are concatenated again and passed through ``fc_out``.

    Args:
        embed_size: size of the embedding axis of the inputs.
        heads: number of attention heads; must divide ``embed_size``.
    """

    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        # Embedding size of the input.
        self.embed_size = embed_size
        # The embedding is processed in ``heads`` parallel slices.
        self.heads = heads
        self.heads_dim = embed_size // heads

        assert (self.heads_dim * heads == embed_size), "Embedding size not divisible by heads"

        self.values = nn.Linear(self.heads_dim, self.heads_dim, bias=False)   # V * Wv
        self.keys = nn.Linear(self.heads_dim, self.heads_dim, bias=False)     # K * Wk
        self.queries = nn.Linear(self.heads_dim, self.heads_dim, bias=False)  # Q * Wq

        # Mixes the concatenated head outputs back into the embedding space.
        self.fc_out = nn.Linear(heads * self.heads_dim, self.embed_size)

    def forward(self, values, keys, query, mask):
        """Attend from ``query`` over ``keys``/``values``.

        Args:
            values: tensor reshaped here to (N, value_len, heads, heads_dim).
            keys: tensor reshaped here to (N, key_len, heads, heads_dim).
            query: tensor reshaped here to (N, query_len, heads, heads_dim).
            mask: ``None`` or a tensor broadcastable to
                (N, heads, query_len, key_len); positions where ``mask == 0``
                are excluded from the softmax.

        Returns:
            Tensor of shape (N, query_len, embed_size).
        """
        # Batch size of the input.
        N = query.shape[0]

        # Sequence lengths may differ between the three inputs.
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        # Split the embedding axis into heads, then project every head.
        values = self.values(values.reshape(N, value_len, self.heads, self.heads_dim))
        keys = self.keys(keys.reshape(N, key_len, self.heads, self.heads_dim))
        query = self.queries(query.reshape(N, query_len, self.heads, self.heads_dim))

        # query: (N, query_len, heads, heads_dim)
        # keys:  (N, key_len, heads, heads_dim)
        # score: (N, heads, query_len, key_len) - one score per query/key pair
        score = torch.einsum("nqhd,nkhd->nhqk", [query, keys])

        if mask is not None:
            # A very negative score becomes a zero weight after the softmax.
            score = score.masked_fill(mask == 0, float("-1e20"))

        # Scale by the square root of the per-head key dimension. The dot
        # product above is taken over ``heads_dim`` features, so that is the
        # dimension whose growth has to be compensated (not ``embed_size``).
        attention = torch.softmax(score / (self.heads_dim ** 0.5), dim=3)

        # attention: (N, heads, query_len, key_len)
        # values:    (N, value_len, heads, heads_dim), value_len == key_len
        # result:    (N, query_len, heads, heads_dim) -> heads concatenated
        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(
            N, query_len, self.heads * self.heads_dim
        )

        return self.fc_out(out)
    

In [None]:
class TransformerBlock(nn.Module):
    """Attention sub-layer followed by a feed-forward sub-layer.

    Each sub-layer output is added to its input, layer-normalised and then
    passed through dropout.

    Args:
        embed_size: size of the embedding axis.
        heads: number of attention heads handed to ``SelfAttention``.
        dropout: dropout probability applied after each normalisation.
        forward_expansion: multiplier for the hidden width of the feed-forward
            network (``forward_expansion * embed_size``).
    """

    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super().__init__()
        hidden_size = forward_expansion * embed_size

        self.attention = SelfAttention(embed_size, heads)
        # LayerNorm normalises over the embedding axis of every position.
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, embed_size),
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        """Run attention and the feed-forward network with skip connections."""
        attended = self.attention(value, key, query, mask)

        # First skip connection: the query is added back to the attention output.
        hidden = self.dropout(self.norm1(attended + query))

        # Second skip connection around the feed-forward network.
        expanded = self.feed_forward(hidden)
        return self.dropout(self.norm2(expanded + hidden))

In [None]:
class Encoder(nn.Module):
    """Stack of ``TransformerBlock`` layers over word + position embeddings.

    Args:
        src_vocab_size: number of rows of the word embedding table.
        embed_size: size of the embedding axis.
        num_layers: number of ``TransformerBlock`` layers.
        heads: number of attention heads per layer.
        device: stored on the instance for backward compatibility; ``forward``
            derives the device from its input instead.
        forward_expasnion: feed-forward width multiplier passed to each layer
            (parameter name kept as-is for existing callers).
        dropout: dropout probability.
        max_length: number of rows of the position embedding table, i.e. the
            longest sequence that can be embedded.
    """

    def __init__(self, src_vocab_size, embed_size, num_layers, heads, device, forward_expasnion, dropout, max_length):
        super(Encoder, self).__init__()
        self.embed_size = embed_size
        self.device = device

        self.word_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                TransformerBlock(embed_size, heads, dropout=dropout, forward_expansion=forward_expasnion)
                for _ in range(num_layers)
            ]
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Encode a batch of token ids.

        Args:
            x: integer tensor of shape (N, seq_length).
            mask: attention mask forwarded unchanged to every layer.

        Returns:
            Tensor of shape (N, seq_length, embed_size).
        """
        N, seq_length = x.shape

        # Positions 0..seq_length-1, repeated for every row of the batch.
        # They are created directly on the input's device so the lookup works
        # wherever the model and its inputs live, regardless of ``self.device``.
        positions = torch.arange(0, seq_length, device=x.device).expand(N, seq_length)

        # Sum of the word embedding and the embedding of each word's position.
        out = self.dropout(self.word_embedding(x) + self.position_embedding(positions))

        # Self-attention: the same tensor serves as value, key and query.
        for layer in self.layers:
            out = layer(out, out, out, mask)

        return out

In [None]:
class DecoderBlock(nn.Module):
    """One decoder layer: masked self-attention feeding a ``TransformerBlock``.

    Args:
        embed_size: size of the embedding axis.
        heads: number of attention heads.
        forward_expansion: feed-forward width multiplier of the inner block.
        dropout: dropout probability.
        device: accepted for signature compatibility; not used by this class.
    """

    def __init__(self, embed_size, heads, forward_expansion, dropout, device):
        super().__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm = nn.LayerNorm(embed_size)
        self.transformer_block = TransformerBlock(embed_size, heads, dropout, forward_expansion)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, value, key, src_mask, trg_mask):
        """Attend over ``x`` itself, then over ``value``/``key``.

        ``trg_mask`` is applied to the self-attention over ``x``; ``src_mask``
        is applied inside the inner block, where the result of the first step
        is used as the query.
        """
        self_attended = self.attention(x, x, x, trg_mask)

        # Skip connection around the self-attention, then norm and dropout.
        decoder_query = self.dropout(self.norm(self_attended + x))

        return self.transformer_block(value, key, decoder_query, src_mask)

In [None]:
class Decoder(nn.Module):
    """Stack of ``DecoderBlock`` layers ending in a projection to the vocabulary.

    Args:
        trg_vocab_size: number of rows of the word embedding table and width
            of the output projection.
        embed_size: size of the embedding axis.
        num_layers: number of ``DecoderBlock`` layers.
        heads: number of attention heads per layer.
        forward_expansion: feed-forward width multiplier passed to each layer.
        dropout: dropout probability.
        device: stored on the instance for backward compatibility; ``forward``
            derives the device from its input instead.
        max_length: number of rows of the position embedding table, i.e. the
            longest sequence that can be embedded.
    """

    def __init__(self, trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length):
        super(Decoder, self).__init__()

        self.device = device
        self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [DecoderBlock(embed_size, heads, forward_expansion, dropout, device) for _ in range(num_layers)]
        )

        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, trg_mask):
        """Decode a batch of target token ids against the encoder output.

        Args:
            x: integer tensor of shape (N, seq_length).
            enc_out: encoder output, used as both value and key in every layer.
            src_mask: mask forwarded to the attention over ``enc_out``.
            trg_mask: mask forwarded to the self-attention over ``x``.

        Returns:
            Unnormalised scores of shape (N, seq_length, trg_vocab_size).
        """
        N, seq_length = x.shape

        # Build the position indices on the input's device so the embedding
        # lookup works wherever the model lives, regardless of ``self.device``.
        positions = torch.arange(0, seq_length, device=x.device).expand(N, seq_length)
        x = self.dropout(self.word_embedding(x) + self.position_embedding(positions))

        # Each layer refines x while attending over the encoder output.
        for layer in self.layers:
            x = layer(x, enc_out, enc_out, src_mask, trg_mask)

        return self.fc_out(x)

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder model built from ``Encoder`` and ``Decoder``.

    Args:
        src_vocab_size: size of the source vocabulary.
        trg_vocab_size: size of the target vocabulary.
        src_pad_idx: token id treated as padding in the source mask.
        trg_pad_idx: stored on the instance; not used when building masks.
        embed_size: size of the embedding axis.
        num_layers: number of layers in both the encoder and the decoder.
        forward_expansion: feed-forward width multiplier.
        heads: number of attention heads.
        dropout: dropout probability.
        device: forwarded to the encoder/decoder and stored on the instance.
            Masks are created on the device of the tensors passed to
            ``forward``, so this value no longer has to match them.
        max_length: longest sequence the position embeddings can represent.
    """

    def __init__(
        self,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        trg_pad_idx,
        embed_size = 512,
        num_layers = 3,
        forward_expansion = 4,
        heads = 8,
        dropout = 0.10,
        device = 'cuda',
        max_length = 100):
        super(Transformer, self).__init__()

        self.encoder = Encoder(src_vocab_size, embed_size, num_layers, heads, device, forward_expansion, dropout, max_length)
        self.decoder = Decoder(trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length)

        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        """Return a boolean mask of shape (N, 1, 1, src_len).

        Entries are ``True`` where ``src`` differs from ``src_pad_idx`` so that
        padding positions receive no attention. The comparison result already
        lives on ``src``'s device, so no transfer is needed.
        """
        return (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)

    def make_trg_mask(self, trg):
        """Return a lower-triangular mask of shape (N, 1, trg_len, trg_len).

        Row ``i`` has ones in columns ``0..i`` and zeros afterwards, e.g. for
        ``trg_len == 3``::

            [1, 0, 0]
            [1, 1, 0]
            [1, 1, 1]

        so a position can only attend to itself and earlier positions. The
        mask is created on ``trg``'s device.
        """
        N, trg_len = trg.shape
        lower_triangle = torch.tril(torch.ones((trg_len, trg_len), device=trg.device))
        return lower_triangle.expand(N, 1, trg_len, trg_len)

    def forward(self, src, trg):
        """Return unnormalised target-vocabulary scores for every ``trg`` position.

        No softmax is applied here; the loss function is expected to do it.
        """
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)

        enc_src = self.encoder(src, src_mask)
        return self.decoder(trg, enc_src, src_mask, trg_mask)

In [None]:
class Util():
    """
        Util class used in order to load the training text.
        The text is loaded from a tab-separated file: the first column is the
        input sentence and the second column is the target sentence.
    """

    def __init__(self, file_path, max_len):
        # Stored for callers; not otherwise used inside this class.
        self.max_len = max_len
        # List of (sentence words, target words) tuples; ``test`` is always empty.
        self.data, self.test = self.read_file(file_path)
        # Word <-> index dictionaries for the input and the output side.
        (
            self.word_2_index_in,
            self.word_2_index_out,
            self.index_2_word_in,
            self.index_2_word_out,
        ) = self.create_dictionary(self.data, self.test)

    # Turn a Unicode string to plain ASCII, thanks to
    # https://stackoverflow.com/a/518232/2809427
    def unicodeToAscii(self, s):
        """Drop combining marks (category 'Mn') after NFD normalisation."""
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn'
        )

    def _normalize(self, text):
        """Split ``text`` on single spaces and clean every word.

        Each word is lowercased, stripped of ASCII punctuation and surrounding
        whitespace, reduced to unaccented characters and finally filtered so
        that only the characters ``a-zA-Z.!?}{`` remain. Words that end up
        empty are kept, so the number of words equals the number of pieces
        produced by the split.
        """
        strip_punctuation = str.maketrans('', '', string.punctuation)
        words = []
        for word in text.split(' '):
            word = word.lower().translate(strip_punctuation)
            word = self.unicodeToAscii(word.strip())
            word = re.sub(r"([.!?}{])", r" \1", word)
            word = re.sub(r"[^a-zA-Z.!?}{]+", r"", word)
            words.append(word)
        return words

    def read_file(self, file_path):
        """
            Read the training data file and append the sentence -> target
            to the data array.

            Lines that do not contain at least two tab-separated columns
            (for example blank lines) are skipped.

            Returns a tuple ``(data, test)``; ``test`` is always an empty list.
        """
        data = []
        test = []

        # The context manager guarantees the file handle is closed.
        with open(file_path, "r", encoding='utf-8') as f:
            for line in f:
                columns = line.split('\t')[0:2]
                if len(columns) < 2:
                    continue
                data.append((self._normalize(columns[0]), self._normalize(columns[1])))

        return data, test

    @staticmethod
    def _add_words(words, word_2_index, index_2_word):
        """Give every unseen word the next free index in both dictionaries."""
        for word in words:
            if word not in word_2_index:
                index_2_word[len(index_2_word)] = word
                word_2_index[word] = len(word_2_index)

    def create_dictionary(self, data, test):
        """
            Iterate over each sentence in ``data`` and then ``test`` and add the
            words to the dictionaries. Indices 0, 1 and 2 are reserved for
            'PAD', 'SOS' and 'EOS' on both sides.

            Returns ``(word_2_index_in, word_2_index_out, index_2_word_in,
            index_2_word_out)``.
        """
        special_tokens = ('PAD', "SOS", "EOS")

        word_2_index_in = {token: index for index, token in enumerate(special_tokens)}
        word_2_index_out = {token: index for index, token in enumerate(special_tokens)}
        index_2_word_in = {index: token for index, token in enumerate(special_tokens)}
        index_2_word_out = {index: token for index, token in enumerate(special_tokens)}

        for pairs in (data, test):
            for sentence, target in pairs:
                self._add_words(sentence, word_2_index_in, index_2_word_in)
                self._add_words(target, word_2_index_out, index_2_word_out)

        return word_2_index_in, word_2_index_out, index_2_word_in, index_2_word_out

    def get_values_input(self, inp):
        """
            Turn a list of input words into a tensor of dictionary indices,
            wrapped in SOS (1) at the start and EOS (2) at the end.
            Raises KeyError for a word missing from the input dictionary.
            The tensor is moved to the notebook-level ``device``.
        """
        input_tensor = [1] + [self.word_2_index_in[word] for word in inp] + [2]

        return torch.LongTensor(input_tensor).to(device)

    def get_values_out(self, out):
        """
            Turn a list of target words into a tensor of dictionary indices,
            wrapped in SOS (1) at the start and EOS (2) at the end.
            Raises KeyError for a word missing from the output dictionary.
            The tensor is moved to the notebook-level ``device``.
        """
        output_tensor = [1] + [self.word_2_index_out[word] for word in out] + [2]

        return torch.LongTensor(output_tensor).to(device)
    
# Load the tab-separated corpus and build the vocabularies, then display the
# first input sentence encoded as a tensor of word indices.
util = Util(file_path="RO-ENG.txt", max_len=65)
util.get_values_input(util.data[0][0])

In [None]:
# Define the encoder and decoder architecture.
# Index 0 is the 'PAD' entry of both vocabularies (see Util.create_dictionary).
src_pad_idx = 0
trg_pad_idx = 0
src_vocab_size = len(util.word_2_index_in)
trg_vocab_size = len(util.word_2_index_out)

# Hand the notebook's device to the model explicitly instead of relying on the
# constructor default ('cuda'), so every part of the model agrees on where the
# tensors live. A single .to(device) is enough to move the parameters.
model = Transformer(
    src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, device=device
).to(device)

# Define the optimizer used for the whole model.
transformer_optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

# Define a criterion to calculate the error; padded target positions are ignored.
criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)

In [None]:
def learn(input_tensor, target_tensor):
    """
        Run one optimisation step on a batch and return its loss as a float.

        input_tensor: (N, src_len) tensor of input word indices.
        target_tensor: (N, trg_len) tensor of target word indices. The model
        is fed every column except the last one and is scored against every
        column except the first one, i.e. it has to predict the next word.

        Uses the notebook-level ``model``, ``transformer_optimizer`` and
        ``criterion``.
    """
    # Make sure dropout is active even if the model was switched to eval mode.
    model.train()
    transformer_optimizer.zero_grad()

    out = model(input_tensor, target_tensor[:, :-1])

    # The criterion expects (N, classes) scores and (N) targets, so the batch
    # axis and the word axis are flattened into one.
    out = out.reshape(-1, out.shape[2])
    expected = target_tensor[:, 1:].reshape(-1)

    loss = criterion(out, expected)
    loss.backward()

    # Clip to avoid exploding gradient issues, makes sure grads are
    # within a healthy range.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

    # Gradient descent step.
    transformer_optimizer.step()

    return loss.item()


In [None]:
def test_sentence():
    """
        Greedily translate three fixed input sentences and print the input
        words followed by the predicted words.

        The inputs are hard-coded lists of input-vocabulary indices that start
        with SOS (1) and end with EOS (2). Decoding produces at most as many
        words as the input has tokens and stops early on EOS.

        The model is put in evaluation mode while decoding (so dropout does
        not randomise the predictions) and its previous mode is restored
        afterwards.
    """
    inpts = [torch.tensor([1, 34, 36, 2]).to(device)
          , torch.tensor([1, 161, 162, 2]).to(device)
          , torch.tensor([1, 169, 170, 78, 72,2]).to(device)
          ]

    was_training = model.training
    model.eval()
    try:
        for inp in inpts:
            inp_words = [util.index_2_word_in[i.item()] for i in inp]
            print(inp_words)

            target = [1]  # the decoded sequence starts with SOS
            with torch.no_grad():
                for _ in range(len(inp)):
                    trg_tensor = torch.LongTensor(target).view(1, -1).to(device)
                    prediction = model(inp.view(1, -1), trg_tensor)
                    # Scores for the word following the last decoded position.
                    _, topi = prediction[0][-1].topk(1)
                    target.append(topi.item())
                    if topi.item() == 2:
                        break

            target_words = [util.index_2_word_out[i] for i in target]
            print(target_words)
            print('---')
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

test_sentence()

In [None]:
def train():
    """
        Train the model for NR_EPOCHS passes over ``util.data``.

        Every epoch walks the data in consecutive, non-overlapping batches of
        BATCH_SIZE pairs, pads each batch with index 0 up to MAX_TOKENS columns
        and calls ``learn`` on it. After each epoch the sample translations
        are printed, followed by the summed batch loss.

        NOTE(review): all of ``util.data`` is used here, including the slice
        ``util.data[11000:]`` that ``evaluate_model`` scores later - confirm
        whether a held-out split is intended.
    """
    NR_EPOCHS = 1500
    BATCH_SIZE = 64
    MAX_TOKENS = 100  # width of the padded batch tensors

    for _ in range(NR_EPOCHS):
        loss = 0
        # Step through the data by batch start offset so that every pair is
        # visited exactly once per epoch.
        for start in tqdm_notebook(range(0, len(util.data), BATCH_SIZE)):
            batch = util.data[start:start + BATCH_SIZE]

            # One row per pair actually present (the last batch may be short).
            sentence = torch.zeros(len(batch), MAX_TOKENS, dtype=torch.long, device=device)
            target = torch.zeros(len(batch), MAX_TOKENS, dtype=torch.long, device=device)

            for row, (input_words, target_words) in enumerate(batch):
                # Clip over-long sequences so the assignment always fits.
                input_ids = util.get_values_input(input_words)[:MAX_TOKENS]
                target_ids = util.get_values_out(target_words)[:MAX_TOKENS]
                sentence[row, :len(input_ids)] = input_ids
                target[row, :len(target_ids)] = target_ids

            loss += learn(sentence, target)

        test_sentence()
        print(loss)
train()

In [None]:
# Display the input-side vocabulary (word -> index) built by Util.
util.word_2_index_in

In [None]:
# Display the output-side vocabulary (word -> index) built by Util.
util.word_2_index_out

In [None]:
# Print the (sentence, target) pairs from index 11000 onward - the same slice
# that evaluate_model scores. Iterating over the slice itself avoids indexing
# from 0, which would print the first pairs of the dataset instead.
for pair in util.data[11000:]:
    print(pair)

In [None]:
def evaluate_model():
    """
        Greedily translate every pair in ``util.data[11000:]``.

        The input sentence is encoded with ``util.get_values_input`` so that
        it carries the same SOS/EOS wrapping the model sees during training.
        Decoding starts from SOS (1), produces at most 100 words and stops
        early on EOS (2). The model is put in evaluation mode while decoding
        and its previous mode is restored afterwards.

        Returns a list of ``(predicted, reference)`` tuples. ``predicted`` is
        a (1, length) tensor that starts with SOS; ``reference`` is a
        (1, length) tensor of the target word indices without SOS/EOS.
    """
    prediction_output = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for input_words, reference_words in tqdm_notebook(util.data[11000:]):
                sentence = util.get_values_input(input_words).view(1, -1)
                target_pred = torch.LongTensor(
                    [util.word_2_index_out[word] for word in reference_words]
                ).view(1, -1).to(device)

                target = [1]  # target starts with SOS
                for _ in range(100):
                    trg_tensor = torch.LongTensor(target).view(1, -1).to(device)
                    prediction = model(sentence, trg_tensor)
                    # Scores for the word following the last decoded position.
                    _, topi = prediction[0][-1].topk(1)
                    target.append(topi.item())
                    if topi.item() == 2:
                        break

                prediction_output.append(
                    (torch.LongTensor(target).view(1, -1).to(device), target_pred)
                )
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

    return prediction_output

prediction_output = evaluate_model()

In [None]:
from nltk.translate.bleu_score import sentence_bleu

prediction = []
target = []
total_blue = 0
for sent, tar in prediction_output:
    # Drop the leading SOS, and the trailing EOS only when decoding actually
    # ended on it (a sequence cut off at the length limit keeps its last word).
    predicted_ids = sent.view(-1).tolist()[1:]
    if predicted_ids and predicted_ids[-1] == 2:
        predicted_ids = predicted_ids[:-1]

    sentence_it = [util.index_2_word_out[index] for index in predicted_ids]
    target_it = [util.index_2_word_out[index] for index in tar.view(-1).tolist()]
    print(target_it)
    print(sentence_it)
    print('----------')
    # Uniform weights over 1- to 4-grams that sum to 1, so the score is a
    # proper geometric mean of the n-gram precisions.
    total_blue += sentence_bleu([target_it], sentence_it, weights=(0.25, 0.25, 0.25, 0.25))

print(len(prediction_output))
# Average sentence-level score; 0.0 when there was nothing to evaluate.
print(total_blue / len(prediction_output) if prediction_output else 0.0)