In [0]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

In [0]:
import spacy

In [0]:
import torchtext
from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator

In [0]:
seed = 25
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True # To make our results reproduciable elsewhere

In [0]:
!python -m spacy download en
!python -m spacy download de

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
Collecting de_core_news_sm==2.1.0 from https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.1.0/de_core_news_sm-2.1.0.tar.gz#egg=de_core_news_sm==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.1.0/de_core_news_sm-2.1.0.tar.gz (11.1MB)
[K     |████████████████████████████████| 11.1MB 624kB/s 
[?25hBuilding wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.1.0-cp36-none-any.whl size=11073065 sha256=12093b89a1ae8387bf9bfc88f4676ca5e0744439931c82556783bca789448175
  Stored in dire

In [0]:
spacy_german = spacy.load('de')
spacy_english = spacy.load('en')

In [0]:
def german_tokenizer(text):
    r"""
    This create a sequence of german tokens and reverses it
    Note :: Reversing the strings helps our encoder to perform better 
            in Machine Translation
    """
    
    return [token.text for token in spacy_german.tokenizer(text)]

In [0]:
def english_tokenizer(text):
    r"""
    This create a sequence of english tokens
    Note :: We don't the everse the text here because we are giving this 
            to our decoder 
    """
    
    return [token.text for token in spacy_english.tokenizer(text)]

In [0]:
# DE defines the field used for german
DE = torchtext.data.Field(tokenize=german_tokenizer,
                             init_token='<sos>',
                             eos_token = '<eos>',
                             lower=True)

# EN defined the firld used for english
EN = torchtext.data.Field(tokenize=english_tokenizer,
                             init_token='<sos>',
                             eos_token = '<eos>',
                             lower=True)

In [0]:
# The data is been divided into train, validation and test data
# exts (extensions) for the languages are represented with the turple below
# The fields are represented in the vector below
train_data, validation_data, test_data = torchtext.datasets.Multi30k.splits(
                                                            exts = ('.de', '.en'),
                                                            fields = (DE, EN))

training.tar.gz:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:00<00:00, 7.01MB/s]
validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 1.80MB/s]


downloading validation.tar.gz
downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 1.69MB/s]


In [0]:
# This builds_vocabs method helps to build word vocabulary and remove 
# the less frequent words 
DE.build_vocab(train_data, min_freq = 2)
EN.build_vocab(train_data, min_freq = 2)

In [0]:
# num_german_word_types is the number of unique words in our corpus or vocabulary
num_german_word_types = len(DE.vocab)

# num_english_word_types is the number of unique words in our corpus or vocabulary
num_english_word_types = len(EN.vocab)

print(f'The number of german wordtypes {num_german_word_types}')
print(f'The number of english wordtypes {num_english_word_types}')

The number of german wordtypes 7855
The number of english wordtypes 5893


In [0]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [0]:
device

device(type='cuda', index=0)

In [0]:
batch_size_ = 128
train_iterator, validation_iterator, test_iterator = torchtext.data.BucketIterator.splits(
                                                                    datasets=(train_data, validation_data, test_data),
                                                                    batch_size = batch_size_,
                                                                    # device uses the gpu if available
                                                                    device=device)

In [0]:
class Encoder(nn.Module):
    
    def __init__(self, 
                 word_types, # Number of unique words in the vocabulary 
                 embed_dim, #This is the embedding dimesion and it's usually 300
                 hidden_size, # This is the number of hidden unit in our elman network
                 number_layers, # This the number of LSTM layers stacked on eachother
                 dropout_ # This is a regularizer
                 ):
        
        super(Encoder, self).__init__()
        # Instancialising the embedding layers
        self.embed = nn.Embedding(word_types, embed_dim)
        # Instancialising the long short term memory
        self.lstm = nn.LSTM(embed_dim, hidden_size, number_layers, dropout=dropout_)
        # Instancialising the dropout layer
        self.dropout = nn.Dropout(dropout_)
        
    def forward(self, source_language):
        
        # The langauge is embedded and to reduce dimensionality and regularizer was aplied
        embeddings = self.dropout(self.embed(source_language))
        ##################################################################
        #                                                                #
        #  From the pytorch documentations we are given this values as:  #
        #  input = (seq_len, batch, input_size)                          #
        #  ::Note:: `num_directions` is 1 here because it's not a BiRNN  #
        #  h_0 = (num_layers * num_directions, batch, hidden_size)       #
        #  c_0 = (num_layers * num_directions, batch, hidden_size)       #
        #  output = (seq_len, batch, num_directions * hidden_size)       #
        #  h_n = (num_layers * num_directions, batch, hidden_size)       #
        #  c_n = (num_layers * num_directions, batch, hidden_size)       #
        #                                                                #
        ##################################################################
        # Note only the input(embeddings) to the LSTM are given hence
        # (h_0, c_0) will default to zeros at the initial time state
        # (h_n, c_n) these are the  hidden and cell state at the next time step
        output, (h_n, c_n) = self.lstm(embeddings)
        
        return h_n, c_n
        

In [0]:
class Decoder(nn.Module):
    
    def __init__(self, 
                 word_types, # Number of unique words in the vocabulary 
                 embed_dim, #This is the embedding dimesion and it's usually 300
                 hidden_size, # This is the number of hidden unit in our elman network
                 number_layers, # This the number of LSTM layers stacked on eachother
                 dropout_, # This is a regularizer
                 output_dim # The out dimension of the classifier
                 ):
        
        super(Decoder, self).__init__()
        # Output dimension
        self.output_dim = output_dim
        # Instancialising the embedding layers
        self.embed = nn.Embedding(word_types, embed_dim)
        # Instancialising the long short term memory
        self.lstm = nn.LSTM(embed_dim, hidden_size, number_layers, dropout=dropout_)
        # Instancialising the dropout layer
        self.dropout = nn.Dropout(dropout_)
        # The classification layer
        self.out = nn.Linear(hidden_size, output_dim)
        
        
    def forward(self, target_language, hidden, cell):
        """
        Arguments:
            target_language: The language we want to translate to and it also the decoder input
            hidden: The source hidden state
            cell: The source cell state
        """
        # target_language [batch_size]
        target_language = target_language.unsqueeze(0) # this gives our target_language a sequence lenght of 1 i.e [1, batch_size]
        
        # The langauge is embedded to reduce dimensionality and regularizer was aplied
        embeddings = self.dropout(self.embed(target_language))
        ##################################################################
        #                                                                #
        #  From the pytorch documentations we are given this values as:  #
        #  input = (seq_len, batch, input_size)                          #
        #  ::Note:: `num_directions` is 1 here because it's not a BiRNN  #
        #  h_0 = (num_layers * num_directions, batch, hidden_size)       #
        #  c_0 = (num_layers * num_directions, batch, hidden_size)       #
        #  output = (seq_len, batch, num_directions * hidden_size)       #
        #  h_n = (num_layers * num_directions, batch, hidden_size)       #
        #  c_n = (num_layers * num_directions, batch, hidden_size)       #
        #                                                                #
        ##################################################################
        # output has a dim => [seq_len=1, batch_size, hidden_size]
        output, (hidden, cell) = self.lstm(embeddings, (hidden, cell))
        # output has a dim => [batch_size, hidden_size]
        output = output.squeeze(0)
        # prediction has a dim => [batch_size, output_dim]
        prediction = self.out(output)
        return prediction, hidden, cell
        

In [0]:
# Sequence to Sequence
class Seq2seq(nn.Module):
    
    def __init__(self, encoder, decoder):
        super(Seq2seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        
    def forward(self, src_lang, trg_lang, teacher_forcing_ratio=0.5):
        # src_lang as a dim => [seq_len, batch_size]
        # trg_lang as a dim => [seq_len, batch_size]
        # teacher_forcing_ratio is the probability for using the ground truth
        # here we use a value of 0.5 i.e 50% of the time we will be pass in the the ground truth at the next time step
        
        batch_size = trg_lang.shape[1]
        seq_len = trg_lang.shape[0]
        trg_vocab_size = num_english_word_types
        
        #zeros tesor with dim=> [seq_len, batch_size, trg_vocab_size] used for storing the output of the decode 
        decoder_outputs_tensor = torch.zeros(seq_len, batch_size, trg_vocab_size).to(device)
        
        # The context vector is equivalent to the last hidden and cell state of the encoder in the case of LSTM
        hidden, cell = self.encoder(src_lang)
        
        #first input to the decoder is the <sos> token i.e state of sentence token
        decoder_input = trg_lang[0,:]
        
        # We pass each batch of tokens in the sequence of token one after the other till we get to <eos>
        for t in range(1, seq_len):
            decoder_output, decoder_hidden, decoder_cell = self.decoder(decoder_input, hidden, cell)
            decoder_outputs_tensor[t] = decoder_output
            teacher_forcing = np.random.random() < teacher_forcing_ratio
            # torch.max() return a named_tuple of containing(values, indices) of the max value
            top1 = decoder_output.max(1).indices
            # top1 has a dim => [128]
            decoder_input = (trg_lang[t] if teacher_forcing else top1)
            
        return decoder_outputs_tensor
            


In [0]:
# Initializing the encoder
word_types = num_german_word_types
embed_dim = 300
hidden_size = int(1.5*embed_dim)
number_layers = 2
dropout_ = 0.2
encoder = Encoder(word_types, embed_dim, hidden_size, number_layers, dropout_).to(device)

# Initializing the decoder
word_types = num_english_word_types
embed_dim = 300
hidden_size = int(1.5*embed_dim)
number_layers = 2
dropout_ = 0.2
output_dim = num_english_word_types
decoder = Decoder(word_types, embed_dim, hidden_size, number_layers, dropout_, output_dim).to(device)

# Intializing the Seq2seq model
s2s_model = Seq2seq(encoder, decoder).to(device)

# Weights Initailization
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
s2s_model.apply(init_weights)

Seq2seq(
  (encoder): Encoder(
    (embed): Embedding(7855, 300)
    (lstm): LSTM(300, 450, num_layers=2, dropout=0.2)
    (dropout): Dropout(p=0.2)
  )
  (decoder): Decoder(
    (embed): Embedding(5893, 300)
    (lstm): LSTM(300, 450, num_layers=2, dropout=0.2)
    (dropout): Dropout(p=0.2)
    (out): Linear(in_features=450, out_features=5893, bias=True)
  )
)

In [0]:
# Optimizer 
# Here we use the default learing rate lr=1e-3
optimizer = torch.optim.Adam(s2s_model.parameters())

# Loss Function
# DE.vocab.stoi retun an orderdict with key value pairs of word and numerical representation
# PAD_IDX takes the numerical representation for <pad> 
PAD_IDX  = DE.vocab.stoi['<pad>']

# ignore_index (int, optional): Specifies a target value that is ignored
# and does not contribute to the input gradient. When :attr:`size_average` is
#  ``True``, the loss is averaged over non-ignored targets.

criteria = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [0]:
def trainer(model, n_epoch, tran_iter, valid_iter, optim, criterion, clip, model_path):
    
    model.train()
    
    batch_train_loss = 0
    batch_valid_loss = 0
    save_checker = np.inf
    for i in range(1, n_epoch+1):
        
        for batch in tran_iter:

            # src dim => [seq_len, batch_size]
            # trg dim => [seq_len, batch_size]
            src = batch.src
            trg = batch.trg

            optimizer.zero_grad()

            # output = [seq_len, batch_size, len(num_german_word_types)]
            output = model(src, trg)

            # trg dim => [(seq_len - 1)*batch_size]
            trg = trg[1:].view(-1)
            # output dim => [(seq_len - 1)*batch_size, output_dim]
            output = output[1:].view(-1, output_dim)

            train_loss = criterion(output, trg)
            train_loss.backward()
            # We use gradient clip to prevent exploding gradients
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            batch_train_loss = train_loss.item()
            optimizer.step()


            
        with torch.no_grad():
            
            for batch in valid_iter:

                # src dim => [seq_len, batch_size]
                # trg dim => [seq_len, batch_size]
                src = batch.src
                trg = batch.trg

                # output = [seq_len, batch_size, len(num_german_word_types)]
                output = model(src, trg)

                # trg dim => [(seq_len - 1)*batch_size]
                trg = trg[1:].view(-1)
                # output dim => [(seq_len - 1)*batch_size, output_dim]
                output = output[1:].view(-1, output_dim)

                valid_loss = criterion(output, trg)
                batch_valid_loss = valid_loss.item()
                
        print('[{0}/{1}] \t Training Loss: {2} \t Validation Loss {3}'
              .format(i, n_epoch, round(batch_train_loss, 5), round(batch_valid_loss, 5)))
        
        if batch_valid_loss < save_checker:
            torch.save(model.state_dict(), model_path)
            save_checker = batch_valid_loss
            print(".............................SAVING MODEL.............................")
         
             

In [0]:
clip_ = 1
num_epoch = 200
model_path_ = "tranduction_model.pt"
trainer(s2s_model, num_epoch, train_iterator, validation_iterator, optimizer, criteria, clip_, model_path_)

[1/200] 	 Training Loss: 2.56228 	 Validation Loss 4.14235
.............................SAVING MODEL.............................
[2/200] 	 Training Loss: 2.75277 	 Validation Loss 3.95206
.............................SAVING MODEL.............................
[3/200] 	 Training Loss: 2.09442 	 Validation Loss 4.14673
[4/200] 	 Training Loss: 2.3739 	 Validation Loss 3.96643
[5/200] 	 Training Loss: 2.03582 	 Validation Loss 4.03432
[6/200] 	 Training Loss: 2.09294 	 Validation Loss 4.24987
[7/200] 	 Training Loss: 1.93962 	 Validation Loss 4.25913
[8/200] 	 Training Loss: 1.79305 	 Validation Loss 4.2229
[9/200] 	 Training Loss: 1.83787 	 Validation Loss 4.30477
[10/200] 	 Training Loss: 1.82551 	 Validation Loss 4.28059
[11/200] 	 Training Loss: 1.55142 	 Validation Loss 4.31253
[12/200] 	 Training Loss: 1.70666 	 Validation Loss 4.36655
[13/200] 	 Training Loss: 2.05888 	 Validation Loss 4.15026
[14/200] 	 Training Loss: 1.90912 	 Validation Loss 4.35416
[15/200] 	 Training Loss: 1.7