In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Prefer the GPU when PyTorch can see one; otherwise run everything on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

For this network, it is typical to encode each word as a one-hot vector. For this, I will utilize a Lang class that holds on to the index of each word as it appears. The model will then be trained to find similarities between words to encode them with meaning. This is a compute-heavy process and it may be better to use pre-trained vectors, so I may return and switch to something like word2vec once I complete my initial implementation.

In [2]:
# Reserved vocabulary indices for the start- and end-of-sentence markers.
SOS_TOKEN = 0
EOS_TOKEN = 1


class Lang:
    """Vocabulary of a single language.

    Keeps a word -> index map, the inverse index -> word map and a per-word
    occurrence count. Indices 0 and 1 are pre-assigned to the "SOS" and "EOS"
    markers, so the first real word receives index 2.
    """

    def __init__(self, name):
        """Create an empty vocabulary labelled ``name``."""
        self.name = name
        self.word2index = {}
        # occurrence counts; the notebook intends to use these later to replace rare words
        self.word2count = {}
        self.index2word = {SOS_TOKEN: "SOS", EOS_TOKEN: "EOS"}
        # number of known entries, counting the two reserved markers
        self.n_words = len(self.index2word)

    def addWord(self, word):
        """Register ``word``: bump its count if known, otherwise give it the next free index."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.index2word[new_index] = word
        self.word2count[word] = 1
        self.n_words = new_index + 1

    def addSentence(self, sentence):
        """Register every token of ``sentence``, splitting on single spaces."""
        for token in sentence.split(' '):
            self.addWord(token)

In [3]:
# strip accents / diacritics from a unicode string
def unicodeToAscii(s):
    """Return ``s`` with its combining marks removed.

    The string is NFD-normalized, which splits an accented character into its
    base character followed by the combining mark(s). Every character whose
    Unicode category is 'Mn' (nonspacing mark) is then dropped; all other
    characters, ASCII or not, are kept as they are.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def formatString(s: str):
    """Normalize a raw sentence into lowercase, space-separated tokens.

    The text is lowercased, stripped and passed through ``unicodeToAscii``.
    Each of ``.``, ``!`` and ``?`` is then prefixed with a space, and every run
    of characters other than ASCII letters, ``!`` and ``?`` is collapsed into a
    single space. Because ``.`` is not in that keep-set, periods end up being
    removed; only ``!`` and ``?`` survive as separate tokens.
    """
    ascii_text = unicodeToAscii(s.lower().strip())
    # detach sentence punctuation from the preceding word
    spaced = re.sub(r"([.!?])", r" \1", ascii_text)
    # anything that is not a letter, '!' or '?' becomes a single separator space
    cleaned = re.sub(r"[^a-zA-Z!?]+", r" ", spaced)
    return cleaned.strip()

I usually have trouble speaking Spanish, but I can understand it just fine. For this implementation, I'm going to supplement my learning and see if the model can translate from English to Spanish (my weakness) better than I can. A flag (`spa_to_eng`) lets us reverse the translation direction at any time.

In [4]:
def readLines(spa_to_eng=False):
    """Read the sentence-pair file and create the two (still empty) vocabularies.

    Each line of ``data/spa-eng/cleaned.txt`` is split on tabs and every field
    is normalized with ``formatString``. With the default direction the first
    field of a line is the input sentence (language "eng") and the second the
    target (language "spa").

    Args:
        spa_to_eng: when True, reverse every pair and swap the language labels
            so the model translates Spanish -> English.

    Returns:
        ``(input_lang, output_lang, pairs)`` where the ``Lang`` objects contain
        no words yet and ``pairs`` is a list of ``[input, target]`` lists.
    """
    print("Reading lines")

    # the context manager closes the file handle as soon as the text is read
    with open("data/spa-eng/cleaned.txt", encoding='utf-8') as data_file:
        lines = data_file.read().strip().split('\n')

    # one [first_field, second_field, ...] list per line, each field normalized
    pairs = [[formatString(s) for s in line.split('\t')] for line in lines]

    # create language objects for use later
    if spa_to_eng:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang("spa")
        # was misspelled "end", which mislabelled the English vocabulary
        output_lang = Lang("eng")
    else:
        input_lang = Lang("eng")
        output_lang = Lang("spa")

    return input_lang, output_lang, pairs

For the initial passes I limit the training data to make sure the approach works. I will slowly incorporate more data later as I find better/faster ways to train this large model.

In [5]:
MAX_LENGTH = 15

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

# check that a pair is short enough and starts with one of the above prefixes
def isValidPair(pair):
    """Return True when ``pair`` should be kept for training.

    A pair is kept only if BOTH sentences have fewer than ``MAX_LENGTH``
    space-separated words AND at least one of them starts with one of
    ``eng_prefixes`` (either side is accepted so the check works in both
    translation directions).

    Args:
        pair: a two-element sequence ``(source, target)`` of normalized strings.
    """
    source, target = pair
    short_enough = (len(source.split(' ')) < MAX_LENGTH
                    and len(target.split(' ')) < MAX_LENGTH)
    # Evaluated separately on purpose: written inline as `a and b and c or d`,
    # Python parses it as `(a and b and c) or d`, which let any pair whose
    # target starts with a prefix bypass the length limits.
    has_prefix = source.startswith(eng_prefixes) or target.startswith(eng_prefixes)
    return short_enough and has_prefix

# keep only the pairs accepted by isValidPair
def filterPairs(pairs):
    """Return a new list holding the elements of ``pairs`` for which ``isValidPair`` is truthy."""
    return list(filter(isValidPair, pairs))

In [6]:
def prepareData(spa_to_eng=False):
    """Load, filter and index the sentence pairs.

    Reads every pair via ``readLines``, keeps those accepted by
    ``filterPairs`` and feeds the surviving sentences into the two ``Lang``
    vocabularies, printing progress and the resulting vocabulary sizes.

    Returns:
        ``(input_lang, output_lang, pairs)`` with populated vocabularies and
        the filtered list of pairs.
    """
    # read in all lines
    input_lang, output_lang, all_pairs = readLines(spa_to_eng)
    print("Read %s sentence pairs" % len(all_pairs))

    # filter down pairs for easier training
    pairs = filterPairs(all_pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))

    # populate the Language objects
    print("Counting words...", '\n')
    for source_sentence, target_sentence in pairs:
        input_lang.addSentence(source_sentence)
        output_lang.addSentence(target_sentence)

    print("Counted words:")
    for lang in (input_lang, output_lang):
        print(lang.name, lang.n_words)
    return input_lang, output_lang, pairs

# Build the vocabularies once and print a handful of random pairs as a sanity check.
input_lang, output_lang, pairs = prepareData()
for sample_pair in (random.choice(pairs) for _ in range(5)):
    print(sample_pair)
    

Reading lines
Read 142511 sentence pairs
Trimmed to 10491 sentence pairs
Counting words... 

Counted words:
eng 3299
spa 4933
['i m sorry tom i m afraid i can t do that', 'lo siento tom me temo que no puedo hacer eso']
['you re not doing your share', 'no estas cumpliendo con tu parte']
['i m snowed under with work', 'estoy hasta el cuello con el trabajo']
['i m not going to let anybody stop me', 'no voy a permitir que nadie me detenga']
['i m clean', 'soy limpio']


Here, I am using a sequence to sequence model to simulate the many to many relationship necessary for translation. This uses two RNNs, one to encode the input words and one to decode this interpretation as translation output. In a single RNN, every input corresponds to an output, but in a seq2seq model we do not have to worry about the order of the words or the number of words in the sentence. This makes it great for translation because usually two languages will not use the same number of words in the same order for the same phrase. 

## Encoder

The encoder is an RNN that takes the input words and compresses them into a single point in n-dimensional space. Ideally, this vector holds the meaning of the sentence. 

As is typical with an RNN, the encoder produces an output vector and a hidden state for each input word, then feeds that hidden state back in when processing the next word (hence the "Recurrent" in Recurrent Neural Network).

In [7]:
class Encoder(nn.Module):
    """Embedding + single-layer GRU encoder.

    Args:
        input_size: vocabulary size of the input language.
        hidden_size: size of both the word embeddings and the GRU hidden state.
        dropout_p: dropout probability applied to the embeddings.
    """

    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size

        # token index -> dense vector:
        # (batch_size, seq_length) -> (batch_size, seq_length, hidden_size)
        self.embedding = nn.Embedding(input_size, hidden_size)
        # recurrent layer that reads the embedded sentence (batch dimension first)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        # randomly zeroes embedding entries during training as regularization
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        """Encode a batch of index sequences.

        Args:
            x: (batch_size, seq_length) tensor of token indices.

        Returns:
            ``(output, hidden)`` as produced by the GRU: per-step outputs of
            shape (batch_size, seq_length, hidden_size) and the final hidden
            state of shape (1, batch_size, hidden_size).
        """
        token_vectors = self.embedding(x)
        token_vectors = self.dropout(token_vectors)
        return self.gru(token_vectors)

## Attention Decoder

We are using attention to allow the decoder to focus on different parts of the encoder's outputs at each step instead of relying on the one output vector to carry meaning for the whole sentence. 

We are using Bahdanau attention because it's the one I know more about. I will come back and explore different types of attention later. 

In [8]:
class BahadanauAttention(nn.Module):
    """Additive attention: score = Va(tanh(Wa(query) + Ua(keys)))."""

    def __init__(self, hidden_size):
        super().__init__()
        # projection of the query (decoder state)
        self.Wa = nn.Linear(hidden_size, hidden_size)
        # projection of the keys (encoder outputs)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        # reduces each combined vector to one scalar score
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        """Attend over ``keys`` with ``query``.

        Args:
            query: (batch_size, 1, hidden_size) as passed by the decoder; it is
                broadcast against the keys along the sequence dimension.
            keys: (batch_size, seq_len, hidden_size).

        Returns:
            ``(context, weights)`` with shapes (batch_size, 1, hidden_size) and
            (batch_size, 1, seq_len); the weights sum to 1 over ``seq_len``.
        """
        energy = torch.tanh(self.Wa(query) + self.Ua(keys))
        # (batch_size, seq_len, 1) -> (batch_size, 1, seq_len)
        scores = self.Va(energy).transpose(1, 2)

        # normalize the scores across the encoder steps
        weights = F.softmax(scores, dim=-1)
        # weighted sum of the keys, one context vector per batch element
        context = torch.bmm(weights, keys)
        return context, weights
    

In [9]:
class AttnDecoder(nn.Module):
    """GRU decoder that attends over the encoder outputs at every step.

    Args:
        hidden_size: size of the embeddings and of the GRU hidden state.
        output_size: vocabulary size of the output language.
        dropout_p: dropout probability applied to the embeddings.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(AttnDecoder, self).__init__()
        # again takes vocab word and turns it into a hidden_size vector
        self.embedding = nn.Embedding(output_size, hidden_size)
        # additive (Bahdanau) attention over the encoder outputs
        self.attention = BahadanauAttention(hidden_size)
        # the GRU input is the embedded token concatenated with the context vector
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        # maps the GRU output to one logit per vocabulary entry
        self.out = nn.Linear(hidden_size, output_size)
        # regularization
        self.dropout = nn.Dropout(dropout_p)

    def forward_step(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        Args:
            input: indices of the current tokens, (batch_size, 1).
            hidden: previous hidden state, (1, batch_size, hidden_size).
            encoder_outputs: all encoder states, (batch_size, seq_len, hidden_size).

        Returns:
            ``(output, hidden, attn_weights)``: vocabulary logits of shape
            (batch_size, 1, output_size), the new hidden state, and the
            attention distribution of shape (batch_size, 1, seq_len).
        """
        # (batch_size, 1) -> (batch_size, 1, hidden_size)
        embedded = self.dropout(self.embedding(input))

        # (1, batch_size, hidden_size) -> (batch_size, 1, hidden_size)
        # so the query lines up with the batch-first encoder outputs
        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)
        # (batch_size, 1, 2 * hidden_size)
        input_gru = torch.cat((embedded, context), dim=2)

        # output: (batch_size, 1, hidden_size)
        # hidden: (1, batch_size, hidden_size)
        output, hidden = self.gru(input_gru, hidden)
        # project the GRU output onto the vocabulary
        output = self.out(output)

        return output, hidden, attn_weights

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Decode a whole sequence.

        Args:
            encoder_outputs: (batch_size, seq_len, hidden_size).
            encoder_hidden: final encoder hidden state, (1, batch_size, hidden_size).
            target_tensor: optional (batch_size, target_len) ground-truth indices.
                When given, decoding uses teacher forcing and runs for
                ``target_len`` steps; otherwise it decodes greedily for
                ``MAX_LENGTH`` steps.

        Returns:
            ``(decoder_outputs, decoder_hidden, attentions)``: log-probabilities
            of shape (batch_size, steps, output_size), the final hidden state,
            and the stacked attention weights of shape (batch_size, steps, seq_len).
        """
        batch_size = encoder_outputs.size(0)
        # Tie the number of steps to the targets (or to MAX_LENGTH) instead of a
        # hard-coded 15, so the outputs always line up with the target tensor.
        num_steps = target_tensor.size(1) if target_tensor is not None else MAX_LENGTH

        # every sequence starts from SOS; allocate on the same device as the encoder outputs
        decoder_input = torch.full(
            (batch_size, 1), SOS_TOKEN, dtype=torch.long, device=encoder_outputs.device
        )
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []

        for i in range(num_steps):
            # run a single time step of decoding with attention
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                # teacher forcing: feed the ground-truth token as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # greedy decoding: feed back the most likely token, detached from the graph
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()

        # stack the per-step results along the time dimension
        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions

## Training

Convert each sentence pair into a pair of tensors (input and target), where each position in a tensor holds the vocabulary index of one word. 

In [10]:
def indexesFromSentence(lang: Lang, sentence: str):
    """Map each space-separated word of ``sentence`` to its index in ``lang``.

    Raises:
        KeyError: if a word is not present in ``lang.word2index``.
    """
    lookup = lang.word2index
    return [lookup[token] for token in sentence.split(' ')]

# index the sentence, terminate it with EOS and add a batch dim of 1: (seq_len,) -> (1, seq_len)
def tensorFromSentence(lang: Lang, sentence: str):
    """Return a (1, seq_len) long tensor on ``device`` holding the word indices of ``sentence`` followed by EOS_TOKEN."""
    token_ids = indexesFromSentence(lang, sentence) + [EOS_TOKEN]
    # wrapping the list once more yields the leading batch dimension directly
    return torch.tensor([token_ids], dtype=torch.long, device=device)

# breaks an (input, target) sentence pair into two index tensors
def tensorFromPair(pair):
    """Convert a sentence pair into ``(input_tensor, target_tensor)``.

    ``pair[0]`` is indexed with the module-level ``input_lang`` and ``pair[1]``
    with ``output_lang``; each result is what ``tensorFromSentence`` returns.
    """
    # The original called tensorFromPair itself with two arguments, which
    # raised a TypeError on every call; the per-sentence helper was intended.
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return input_tensor, target_tensor

def getDataLoader(batch_size: int):
    """Build the training DataLoader from freshly prepared data.

    Calls ``prepareData()``, converts every sentence to word indices followed
    by EOS_TOKEN and stores it left-aligned in a row of width ``MAX_LENGTH``.
    Rows start out zero-filled, so unused trailing slots hold index 0.

    Returns:
        ``(input_lang, output_lang, train_dataloader)`` where the loader yields
        randomly sampled ``(input, target)`` batches of long tensors on ``device``.
    """
    input_lang, output_lang, pairs = prepareData()

    def to_padded_ids(lang, sentences):
        # one fixed-width row of indices per sentence
        matrix = np.zeros((len(sentences), MAX_LENGTH), dtype=np.int32)
        for row, sentence in enumerate(sentences):
            ids = indexesFromSentence(lang, sentence) + [EOS_TOKEN]
            matrix[row, :len(ids)] = ids
        return matrix

    input_ids = to_padded_ids(input_lang, [src for src, _ in pairs])
    target_ids = to_padded_ids(output_lang, [tgt for _, tgt in pairs])

    # wrap both index matrices in torch dataset utilities
    train_data = TensorDataset(
        torch.LongTensor(input_ids).to(device),
        torch.LongTensor(target_ids).to(device),
    )
    train_dataloader = DataLoader(
        train_data, sampler=RandomSampler(train_data), batch_size=batch_size
    )
    return input_lang, output_lang, train_dataloader


In [11]:
def train_epoch(dataloader: DataLoader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    """Run one optimization pass over ``dataloader``.

    For every ``(input, target)`` batch the encoder and decoder are run (the
    decoder receives the targets, i.e. teacher forcing), the criterion is
    evaluated on the flattened predictions and targets, and both optimizers
    take one step.

    Returns:
        The sum of the batch losses divided by the number of batches.
    """
    optimizers = (encoder_optimizer, decoder_optimizer)
    batch_losses = []

    for input_tensor, target_tensor in dataloader:
        # start each batch from clean gradients
        for optimizer in optimizers:
            optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        # teacher forcing on training
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        # flatten to (batch * steps, vocab) vs (batch * steps,) for the criterion
        vocab_size = decoder_outputs.size(-1)
        loss = criterion(decoder_outputs.view(-1, vocab_size), target_tensor.view(-1))

        loss.backward()
        for optimizer in optimizers:
            optimizer.step()
        batch_losses.append(loss.item())

    return sum(batch_losses) / len(dataloader)

Helper functions for tracking and analysis

In [12]:
import time
import math

# format a duration given in seconds as "<minutes>m <seconds>s"
def asMinute(s):
    """Return ``s`` seconds formatted as ``"%dm %ds"`` (both parts truncated by ``%d``)."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return "%dm %ds" % (minutes, seconds)

# report elapsed time plus the projected time remaining
def timeSince(since, percent):
    """Return ``"<elapsed> (-<remaining>)"`` for a run started at ``since``.

    Args:
        since: start time as returned by ``time.time()``.
        percent: fraction of the work already done (must be non-zero); the
            total duration is extrapolated as ``elapsed / percent``.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / percent
    remaining = estimated_total - elapsed
    return "%s (-%s)" % (asMinute(elapsed), asMinute(remaining))

In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker

def showPlot(points):
    """Draw ``points`` as a line plot with y-axis ticks every 0.2 and show it.

    Args:
        points: sequence of values to plot against their position in the sequence.
    """
    # plt.subplots() already creates the figure; the extra plt.figure() call the
    # original made beforehand left an empty figure open on every invocation.
    fig, ax = plt.subplots()
    # place major y ticks at multiples of 0.2
    ax.yaxis.set_major_locator(ticker.MultipleLocator(base=0.2))
    ax.plot(points)
    plt.show()

Finally, train the model


In [14]:
def train(train_dataloader, encoder: nn.Module, decoder: nn.Module, num_epochs, learning_rate=0.001, print_every=100, plot_every=100):
    """Train the encoder/decoder pair and plot the averaged loss curve.

    Both modules get their own Adam optimizer and are trained with NLLLoss for
    ``num_epochs`` passes over ``train_dataloader``.

    Args:
        train_dataloader: loader yielding ``(input, target)`` batches.
        encoder: encoder module.
        decoder: decoder module.
        num_epochs: number of passes over the data.
        learning_rate: learning rate for both Adam optimizers.
        print_every: print the average loss of the last ``print_every`` epochs
            at this interval.
        plot_every: record the average loss of the last ``plot_every`` epochs
            at this interval for the final plot.
    """
    # initialize tracking vars
    start = time.time()
    plot_losses = []
    print_loss_total = 0
    plot_loss_total = 0

    # init training vars
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for epoch in range(1, num_epochs + 1):
        # perform one pass on all data pairs and compute loss
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        # print for tracking and reset the running total for the next window
        if epoch % print_every == 0:
            print_avg = print_loss_total / print_every
            print_loss_total = 0
            print("%s (%d %d%%) %.4f" % (timeSince(start, epoch / num_epochs),
                                         epoch, epoch / num_epochs * 100, print_avg))

        # record for plotting and reset the running total for the next window
        if epoch % plot_every == 0:
            # Average over the plotting window; the original divided by
            # print_every, which mis-scaled the curve whenever the two
            # intervals differed.
            plot_avg = plot_loss_total / plot_every
            plot_losses.append(plot_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

## Evaluate

In [21]:
def evaluate(encoder, decoder, sentence, input_lang: Lang, output_lang: Lang):
    """Translate one sentence without tracking gradients.

    The sentence is indexed with ``input_lang``, encoded, and decoded without
    a target tensor (so the decoder feeds back its own predictions). The most
    likely token of each step is mapped back to a word via ``output_lang``.

    Returns:
        ``(decoded_words, decoder_attn)``: the predicted words, ending with
        ``'<EOS>'`` if the end token was produced, and the decoder's attention
        weights.
    """
    with torch.no_grad():
        # (1, seq_len) tensor of input indices
        input_tensor = tensorFromSentence(input_lang, sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_out, _, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        # index of the highest-scoring vocabulary entry at every step
        best_ids = decoder_out.topk(1)[1].squeeze()

        decoded_words = []
        for token_id in best_ids.tolist():
            if token_id == EOS_TOKEN:
                # stop at the first end-of-sentence marker
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[token_id])

    return decoded_words, decoder_attn

In [16]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print ``n`` randomly chosen pairs next to the model's translation.

    Uses the module-level ``pairs``, ``input_lang`` and ``output_lang``.
    Output per sample: ``>`` source, ``=`` reference, ``<`` prediction, then a
    blank line.
    """
    for _sample in range(n):
        source, reference = random.choice(pairs)
        print('>', source)
        print('=', reference)
        predicted_words, _attn = evaluate(encoder, decoder, source, input_lang, output_lang)
        print('<', ' '.join(predicted_words))
        print('')

## Running model

In [17]:
# Model width (embedding / GRU hidden size) and mini-batch size.
hidden_size, batch_size = 128, 32

# getDataLoader() prepares the data again, so rebind the language objects to
# the ones the training tensors were indexed with.
input_lang, output_lang, train_dataloader = getDataLoader(batch_size)

encoder = Encoder(input_lang.n_words, hidden_size).to(device)
decoder = AttnDecoder(hidden_size, output_lang.n_words).to(device)

train(train_dataloader, encoder, decoder, num_epochs=80, print_every=5, plot_every=5)

Reading lines
Read 142511 sentence pairs
Trimmed to 10491 sentence pairs
Counting words... 

Counted words:
eng 3299
spa 4933
1m 26s (-21m 32s) (5 6%) 1.5721
2m 48s (-19m 41s) (10 12%) 0.7835
4m 13s (-18m 16s) (15 18%) 0.4614
5m 38s (-16m 54s) (20 25%) 0.2936
7m 1s (-15m 27s) (25 31%) 0.2059
8m 24s (-14m 1s) (30 37%) 0.1559
55m 14s (-71m 1s) (35 43%) 0.1260
158m 37s (-158m 37s) (40 50%) 0.1062
193m 25s (-150m 26s) (45 56%) 0.0927
258m 41s (-155m 12s) (50 62%) 0.0838
281m 41s (-128m 2s) (55 68%) 0.0771
327m 16s (-109m 5s) (60 75%) 0.0723
346m 9s (-79m 52s) (65 81%) 0.0681
360m 9s (-51m 27s) (70 87%) 0.0651
374m 5s (-24m 56s) (75 93%) 0.0627
382m 7s (-0m 0s) (80 100%) 0.0606


In [22]:
# Put both networks in evaluation mode (turns dropout off) before sampling translations.
for network in (encoder, decoder):
    network.eval()
evaluateRandomly(encoder, decoder)

> he is a teacher and novelist
= el es profesor y novelista
< el es profesor y novelista <EOS>

> she s not a child
= ella no es una nina
< no es una nina <EOS>

> you re going to die
= vais a morir
< vas a morir <EOS>

> you re stupid
= eres estupido
< eres estupido <EOS>

> he is playing outdoors
= esta jugando fuera
< esta jugando fuera <EOS>

> i m not used to working all night
= no estoy acostumbrado a trabajar toda la noche
< no estoy acostumbrado a trabajar toda la noche <EOS>

> i m happy to be here
= me alegro de estar aqui
< estoy contento de estar aqui aqui <EOS>

> i m sorry but right now i ve got a lot to do
= lo siento pero ahora tengo mucho que hacer
< creo que tengo hemorragia interna <EOS>

> i m going to buy a box of matches
= voy a comprar una caja de cerillas
< voy a comprar una caja de cerillas <EOS>

> i m still doing that
= lo sigo haciendo
< lo sigo haciendo eso <EOS>

