In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.functional as F
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

For this network, it is typical to encode each word as a one-hot vector. To do this, I will use a Lang class that keeps track of the index of each word as it appears. The model will then be trained to find similarities between words so that their encodings carry meaning. This is a compute-heavy process, and it may be better to use pre-trained vectors, so I may return and switch to something like word2vec once I complete my initial implementation.

In [2]:
# Reserved vocabulary indices for the start- and end-of-sentence markers.
SOS_TOKEN = 0
EOS_TOKEN = 1

class Lang:
    """Vocabulary for one language: word <-> index maps plus word counts."""

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        # Number of times each word has been added; used later to replace rare words.
        self.word2count = {}
        # The two special tokens occupy the first indices of every vocabulary.
        self.index2word = {SOS_TOKEN: "SOS", EOS_TOKEN: "EOS"}
        # Next free index, which is also the vocabulary size including SOS and EOS.
        self.n_words = len(self.index2word)

    def addWord(self, word):
        """Record one occurrence of `word`, assigning it the next index if unseen."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        new_index = self.n_words
        self.word2index[word] = new_index
        self.index2word[new_index] = word
        self.word2count[word] = 1
        self.n_words = new_index + 1

    def addSentence(self, sentence):
        """Add every token of `sentence`, where tokens are separated by single spaces."""
        for token in sentence.split(' '):
            self.addWord(token)

In [3]:
# turn unicode to ascii
def unicodeToAscii(s):
    """Return `s` with combining accent marks removed.

    The string is NFD-normalized, which splits an accented character into its
    base character followed by its combining mark(s). Every character whose
    Unicode category is 'Mn' is then dropped; all other characters are kept.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            # combining mark left over from the decomposition; skip it
            continue
        kept.append(ch)
    return ''.join(kept)

def formatString(s: str):
    """Normalize a raw sentence into lowercase, space-separated tokens.

    The text is lowercased, stripped, and passed through unicodeToAscii.
    '!' and '?' are split off as their own tokens; every other run of
    non-letter characters becomes a single space.
    """
    cleaned = s.lower().strip()
    cleaned = unicodeToAscii(cleaned)
    # Put a space in front of '.', '!' and '?' so they detach from the preceding word.
    cleaned = re.sub(r"([.!?])", r" \1", cleaned)
    # Collapse each run of characters other than letters, '!' and '?' into one space.
    # '.' is not in the kept set, so periods are removed by this step.
    cleaned = re.sub(r"[^a-zA-Z!?]+", r" ", cleaned)
    return cleaned.strip()

I usually have trouble speaking Spanish, but I can understand it just fine. For this implementation, I'm going to supplement my learning and see if the model can translate from English to Spanish (my weakness) better than I can. However, we will also use a flag so the direction can be reversed at any time.

In [4]:
def readLines(spa_to_eng=False, path="data/spa-eng/cleaned.txt"):
    """Load the sentence pairs from disk and create empty Lang objects.

    Each line of the file is split on tabs and every field is cleaned with
    formatString. With the default direction the first field is treated as
    the English input and the second as the Spanish output.

    Args:
        spa_to_eng: when True, reverse every pair so Spanish is the input
            language and English is the output language.
        path: location of the tab-separated corpus (UTF-8 encoded).

    Returns:
        (input_lang, output_lang, pairs), where the two Lang objects are
        still empty and `pairs` is a list of lists of formatted sentences.
    """
    print("Reading lines")

    # Use a context manager so the file handle is closed as soon as it is read.
    with open(path, encoding='utf-8') as corpus:
        lines = corpus.read().strip().split('\n')
    # format the strings and keep the two sides of each line together
    pairs = [[formatString(s) for s in line.split('\t')] for line in lines]

    # create language objects for use later
    if spa_to_eng:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang("spa")
        # named "eng" to match the non-reversed branch
        output_lang = Lang("eng")
    else:
        input_lang = Lang("eng")
        output_lang = Lang("spa")

    return input_lang, output_lang, pairs

I am limiting the training data for the initial passes to make sure the approach works. I will slowly incorporate more data later as I find better/faster ways to train this large model.

In [5]:
# Sentence openings used to select the reduced training subset.
eng_prefixes = (
    "i am ",
    "i m ",
    "he is",
    "he s ",
    "she is",
    "she s ",
    "you are",
    "you re ",
    "we are",
    "we re ",
    "they are",
    "they re ",
)

def isValidPair(pair):
    """Return True when either sentence of the two-element `pair` starts with one of eng_prefixes."""
    first, second = pair
    if first.startswith(eng_prefixes):
        return True
    return second.startswith(eng_prefixes)

def filterPairs(pairs):
    """Return a new list holding only the pairs accepted by isValidPair, in their original order."""
    kept = []
    for candidate in pairs:
        if isValidPair(candidate):
            kept.append(candidate)
    return kept

In [6]:
def prepareData(spa_to_eng=False):
    """Read the corpus, keep only the valid pairs, and build both vocabularies.

    Args:
        spa_to_eng: forwarded to readLines to choose the translation direction.

    Returns:
        (input_lang, output_lang, pairs): the populated Lang objects and the
        filtered list of sentence pairs.
    """
    # read in all lines
    source_lang, target_lang, all_pairs = readLines(spa_to_eng)
    print("Read %s sentence pairs" % len(all_pairs))

    # filter down pairs for easier training
    kept_pairs = filterPairs(all_pairs)
    print("Trimmed to %s sentence pairs" % len(kept_pairs))

    # populate the Language objects
    print("Counting words...", '\n')
    for kept in kept_pairs:
        source_lang.addSentence(kept[0])
        target_lang.addSentence(kept[1])

    # report the vocabulary size of each language
    print("Counted words:")
    for lang in (source_lang, target_lang):
        print(lang.name, lang.n_words)
    return source_lang, target_lang, kept_pairs

# Build the vocabularies and the filtered sentence pairs using the default direction.
input_lang, output_lang, pairs = prepareData()
# Print five randomly chosen pairs (drawn with replacement) as a sanity check.
for _ in range(5):
    print(random.choice(pairs))
    

Reading lines
Read 142511 sentence pairs
Trimmed to 10584 sentence pairs
Counting words... 

Counted words:
eng 3341
spa 5006
['i m not the same fool i was fifteen years ago', 'yo no soy el mismo tonto que era hace quince anos']
['you re too drunk', 'sos demasiado borracho']
['we re looking for him', 'lo estamos buscando']
['i m not so sure that was a good idea', 'no estoy tan seguro de que fuera buena idea']
['they re not following me', 'no me estan siguiendo']


Here, I am using a sequence-to-sequence (seq2seq) model to capture the many-to-many relationship necessary for translation. This uses two RNNs: one to encode the input words and one to decode that encoding into the translated output. In a single RNN, every input corresponds to an output, but in a seq2seq model the output does not have to match the input's word order or word count. This makes it great for translation, because two languages will usually not use the same number of words in the same order for the same phrase.

## Encoder

The encoder is an RNN that takes the input words and compresses them into a single point in n-dimensional space. Ideally, this vector holds the meaning of the sentence.

As is typical with an RNN, the encoder produces an output vector and a hidden state for each input word, then feeds that hidden state back in when processing the next word (hence the "Recurrent" in Recurrent Neural Network).

In [7]:
class Encoder(nn.Module):
    """GRU encoder: embeds token indices and runs the embeddings through a GRU.

    Args:
        input_size: vocabulary size (number of rows in the embedding table).
        hidden_size: size of both the embedding vectors and the GRU hidden state.
        dropout_p: dropout probability applied to the embeddings.
    """

    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size

        # Map each token index to a dense vector of length hidden_size.
        # Shape: (batch_size, seq_length) -> (batch_size, seq_length, hidden_size)
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        # Gated recurrent unit that processes the embedded sequence;
        # batch_first=True puts the batch dimension first in inputs and outputs.
        self.gru = nn.GRU(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)
        # Randomly zero embedding values during training to reduce overfitting.
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, x):
        """Encode a batch of token-index sequences.

        Args:
            x: tensor of token indices with shape (batch_size, sequence_length).

        Returns:
            (output, hidden): output has shape (batch_size, seq_length, hidden_size)
            and hidden has shape (1, batch_size, hidden_size).
        """
        token_vectors = self.embedding(x)
        token_vectors = self.dropout(token_vectors)
        encoder_outputs, final_hidden = self.gru(token_vectors)
        return encoder_outputs, final_hidden