In [51]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.functional as F
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Run on the GPU when CUDA is usable in this environment; otherwise use the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [52]:
# Vocabulary indices reserved for the sentence boundary markers.
SOS_TOKEN = 0  # start-of-sentence marker
EOS_TOKEN = 1  # end-of-sentence marker


class Lang:
    """Vocabulary for one language: word <-> index tables plus word counts.

    Attributes:
        name: label given to this vocabulary (e.g. "eng").
        word2index: maps each registered word to its integer index.
        word2count: maps each registered word to how often it was added.
        index2word: maps an index back to its word; pre-populated with the
            "SOS" and "EOS" markers at SOS_TOKEN and EOS_TOKEN.
        n_words: next free index, i.e. one past the highest index in use.
    """

    def __init__(self, name):
        """Create an empty vocabulary that only knows the two marker tokens."""
        self.name = name
        self.word2index = {}
        # used later to replace rare words
        self.word2count = {}
        # Build the reserved entries from the module constants so the table
        # cannot drift out of sync with SOS_TOKEN / EOS_TOKEN.
        self.index2word = {SOS_TOKEN: "SOS", EOS_TOKEN: "EOS"}
        # Regular words are numbered right after the highest reserved index.
        self.n_words = max(self.index2word) + 1

    def addWord(self, word):
        """Register one word: assign a new index on first sight, else bump its count."""
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.word2count[word] = 1
            self.n_words += 1
        else:
            self.word2count[word] += 1

    def addSentence(self, sentence):
        """Register every token of `sentence`, tokenising on single spaces.

        Because the split is on the literal ' ' character, an empty string or
        two consecutive spaces produce an empty-string token, which is
        registered like any other word.
        """
        for word in sentence.split(' '):
            self.addWord(word)

In [53]:
def unicodeToAscii(s):
    """Return `s` with its combining accent marks removed.

    The string is NFD-normalised, which splits an accented letter into its
    base letter followed by the accent, and every code point whose Unicode
    category is 'Mn' (nonspacing mark) is dropped. All other characters are
    kept as they are, so characters with no decomposition pass through
    unchanged even when they are not ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def formatString(s: str):
    """Normalise one raw sentence into space-separated lowercase tokens.

    Steps, in order: lowercase and trim, strip accents via unicodeToAscii,
    put a space in front of each '.', '!' or '?', then replace every run of
    characters other than a-z, A-Z, '!' and '?' with a single space, and trim
    the result.

    Note that '.' is not in the kept set of the second pattern, so periods
    are removed even though the first pattern detaches them; only '!' and
    '?' survive as punctuation tokens.
    """
    text = s.lower().strip()
    text = unicodeToAscii(text)
    # detach sentence punctuation so it can stand as its own token
    text = re.sub(r"([.!?])", r" \1", text)
    # everything that is neither a letter nor !/? collapses to one space
    text = re.sub(r"[^a-zA-Z!?]+", r" ", text)
    return text.strip()

I usually have trouble speaking Spanish, but I can understand it just fine. For this implementation, I'm going to supplement my learning and see whether the model can translate from English to Spanish (my weakness) better than I can.

In [54]:
def readLines(spa_to_eng=False, path="data/spa-eng/cleaned.txt"):
    """Load the sentence-pair corpus and create empty vocabularies for it.

    Each line of the file is split on tabs and every field is normalised
    with formatString. The file is expected to list English first and
    Spanish second, since the non-reversed order is labelled eng -> spa.

    Args:
        spa_to_eng: when True, reverse every pair and swap the language
            labels so Spanish becomes the input language.
        path: location of the tab-separated corpus file; defaults to the
            path this notebook originally hard-coded.

    Returns:
        (input_lang, output_lang, pairs): two empty Lang vocabularies and
        the list of normalised pairs. The vocabularies are not filled here.
    """
    print("reading lines")

    # The context manager closes the file handle even if reading fails.
    with open(path, encoding='utf-8') as corpus:
        lines = corpus.read().strip().split('\n')

    # one list of normalised fields per line
    pairs = [[formatString(s) for s in line.split('\t')] for line in lines]

    if spa_to_eng:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang("spa")
        # was misspelled "end", which showed up in the printed vocabulary name
        output_lang = Lang("eng")
    else:
        input_lang = Lang("eng")
        output_lang = Lang("spa")

    return input_lang, output_lang, pairs

I'm limiting the training data for the initial passes to make sure the approach works. I will gradually incorporate more data later as I find better/faster ways to train this large model.

In [55]:
# English sentence openers used to cut the corpus down to a small subset.
# They are written the way formatString leaves the text: lowercase, with the
# apostrophe of a contraction turned into a space ("i m", "he s", ...).
# Entries without a trailing space ("he is", "you are", ...) also match longer
# continuations such as "he isn t".
eng_prefixes = (
    "i am ",
    "i m ",
    "he is",
    "he s ",
    "she is",
    "she s ",
    "you are",
    "you re ",
    "we are",
    "we re ",
    "they are",
    "they re ",
)

def isValidPair(pair):
    """Tell whether a sentence pair belongs in the reduced training set.

    `pair` must unpack into exactly two strings. The pair is accepted when
    either string starts with one of eng_prefixes, so the check does not
    depend on which side holds the English sentence.
    """
    source, target = pair
    if source.startswith(eng_prefixes):
        return True
    return target.startswith(eng_prefixes)

def filterPairs(pairs):
    """Return a new list with only the pairs that isValidPair accepts, in order."""
    return list(filter(isValidPair, pairs))

In [None]:
def prepareData(spa_to_eng=False):
    """Read the corpus, keep the valid pairs and build both vocabularies.

    Progress and the final vocabulary sizes are printed along the way.

    Args:
        spa_to_eng: passed through to readLines to choose the direction.

    Returns:
        (input_lang, output_lang, pairs): the filled vocabularies and the
        filtered sentence pairs.
    """
    input_lang, output_lang, pairs = readLines(spa_to_eng)
    print("Read %s sentence pairs" % len(pairs))

    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))

    print("Counting words...", '\n')
    # first element feeds the input vocabulary, second the output vocabulary
    for source_sentence, target_sentence in ((p[0], p[1]) for p in pairs):
        input_lang.addSentence(source_sentence)
        output_lang.addSentence(target_sentence)

    print("Counted words:")
    for lang in (input_lang, output_lang):
        print(lang.name, lang.n_words)
    return input_lang, output_lang, pairs

# Build the English -> Spanish vocabularies and the filtered pair list.
input_lang, output_lang, pairs = prepareData()

# Spot-check a few pairs; random.choice draws independently each time, so the
# same pair may appear more than once and the selection changes between runs.
sample_count = 5
for _ in range(sample_count):
    print(random.choice(pairs))
    

reading lines
Read 142511 sentence pairs
Trimmed to 10584 sentence pairs
Counting words... 

Counted words:
eng 3341
spa 5006
['she s been learning german for a year now', 'ella ha estado aprendiendo aleman por un ano']
['he isn t happy at all', 'el no esta contento en absoluto']
['i m a university student', 'soy universitario']
['he s known for that', 'el es conocido por eso']
['he is younger than me by three years', 'tiene tres anos menos que yo']
['i m from australia', 'soy de australia']
['i m breaking up with my girlfriend tonight', 'esta noche terminare con mi novia']
['i m going to the other side of the world', 'me voy al otro lado del mundo']
['i m boiling water', 'estoy hirviendo agua']
['she is wearing a nice dress now', 'ahora lleva puesto un precioso vestido']
