In [11]:
from io import open
import unicodedata
import string
import re
import random

class Language:
    def __init__(self, name):
        '''
        Vocabulary container for one language.

        Keeps three lookups: word -> index, word -> occurrence count and
        index -> word. Indices 0 and 1 are pre-assigned to the "SOS" and
        "EOS" entries of index2word, so the first added word gets index 2.
        '''
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        # Next free index; starts after the two reserved entries.
        self.n_words = 2

    def addSentence(self, sentence):
        '''Register every token of `sentence`, splitting on single spaces.'''
        tokens = sentence.split(' ')
        for token in tokens:
            self.addWord(token)

    def addWord(self, word):
        '''Count `word`; on first sight also give it the next free index.'''
        if word in self.word2index:
            self.word2count[word] += 1
            return
        index = self.n_words
        self.word2index[word] = index
        self.word2count[word] = 1
        self.index2word[index] = word
        self.n_words = index + 1


In [12]:


def unicodeToAscii(s):
    '''
    Strip accents from `s`: decompose it with NFD normalization and drop
    every combining mark (Unicode category 'Mn'). All other characters,
    including non-ASCII ones without a decomposition, are kept as-is.
    '''
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def process_sentence(s):
    '''
    Normalize a raw sentence before tokenization.

    Lowercases and trims `s`, strips accents via unicodeToAscii, inserts a
    space before each '.', '!' or '?', and replaces every run of characters
    other than letters and those three punctuation marks with one space.

    Returns the normalized string without leading or trailing whitespace.
    '''
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    # The substitutions can leave a space at either end (input that starts
    # with punctuation, a quote or a digit, for example); strip it so that
    # splitting the result on ' ' never yields empty tokens.
    return s.strip()

In [13]:
def readLangs(lang1, lang2, reverse=False, dataset='data/cat.txt'):
    '''
    Read a tab-separated sentence file and create the two vocabularies.

    Args:
        lang1: name given to the Language for the first field of each line.
        lang2: name given to the Language for the second field of each line.
        reverse: if True, reverse every pair and swap which language is
            the input and which is the output.
        dataset: path to a UTF-8 text file with one tab-separated entry
            per line. Defaults to the file this notebook was written for.

    Returns:
        (input_lang, output_lang, pairs). The Language objects are newly
        created (no sentences are added here); `pairs` is a list with one
        list of processed sentences per line of the file.
    '''
    # Context manager so the file handle is closed even if reading fails.
    with open(dataset, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    pairs = [[process_sentence(s) for s in l.split('\t')] for l in lines]
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Language(lang2)
        output_lang = Language(lang1)
    else:
        input_lang = Language(lang1)
        output_lang = Language(lang2)
    return input_lang, output_lang, pairs

In [14]:
# Load the English -> Catalan sentence pairs; readLangs reads them from data/cat.txt.
input_lang, output_lang, pairs = readLangs(lang1='eng', lang2='cat', reverse=False)

In [16]:
# Inspect the processed sentence pairs returned by readLangs.
pairs

[['wow !', 'carai !'],
 ['wait .', 'espera t .'],
 ['i left .', 'm en vaig anar .'],
 ['really ?', 'de veritat ?'],
 ['thanks !', 'gracies !'],
 ['thanks .', 'gracies !'],
 ['get out !', 'surtiu .'],
 ['get out !', 'fora !'],
 ['get out .', 'surtiu .'],
 ['goodbye !', 'adeu !'],
 ['i agree .', 'hi estic d acord .'],
 ['hurry up .', 'afanya t .'],
 ['too late .', 'massa tard .'],
 ['i ll walk .', 'm anire caminant .'],
 ['thank you .', 'gracies !'],
 ['can i help ?', 'puc ajudar ?'],
 ['i envy him .', 'l envejo .'],
 ['i love you .', 't estimo .'],
 ['that s tom .', 'aixi es en tom .'],
 ['time flies .', 'el temps vola .'],
 ['we saw you .', 'vos vam vore .'],
 ['i like both .', 'm agraden ambdos .'],
 ['i like cats .', 'm agraden els gats'],
 ['i will walk .', 'm anire caminant .'],
 ['i ll try it .', 'la provare .'],
 ['i m too .', 'jo tambe tinc anys .'],
 ['i m a vegan .', 'jo soc vega'],
 ['i m at home .', 'estic a casa .'],
 ['i m stuffed .', 'estic molt tip .'],
 ['make a wish .'

In [22]:
# Run the training script, passing --epoch=20000 and --lr=0.0005.
# NOTE(review): the recorded output reports an eng/spa dataset, not the
# eng/cat pairs loaded earlier in this notebook — confirm which file train.py reads.
!python train.py --epoch=20000 --lr=0.0005

141543 translation pairs found in dataset.
Reduced dataset to 141437 translation pairs.
No of words in each language:
eng 13751
spa 26844
Starting Training Loop...
1m 38s (- 4m 54s) (5000 25%) 5.3793
3m 24s (- 3m 24s) (10000 50%) 5.1086
5m 10s (- 1m 43s) (15000 75%) 4.9944
7m 0s (- 0m 0s) (20000 100%) 5.0008


In [23]:
# Ask translate.py to translate a single sentence.
!python translate.py --translate_sentence='I am hungry!'

141543 translation pairs found in dataset.
Reduced dataset to 141437 translation pairs.
No of words in each language:
eng 13751
spa 26844
I am hungry!
input = i am hungry !
output =  que que de la ? <EOS>


In [24]:
# Ask translate.py to translate a second sample sentence.
!python translate.py --translate_sentence='This is my sister!'

141543 translation pairs found in dataset.
Reduced dataset to 141437 translation pairs.
No of words in each language:
eng 13751
spa 26844
This is my sister!
input = this is my sister !
output =  que que que ? ? <EOS>


In [21]:
# NOTE(review): this cell's execution count (In [21]) is lower than the cells
# before it, and its recorded output reports eng/cat with 1375 pairs while the
# neighbouring cells report eng/spa — re-run in order for a consistent record.
!python translate.py --translate_sentence='Really?'

1375 translation pairs found in dataset.
Reduced dataset to 1375 translation pairs.
No of words in each language:
eng 1469
cat 1822
Really?
input = really ?
output = que que ? ? <EOS>


In [26]:
# Run the validation script, passing --n_iters=1000.
# NOTE(review): the recorded output shows validate.py could not be opened
# (no such file) — add the script or fix the path before re-running.

!python validate.py --n_iters=1000




python: can't open file 'validate.py': [Errno 2] No such file or directory
