In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras

# Report the runtime environment so results can be tied to a TF/Python version.
print(tf.__version__)
print(sys.version_info)
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
# Enable on-demand GPU memory allocation for every visible GPU.
# Iterating (instead of indexing [0]) keeps this cell from raising IndexError
# on a CPU-only machine, where the list is empty.
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

2.2.0-rc4
sys.version_info(major=3, minor=7, micro=3, releaselevel='final', serial=0)
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
# 1. preprocessing data
# 2. build model
# 2.1 encoder
# 2.2 attention
# 2.3 decoder
# 3. evaluation
# 3.1 given sentence, return translated results
# 3.2 visualize results (attention)

In [4]:
# Path to the tab-separated English/Spanish sentence-pair file read by parse_data.
en_spa_file_path = './data_spa_en/spa.txt'

import unicodedata
def unicode_to_ascii(s):
    """Remove combining accent marks from ``s``.

    The string is decomposed with NFD normalisation, then every character
    whose Unicode category is ``Mn`` (non-spacing mark) is dropped. Characters
    that are not combining marks (for example ``¿``) are kept unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)

# Sample sentence pair used to sanity-check the text helpers.
en_sentence = 'Why me?'
spa_sentence = '¿Por qué yo?'
print(unicode_to_ascii(en_sentence), unicode_to_ascii(spa_sentence), sep='\n')


Why me?
¿Por que yo?


In [9]:
import re
def preprocess_sentence(s):
    """Normalise a raw sentence and wrap it in ``<start>`` / ``<end>`` markers.

    Steps, in order:
      1. lowercase, trim, and strip accents via ``unicode_to_ascii``;
      2. surround each of ``? . ! , ¿`` with spaces so they become tokens;
      3. collapse runs of spaces / double quotes into a single space;
      4. replace every character outside ``a-z``, ``A-Z`` and that
         punctuation set with a space;
      5. trim surrounding whitespace.
    """
    normalized = unicode_to_ascii(s.lower().strip())
    spaced = re.sub(r"([?.!,¿])", r" \1 ", normalized)
    collapsed = re.sub(r'[" "]+', " ", spaced)
    cleaned = re.sub(r'[^a-zA-Z?.!,¿]', " ", collapsed).strip()
    return '<start> ' + cleaned + ' <end>'

# Show the preprocessed form of the two sample sentences.
print(
    preprocess_sentence(en_sentence),
    preprocess_sentence(spa_sentence),
    sep='\n',
)

<start> why me ? <end>
<start> ¿ por que yo ? <end>


In [24]:
def parse_data(filename):
    """Load and preprocess the sentence pairs stored in ``filename``.

    The file is read as UTF-8 with one pair per line; the first two
    tab-separated fields of each line are taken as the English and the
    Spanish sentence. Any further fields on a line are ignored.

    The first preprocessed pair is printed as a sanity check.

    Args:
        filename: path of the tab-separated sentence-pair file.

    Returns:
        A ``zip`` iterator yielding two tuples: all preprocessed English
        sentences, then all preprocessed Spanish sentences. Callers are
        expected to unpack it (``en, sp = parse_data(path)``).
    """
    # Context manager guarantees the file handle is closed after reading.
    with open(filename, encoding='UTF-8') as data_file:
        lines = data_file.read().strip().split('\n')
    # Keep only the first two fields so that a line carrying extra columns
    # does not break the (en, sp) unpacking below.
    sentence_pairs = [line.split('\t')[:2] for line in lines]
    preprocessed_sentence_pairs = [
        (preprocess_sentence(en), preprocess_sentence(sp)) for en, sp in sentence_pairs
    ]
    print(preprocessed_sentence_pairs[0])
    return zip(*preprocessed_sentence_pairs)

# Unpack the zip into one tuple of English and one of Spanish sentences,
# then show the last pair of the corpus.
en_dataset, sp_dataset = parse_data(en_spa_file_path)
print(en_dataset[-1], sp_dataset[-1], sep='\n')

('<start> go . <end>', '<start> ve . <end>')
118964
<start> if you want to sound like a native speaker , you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo . <end>
<start> si quieres sonar como un hablante nativo , debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un musico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado . <end>


In [30]:
def tokenizer(lang):
    """Fit a Keras ``Tokenizer`` on ``lang`` and encode it as padded id sequences.

    The tokenizer is created with no vocabulary limit (``num_words=None``),
    no character filtering (``filters=''``) and splits on single spaces.
    Sequences are padded at the end (``padding='post'``).

    Args:
        lang: iterable of preprocessed sentences.

    Returns:
        ``(tensor, lang_tokenizer)``: the padded sequence array and the
        fitted tokenizer.
    """
    fitted_tokenizer = keras.preprocessing.text.Tokenizer(
        num_words=None, filters='', split=' ')
    fitted_tokenizer.fit_on_texts(lang)
    sequences = fitted_tokenizer.texts_to_sequences(lang)
    padded = keras.preprocessing.sequence.pad_sequences(sequences, padding='post')
    return padded, fitted_tokenizer

# Only the first `num_examples` sentence pairs are tokenized.
# Spanish is the model input, English the model output.
num_examples = 30000
input_tensor, input_tokenizer = tokenizer(sp_dataset[:num_examples])
output_tensor, output_tokenizer = tokenizer(en_dataset[:num_examples])


def max_length(tensor):
    """Return the length of the longest sequence in ``tensor``.

    Args:
        tensor: iterable of sequences (anything supporting ``len``).

    Returns:
        The maximum sequence length, or 0 when ``tensor`` is empty.
    """
    # The original used min(), which only gave the right answer because the
    # sequences passed in were already padded to a common length.
    return max((len(t) for t in tensor), default=0)

# Compute the padded sequence lengths, then show the first encoded input
# followed by the input and output lengths.
max_length_input = max_length(input_tensor)
max_length_output = max_length(output_tensor)
print(input_tensor[0], max_length_input, max_length_output, sep='\n')


[  1 135   3   2   0   0   0   0   0   0   0   0   0   0   0   0]
16
11


In [15]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the pairs for evaluation. A fixed random_state makes the
# split (and everything derived from it) reproducible across kernel restarts.
input_train, input_eval, output_train, output_eval = train_test_split(
    input_tensor, output_tensor, test_size=0.2, random_state=42)

len(input_train), len(input_eval), len(output_train), len(output_eval)

(24000, 6000, 24000, 6000)

In [29]:
def convert(example, tokenizer):
    """Print each non-zero token id in ``example`` next to its word.

    Ids equal to 0 are skipped. Words are looked up in
    ``tokenizer.index_word``.
    """
    for token_id in example:
        if token_id == 0:
            continue
        print('%d --> %s' % (token_id, tokenizer.index_word[token_id]))

# Decode the first training pair back to words to verify the tokenizers.
first_input_example = input_train[0]
first_output_example = output_train[0]
convert(first_input_example, input_tokenizer)
print()  # blank line between the input and output listings
convert(first_output_example, output_tokenizer)

1 --> <start>
37 --> tengo
11 --> que
7099 --> pintarlo
3 --> .
2 --> <end>

1 --> <start>
4 --> i
29 --> have
15 --> to
1343 --> paint
10 --> it
3 --> .
2 --> <end>


In [31]:
def make_dataset(input_tensor, output_tensor, batch_size, epochs, shuffle,
                 shuffle_buffer_size=None):
    """Build a batched ``tf.data.Dataset`` of (input, output) pairs.

    Args:
        input_tensor: encoded input sequences, sliced along the first axis.
        output_tensor: encoded output sequences, aligned with ``input_tensor``.
        batch_size: number of pairs per batch; incomplete final batches are
            dropped (``drop_remainder=True``).
        epochs: number of times the data is repeated.
        shuffle: whether to shuffle the pairs before repeating and batching.
        shuffle_buffer_size: size of the shuffle buffer. Defaults to the
            number of examples so the shuffle covers the whole dataset
            regardless of its size (the previous hard-coded 30000 only did
            so for datasets of at most 30000 examples).

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    dataset = tf.data.Dataset.from_tensor_slices((input_tensor, output_tensor))
    if shuffle:
        if shuffle_buffer_size is None:
            # shuffle() requires a positive buffer size, hence the floor of 1.
            shuffle_buffer_size = max(len(input_tensor), 1)
        dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.repeat(epochs).batch(batch_size, drop_remainder=True)
    return dataset

# Training configuration.
batch_size = 64
epochs = 20

# Training data is shuffled and repeated for every epoch; evaluation data is
# passed through once, in order.
train_dataset = make_dataset(
    input_train, output_train, batch_size=batch_size, epochs=epochs, shuffle=True)
eval_dataset = make_dataset(
    input_eval, output_eval, batch_size=batch_size, epochs=1, shuffle=False)





In [32]:
# Peek at a single training batch: shapes first, then the raw id tensors.
for x, y in train_dataset.take(1):
    print(x.shape, y.shape, x, y, sep='\n')
    

(64, 16)
(64, 11)
tf.Tensor(
[[   1  451  229 ...    0    0    0]
 [   1   21 3802 ...    0    0    0]
 [   1    7   21 ...    0    0    0]
 ...
 [   1   22   33 ...    0    0    0]
 [   1   12  132 ...    0    0    0]
 [   1    8   35 ...    0    0    0]], shape=(64, 16), dtype=int32)
tf.Tensor(
[[   1    4  290    4   92  326    3    2    0    0    0]
 [   1    9 2522   49   56    3    2    0    0    0    0]
 [   1   10    8    9 1714    3    2    0    0    0    0]
 [   1   25    6   36  898    7    2    0    0    0    0]
 [   1   20   11   70   79  828    3    2    0    0    0]
 [   1    6   88  374    3    2    0    0    0    0    0]
 [   1   24   28   39    7    2    0    0    0    0    0]
 [   1    4   75   40  715  284    3    2    0    0    0]
 [   1   91  170  144    5    3    2    0    0    0    0]
 [   1   82   25   12   14   53    7    2    0    0    0]
 [   1    4   18  209  261   21  134    3    2    0    0]
 [   1    4   30   12  127 1578    3    2    0    0    0]
 [   1

In [33]:
# Model hyper-parameters.
embedding_units = 256
units = 1024

# Vocabulary sizes are one larger than the tokenizer's word count because
# id 0 is used as the padding value in the encoded sequences.
input_vocab_size = 1 + len(input_tokenizer.word_index)
output_vocab_size = 1 + len(output_tokenizer.word_index)
print(input_vocab_size)

9414


In [22]:
class Encoder(keras.Model):
    """Encoder made of an embedding layer followed by a single GRU layer.

    Args:
        vocab_size: size of the input vocabulary for the embedding layer.
        embedding_units: dimensionality of the token embeddings.
        encoding_units: number of GRU units (size of the hidden state).
        batch_size: batch size used when creating the initial hidden state.
    """

    def __init__(self, vocab_size, embedding_units, encoding_units, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.encoding_units = encoding_units
        self.embedding = keras.layers.Embedding(vocab_size, embedding_units)
        # return_sequences yields the output at every time step;
        # return_state additionally yields the final hidden state.
        self.gru = keras.layers.GRU(
            self.encoding_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """Embed ``x`` and run the GRU starting from ``hidden``.

        Prints the shape of ``x`` before and after embedding, then returns
        ``(output, state)`` as produced by the GRU layer.
        """
        print(x.shape)
        embedded = self.embedding(x)
        print(embedded.shape)
        output, state = self.gru(embedded, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        """Return an all-zero hidden state of shape (batch_size, encoding_units)."""
        state_shape = (self.batch_size, self.encoding_units)
        return tf.zeros(state_shape)

# Smoke-test the encoder on one training batch.
encoder = Encoder(input_vocab_size, embedding_units, units, batch_size)
sample_hidden = encoder.initialize_hidden_state()
# Fetch the batch explicitly rather than relying on `x` left over from a loop
# in another cell, so this cell does not depend on that cell having been run.
sample_input = next(iter(train_dataset))[0]
sample_output, sample_hidden = encoder(sample_input, sample_hidden)
print("sample_output.shape: ", sample_output.shape)
print("sample_hidden.shape: ", sample_hidden.shape)

(64, 16)
(64, 16, 256)
sample_output.shape:  (64, 16, 1024)
sample_hidden.shpae:  (64, 1024)
