In [4]:
import collections
import helper
import numpy as np
import project_tests as tests
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Model
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from keras.layers import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [6]:
import os


def load_data(path):
    """
    Load dataset
    """
    input_file = os.path.join(path)
    with open(input_file, "r") as f:
        data = f.read()

    return data.split('\n')[0:10000]

In [7]:
#english_sentences = helper.load_data('./small_vocab_en')
english_sentences = load_data('./eng')
french_sentences = load_data('./fra')
spanish_sentences = load_data('./spa')
italian_sentences = load_data('./ita')
german_sentences = load_data('./deu')
print('Dataset Loaded')

Dataset Loaded


In [8]:
for sample_i in range(2):
    print('small_vocab_en Line {}:  {}'.format(sample_i + 1, english_sentences[sample_i]))
    print('small_vocab_fr Line {}:  {}'.format(sample_i + 1, french_sentences[sample_i]))
    print('small_vocab_fr Line {}:  {}'.format(sample_i + 1, german_sentences[sample_i]))
    print('small_vocab_fr Line {}:  {}'.format(sample_i + 1, italian_sentences[sample_i]))
    print('small_vocab_fr Line {}:  {}'.format(sample_i + 1, spanish_sentences[sample_i]))

small_vocab_en Line 1:  Hi.
small_vocab_fr Line 1:  Va !
small_vocab_fr Line 1:  Geh.
small_vocab_fr Line 1:  Ciao!
small_vocab_fr Line 1:  Ve.
small_vocab_en Line 2:  Hi.
small_vocab_fr Line 2:  Marche.
small_vocab_fr Line 2:  Hallo!
small_vocab_fr Line 2:  Ciao.
small_vocab_fr Line 2:  Vete.


In [9]:
english_words_counter = collections.Counter([word for sentence in english_sentences for word in sentence.split()])
french_words_counter = collections.Counter([word for sentence in french_sentences for word in sentence.split()])
italian_words_counter = collections.Counter([word for sentence in italian_sentences for word in sentence.split()])
german_words_counter = collections.Counter([word for sentence in german_sentences for word in sentence.split()])
spanish_words_counter = collections.Counter([word for sentence in spanish_sentences for word in sentence.split()])
print('{} English words.'.format(len([word for sentence in english_sentences for word in sentence.split()])))
print('{} unique English words.'.format(len(english_words_counter)))
print('10 Most common words in the English dataset:')
print('"' + '" "'.join(list(zip(*english_words_counter.most_common(10)))[0]) + '"')
print()
print('{} German words.'.format(len([word for sentence in german_sentences for word in sentence.split()])))
print('{} unique german words.'.format(len(german_words_counter)))
print('10 Most common words in the german dataset:')
print('"' + '" "'.join(list(zip(*german_words_counter.most_common(10)))[0]) + '"')
print()
print('{} Italian words.'.format(len([word for sentence in italian_sentences for word in sentence.split()])))
print('{} unique italian words.'.format(len(italian_words_counter)))
print('10 Most common words in the italian dataset:')
print('"' + '" "'.join(list(zip(*italian_words_counter.most_common(10)))[0]) + '"')
print()
print('{} spanish words.'.format(len([word for sentence in spanish_sentences for word in sentence.split()])))
print('{} unique spanish words.'.format(len(spanish_words_counter)))
print('10 Most common words in the spanish dataset:')
print('"' + '" "'.join(list(zip(*spanish_words_counter.most_common(10)))[0]) + '"')
print()
print('{} French words.'.format(len([word for sentence in french_sentences for word in sentence.split()])))
print('{} unique French words.'.format(len(french_words_counter)))
print('10 Most common words in the French dataset:')
print('"' + '" "'.join(list(zip(*french_words_counter.most_common(10)))[0]) + '"')


2009699 English words.
27437 unique English words.
10 Most common words in the English dataset:
"I" "Tom" "to" "you" "the" "a" "is" "in" "I'm" "was"

1649839 German words.
65840 unique german words.
10 Most common words in the german dataset:
"Tom" "Ich" "ist" "nicht" "zu" "Sie" "du" "das" "ich" "die"

1961807 Italian words.
52389 unique italian words.
10 Most common words in the italian dataset:
"Tom" "di" "è" "a" "non" "che" "Io" "Non" "un" "la"

837171 spanish words.
48390 unique spanish words.
10 Most common words in the spanish dataset:
"de" "que" "a" "Tom" "la" "en" "el" "no" "es" "un"

1391967 French words.
49587 unique French words.
10 Most common words in the French dataset:
"de" "Je" "?" "pas" "que" "à" "ne" "la" "le" "Tom"


In [10]:
def tokenize(x):
    x_tk = Tokenizer(char_level = False)
    x_tk.fit_on_texts(x)
    return x_tk.texts_to_sequences(x), x_tk
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .']
text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)
print()
for sample_i, (sent, token_sent) in enumerate(zip(text_sentences, text_tokenized)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(sent))
    print('  Output: {}'.format(token_sent))

{'the': 1, 'quick': 2, 'a': 3, 'brown': 4, 'fox': 5, 'jumps': 6, 'over': 7, 'lazy': 8, 'dog': 9, 'by': 10, 'jove': 11, 'my': 12, 'study': 13, 'of': 14, 'lexicography': 15, 'won': 16, 'prize': 17, 'this': 18, 'is': 19, 'short': 20, 'sentence': 21}

Sequence 1 in x
  Input:  The quick brown fox jumps over the lazy dog .
  Output: [1, 2, 4, 5, 6, 7, 1, 8, 9]
Sequence 2 in x
  Input:  By Jove , my quick study of lexicography won a prize .
  Output: [10, 11, 12, 2, 13, 14, 15, 16, 3, 17]
Sequence 3 in x
  Input:  This is a short sentence .
  Output: [18, 19, 3, 20, 21]


In [11]:
def pad(x, length=None):
    if length is None:
        length = max([len(sentence) for sentence in x])
    return pad_sequences(x, maxlen = length, padding = 'post')
tests.test_pad(pad)
# Pad Tokenized output
test_pad = pad(text_tokenized)
for sample_i, (token_sent, pad_sent) in enumerate(zip(text_tokenized, test_pad)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(np.array(token_sent)))
    print('  Output: {}'.format(pad_sent))

Sequence 1 in x
  Input:  [1 2 4 5 6 7 1 8 9]
  Output: [1 2 4 5 6 7 1 8 9 0]
Sequence 2 in x
  Input:  [10 11 12  2 13 14 15 16  3 17]
  Output: [10 11 12  2 13 14 15 16  3 17]
Sequence 3 in x
  Input:  [18 19  3 20 21]
  Output: [18 19  3 20 21  0  0  0  0  0]


In [12]:
def preprocess(x, y, z, a, b):
    preprocess_x, x_tk = tokenize(x)
    preprocess_z, z_tk = tokenize(z)
    preprocess_a, a_tk = tokenize(a)
    preprocess_b, b_tk = tokenize(b)
    preprocess_y, y_tk = tokenize(y)
    preprocess_x = pad(preprocess_x)
    preprocess_a = pad(preprocess_a)
    preprocess_b = pad(preprocess_b)
    preprocess_z = pad(preprocess_z)
    preprocess_y = pad(preprocess_y)
# Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)
    preprocess_a = preprocess_a.reshape(*preprocess_a.shape, 1)
    preprocess_b = preprocess_b.reshape(*preprocess_b.shape, 1)
    preprocess_z = preprocess_z.reshape(*preprocess_z.shape, 1)
    return preprocess_x, preprocess_y, preprocess_a, preprocess_b, preprocess_z, x_tk, y_tk, a_tk, b_tk, z_tk
preproc_english_sentences, preproc_french_sentences, preproc_spanish_sentences, preproc_german_sentences, preproc_italian_sentences, english_tokenizer, french_tokenizer,spanish_tokenizer,german_tokenizer,italian_tokenizer =\
    preprocess(english_sentences, french_sentences,spanish_sentences,german_sentences,italian_sentences)
    
max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]
max_spanish_sequence_length = preproc_spanish_sentences.shape[1]
max_german_sequence_length = preproc_german_sentences.shape[1]
max_italian_sequence_length = preproc_italian_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)
spanish_vocab_size = len(spanish_tokenizer.word_index)
german_vocab_size = len(german_tokenizer.word_index)
italian_vocab_size = len(italian_tokenizer.word_index)
print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max French sentence length:", max_french_sequence_length)
print("Max spanish sentence length:", max_spanish_sequence_length)
print("Max german sentence length:", max_german_sequence_length)
print("Max italian sentence length:", max_italian_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("French vocabulary size:", french_vocab_size)
print("spanish vocabulary size:", spanish_vocab_size)
print("german vocabulary size:", german_vocab_size)
print("italian vocabulary size:", italian_vocab_size)

Data Preprocessed
Max English sentence length: 101
Max French sentence length: 55
Max spanish sentence length: 77
Max german sentence length: 92
Max italian sentence length: 68
English vocabulary size: 14106
French vocabulary size: 34186
spanish vocabulary size: 38410
german vocabulary size: 28879
italian vocabulary size: 28967


In [13]:
def logits_to_text(logits, tokenizer):
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'
    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])
print('`logits_to_text` function loaded.')  

`logits_to_text` function loaded.


In [16]:
def simple_modelfr(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    learning_rate = 1e-3
    input_seq = Input(input_shape[1:])
    rnn = GRU(64, return_sequences = True)(input_seq)
    logits = TimeDistributed(Dense(french_vocab_size))(rnn)
    model = Model(input_seq, Activation('softmax')(logits))
    model.compile(loss = 'sparse_categorical_crossentropy', 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])
    return model

#tests.test_simple_model(simple_model)
tmp_x = pad(preproc_english_sentences[:len(preproc_french_sentences)], max_french_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))
#tmp_x = tmp_x.reshape((-1, preproc_english_sentences.shape[-2], 1))
# Train the neural network
print(len(preproc_french_sentences))
print(tmp_x.shape,preproc_french_sentences.shape)
simple_rnn_model_fr = simple_modelfr(
    tmp_x.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size)
simple_rnn_model_fr.fit(tmp_x, preproc_french_sentences, epochs=2, validation_split=0.2)
# Print prediction(s)
print(logits_to_text(simple_rnn_model_fr.predict(tmp_x[:1])[0], french_tokenizer))
simple_rnn_model_fr.save('simple_rnn_model_fr.h5')

208907
(208907, 55, 1) (208907, 55, 1)
Epoch 1/2
Epoch 2/2
je ne pas <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [17]:
def simple_modelde(input_shape, output_sequence_length, english_vocab_size, german_vocab_size):
    learning_rate = 1e-3
    input_seq = Input(input_shape[1:])
    rnn = GRU(64, return_sequences = True)(input_seq)
    logits = TimeDistributed(Dense(german_vocab_size))(rnn)
    model = Model(input_seq, Activation('softmax')(logits))
    model.compile(loss = 'sparse_categorical_crossentropy', 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])
    return model

#tests.test_simple_model(simple_model)
tmp_x = pad(preproc_english_sentences[:len(preproc_german_sentences)], max_german_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_german_sentences.shape[-2], 1))
#tmp_x = tmp_x.reshape((-1, preproc_english_sentences.shape[-2], 1))
# Train the neural network
print(len(preproc_german_sentences))
print(tmp_x.shape,preproc_german_sentences.shape)
simple_rnn_model_de = simple_modelde(
    tmp_x.shape,
    max_german_sequence_length,
    english_vocab_size,
    german_vocab_size)
simple_rnn_model_de.fit(tmp_x, preproc_german_sentences, epochs=2, validation_split=0.2)
# Print prediction(s)
print(logits_to_text(simple_rnn_model_fr.predict(tmp_x[:1])[0], german_tokenizer))
simple_rnn_model_de.save('simple_rnn_model_de.h5')

364201
(364201, 92, 1) (364201, 92, 1)
Epoch 1/2
Epoch 2/2
tom la è <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [18]:
print(logits_to_text(simple_rnn_model_de.predict(tmp_x[:1])[0], german_tokenizer))

tom non <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [19]:
def simple_modelit(input_shape, output_sequence_length, english_vocab_size, italian_vocab_size):
    learning_rate = 1e-3
    input_seq = Input(input_shape[1:])
    rnn = GRU(64, return_sequences = True)(input_seq)
    logits = TimeDistributed(Dense(italian_vocab_size))(rnn)
    model = Model(input_seq, Activation('softmax')(logits))
    model.compile(loss = 'sparse_categorical_crossentropy', 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])
    return model

#tests.test_simple_model(simple_model)
tmp_x = pad(preproc_english_sentences[:len(preproc_italian_sentences)], max_italian_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_italian_sentences.shape[-2], 1))
#tmp_x = tmp_x.reshape((-1, preproc_english_sentences.shape[-2], 1))
# Train the neural network
print(len(preproc_italian_sentences))
print(tmp_x.shape,preproc_italian_sentences.shape)
simple_rnn_model_it = simple_modelit(
    tmp_x.shape,
    max_italian_sequence_length,
    english_vocab_size,
    italian_vocab_size)
simple_rnn_model_it.fit(tmp_x, preproc_italian_sentences, epochs=2, validation_split=0.2)
# Print prediction(s)
print(logits_to_text(simple_rnn_model_it.predict(tmp_x[:1])[0], italian_tokenizer))
simple_rnn_model_it.save('simple_rnn_model_it.h5')

139706
(139706, 68, 1) (139706, 68, 1)
Epoch 1/2
Epoch 2/2
tom es <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [20]:
def simple_modelsp(input_shape, output_sequence_length, english_vocab_size, spanish_vocab_size):
    learning_rate = 1e-3
    input_seq = Input(input_shape[1:])
    rnn = GRU(64, return_sequences = True)(input_seq)
    logits = TimeDistributed(Dense(spanish_vocab_size))(rnn)
    model = Model(input_seq, Activation('softmax')(logits))
    model.compile(loss = 'sparse_categorical_crossentropy', 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])
    return model

#tests.test_simple_model(simple_model)
tmp_x = pad(preproc_english_sentences[:len(preproc_spanish_sentences)], max_spanish_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_spanish_sentences.shape[-2], 1))
#tmp_x = tmp_x.reshape((-1, preproc_english_sentences.shape[-2], 1))
# Train the neural network
print(len(preproc_spanish_sentences))
print(tmp_x.shape,preproc_spanish_sentences.shape)
simple_rnn_model_sp = simple_modelsp(
    tmp_x.shape,
    max_spanish_sequence_length,
    english_vocab_size,
    spanish_vocab_size)
simple_rnn_model_sp.fit(tmp_x, preproc_spanish_sentences, epochs=2, validation_split=0.2)
# Print prediction(s)
print(logits_to_text(simple_rnn_model_sp.predict(tmp_x[:1])[0], spanish_tokenizer))
simple_rnn_model_sp.save('simple_rnn_model_sp.h5')

260435
(260435, 77, 1) (260435, 77, 1)
Epoch 1/2
Epoch 2/2




ich ist nicht <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [22]:
from keras.models import Sequential
def embed_modelfr(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    learning_rate = 1e-3
    rnn = GRU(64, return_sequences=True, activation="tanh")
    
    embedding = Embedding(french_vocab_size, 64, input_length=input_shape[1]) 
    logits = TimeDistributed(Dense(french_vocab_size, activation="softmax"))
    
    model = Sequential()
    #em can only be used in first layer --> Keras Documentation
    model.add(embedding)
    model.add(rnn)
    model.add(logits)
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    
    return model
#tests.test_embed_model(embed_model)
# tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
# tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2]))
tmp_x = pad(preproc_english_sentences[:len(preproc_french_sentences)], max_french_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))
embeded_model_fr = embed_modelfr(
    tmp_x.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size)
embeded_model_fr.fit(tmp_x, preproc_french_sentences, epochs=2, validation_split=0.2)
print(logits_to_text(embeded_model_fr.predict(tmp_x[:1])[0], french_tokenizer))
embeded_model_fr.save('embeded_model_fr.h5')

Epoch 1/2
Epoch 2/2




je ne pas <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [23]:
from keras.models import Sequential
def embed_modelde(input_shape, output_sequence_length, english_vocab_size, german_vocab_size):
    learning_rate = 1e-3
    rnn = GRU(64, return_sequences=True, activation="tanh")
    
    embedding = Embedding(german_vocab_size, 64, input_length=input_shape[1]) 
    logits = TimeDistributed(Dense(german_vocab_size, activation="softmax"))
    
    model = Sequential()
    #em can only be used in first layer --> Keras Documentation
    model.add(embedding)
    model.add(rnn)
    model.add(logits)
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    
    return model
#tests.test_embed_model(embed_model)
# tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
# tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2]))
tmp_x = pad(preproc_english_sentences[:len(preproc_german_sentences)], max_german_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_german_sentences.shape[-2], 1))
embeded_model_de = embed_modelde(
    tmp_x.shape,
    max_german_sequence_length,
    english_vocab_size,
    german_vocab_size)
embeded_model_de.fit(tmp_x, preproc_german_sentences, epochs=2, validation_split=0.2)
print(logits_to_text(embeded_model_de.predict(tmp_x[:1])[0], german_tokenizer))
embeded_model_de.save('embeded_model_de.h5')

Epoch 1/2
Epoch 2/2
tom è <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [25]:
from keras.models import Sequential
def embed_modelsp(input_shape, output_sequence_length, english_vocab_size, spanish_vocab_size):
    learning_rate = 1e-3
    rnn = GRU(64, return_sequences=True, activation="tanh")
    
    embedding = Embedding(spanish_vocab_size, 64, input_length=input_shape[1]) 
    logits = TimeDistributed(Dense(spanish_vocab_size, activation="softmax"))
    
    model = Sequential()
    #em can only be used in first layer --> Keras Documentation
    model.add(embedding)
    model.add(rnn)
    model.add(logits)
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    
    return model
#tests.test_embed_model(embed_model)
# tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
# tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2]))
tmp_x = pad(preproc_english_sentences[:len(preproc_spanish_sentences)], max_spanish_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_spanish_sentences.shape[-2], 1))
embeded_model_sp = embed_modelsp(
    tmp_x.shape,
    max_spanish_sequence_length,
    english_vocab_size,
    spanish_vocab_size)
embeded_model_sp.fit(tmp_x, preproc_spanish_sentences, epochs=2, validation_split=0.2)
print(logits_to_text(embeded_model_sp.predict(tmp_x[:1])[0], spanish_tokenizer))
embeded_model_sp.save('embeded_model_sp.h5')

Epoch 1/2
Epoch 2/2
ich ist nicht <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [26]:
from keras.models import Sequential
def embed_modelit(input_shape, output_sequence_length, english_vocab_size, italian_vocab_size):
    learning_rate = 1e-3
    rnn = GRU(64, return_sequences=True, activation="tanh")
    
    embedding = Embedding(italian_vocab_size, 64, input_length=input_shape[1]) 
    logits = TimeDistributed(Dense(italian_vocab_size, activation="softmax"))
    
    model = Sequential()
    #em can only be used in first layer --> Keras Documentation
    model.add(embedding)
    model.add(rnn)
    model.add(logits)
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    
    return model
#tests.test_embed_model(embed_model)
# tmp_x = pad(preproc_english_sentences, max_french_sequence_length)
# tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2]))
tmp_x = pad(preproc_english_sentences[:len(preproc_italian_sentences)], max_italian_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_italian_sentences.shape[-2], 1))
embeded_model_it = embed_modelit(
    tmp_x.shape,
    max_italian_sequence_length,
    english_vocab_size,
    italian_vocab_size)
embeded_model_it.fit(tmp_x, preproc_italian_sentences, epochs=2, validation_split=0.2)
print(logits_to_text(embeded_model_it.predict(tmp_x[:1])[0], italian_tokenizer))
embeded_model_it.save('embeded_model_it.h5')

Epoch 1/2
Epoch 2/2
tom es <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [None]:
def bd_modelfr(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
   
    learning_rate = 1e-3
    model = Sequential()
    model.add(Bidirectional(GRU(128, return_sequences = True, dropout = 0.1), 
                           input_shape = input_shape[1:]))
    model.add(TimeDistributed(Dense(french_vocab_size, activation = 'softmax')))
    model.compile(loss = sparse_categorical_crossentropy, 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])
    return model
#tests.test_bd_model(bd_model)
#tmp_x = pad(preproc_english_sentences, preproc_french_sentences.shape[1])
# tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))
tmp_x = pad(preproc_english_sentences[:len(preproc_french_sentences)],  preproc_french_sentences.shape[1])
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))
bidi_model_fr = bd_modelfr(
    tmp_x.shape,
    preproc_french_sentences.shape[1],
    len(english_tokenizer.word_index)+1,
    len(french_tokenizer.word_index)+1)
bidi_model_fr.fit(tmp_x, preproc_french_sentences,epochs=2, validation_split=0.2)
# Print prediction(s)
print(logits_to_text(bidi_model_fr.predict(tmp_x[:1])[0], french_tokenizer))
bidi_model_fr.save('bidi_model_fr.h5')

Epoch 1/2

In [None]:
def bd_modelit(input_shape, output_sequence_length, english_vocab_size, italian_vocab_size):
   
    learning_rate = 1e-3
    model = Sequential()
    model.add(Bidirectional(GRU(128, return_sequences = True, dropout = 0.1), 
                           input_shape = input_shape[1:]))
    model.add(TimeDistributed(Dense(italian_vocab_size, activation = 'softmax')))
    model.compile(loss = sparse_categorical_crossentropy, 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])
    return model
#tests.test_bd_model(bd_model)
#tmp_x = pad(preproc_english_sentences, preproc_french_sentences.shape[1])
# tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))
tmp_x = pad(preproc_english_sentences[:len(preproc_italian_sentences)],  preproc_italian_sentences.shape[1])
tmp_x = tmp_x.reshape((-1, preproc_italian_sentences.shape[-2], 1))
bidi_model_it = bd_modelit(
    tmp_x.shape,
    preproc_italian_sentences.shape[1],
    len(english_tokenizer.word_index)+1,
    len(italian_tokenizer.word_index)+1)
bidi_model_it.fit(tmp_x, preproc_italian_sentences,epochs=2, validation_split=0.2)
# Print prediction(s)
print(logits_to_text(bidi_model_it.predict(tmp_x[:1])[0], italian_tokenizer))
bidi_model_it.save('bidi_model_it.h5')

In [None]:
def bd_modelde(input_shape, output_sequence_length, english_vocab_size, german_vocab_size):
   
    learning_rate = 1e-3
    model = Sequential()
    model.add(Bidirectional(GRU(128, return_sequences = True, dropout = 0.1), 
                           input_shape = input_shape[1:]))
    model.add(TimeDistributed(Dense(german_vocab_size, activation = 'softmax')))
    model.compile(loss = sparse_categorical_crossentropy, 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])
    return model
#tests.test_bd_model(bd_model)
#tmp_x = pad(preproc_english_sentences, preproc_french_sentences.shape[1])
# tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))
tmp_x = pad(preproc_english_sentences[:len(preproc_german_sentences)],  preproc_german_sentences.shape[1])
tmp_x = tmp_x.reshape((-1, preproc_german_sentences.shape[-2], 1))
bidi_model_de = bd_modelde(
    tmp_x.shape,
    preproc_german_sentences.shape[1],
    len(english_tokenizer.word_index)+1,
    len(german_tokenizer.word_index)+1)
bidi_model_de.fit(tmp_x, preproc_german_sentences,epochs=2, validation_split=0.2)
# Print prediction(s)
print(logits_to_text(bidi_model_de.predict(tmp_x[:1])[0], german_tokenizer))
bidi_model_de.save('bidi_model_de.h5')

In [None]:
def bd_modelsp(input_shape, output_sequence_length, english_vocab_size, spanish_vocab_size):
   
    learning_rate = 1e-3
    model = Sequential()
    model.add(Bidirectional(GRU(128, return_sequences = True, dropout = 0.1), 
                           input_shape = input_shape[1:]))
    model.add(TimeDistributed(Dense(spanish_vocab_size, activation = 'softmax')))
    model.compile(loss = sparse_categorical_crossentropy, 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])
    return model
#tests.test_bd_model(bd_model)
#tmp_x = pad(preproc_english_sentences, preproc_french_sentences.shape[1])
# tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))
tmp_x = pad(preproc_english_sentences[:len(preproc_spanish_sentences)],  preproc_spanish_sentences.shape[1])
tmp_x = tmp_x.reshape((-1, preproc_spanish_sentences.shape[-2], 1))
bidi_model_sp = bd_modelsp(
    tmp_x.shape,
    preproc_spanish_sentences.shape[1],
    len(english_tokenizer.word_index)+1,
    len(spanish_tokenizer.word_index)+1)
bidi_model_sp.fit(tmp_x, preproc_spanish_sentences,epochs=2, validation_split=0.2)
# Print prediction(s)
print(logits_to_text(bidi_model_sp.predict(tmp_x[:1])[0], spanish_tokenizer))
bidi_model_sp.save('bidi_model_sp.h5')

In [None]:
def encdec_modelfr(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
  
    learning_rate = 1e-3
    model = Sequential()
    model.add(GRU(128, input_shape = input_shape[1:], return_sequences = False))
    model.add(RepeatVector(output_sequence_length))
    model.add(GRU(128, return_sequences = True))
    model.add(TimeDistributed(Dense(french_vocab_size, activation = 'softmax')))
    
    model.compile(loss = sparse_categorical_crossentropy, 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])
    return model
#tests.test_encdec_model(encdec_model)
tmp_x = pad(preproc_english_sentences[:len(preproc_french_sentences)])
tmp_x = tmp_x.reshape((-1, preproc_english_sentences.shape[1], 1))
encodeco_model_fr = encdec_modelfr(
    tmp_x.shape,
    preproc_french_sentences.shape[1],
    len(english_tokenizer.word_index)+1,
    len(french_tokenizer.word_index)+1)
encodeco_model_fr.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=20, validation_split=0.2)
print(logits_to_text(encodeco_model_fr.predict(tmp_x[:1])[0], french_tokenizer))
encodeco_model_fr.save('encodeco_model_fr.h5')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
new jersey est jamais agréable en mois mais il est est en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [None]:
def encdec_modelsp(input_shape, output_sequence_length, english_vocab_size, spanish_vocab_size):
  
    learning_rate = 1e-3
    model = Sequential()
    model.add(GRU(128, input_shape = input_shape[1:], return_sequences = False))
    model.add(RepeatVector(output_sequence_length))
    model.add(GRU(128, return_sequences = True))
    model.add(TimeDistributed(Dense(spanish_vocab_size, activation = 'softmax')))
    
    model.compile(loss = sparse_categorical_crossentropy, 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])
    return model
#tests.test_encdec_model(encdec_model)
tmp_x = pad(preproc_english_sentences[:len(preproc_spanish_sentences)])
tmp_x = tmp_x.reshape((-1, preproc_english_sentences.shape[1], 1))
encodeco_model_sp = encdec_modelsp(
    tmp_x.shape,
    preproc_spanish_sentences.shape[1],
    len(english_tokenizer.word_index)+1,
    len(spanish_tokenizer.word_index)+1)
encodeco_model_sp.fit(tmp_x, preproc_spanish_sentences, batch_size=1024, epochs=20, validation_split=0.2)
print(logits_to_text(encodeco_model_sp.predict(tmp_x[:1])[0], spanish_tokenizer))
encodeco_model_sp.save('encodeco_model_sp.h5')

In [None]:
def encdec_modelde(input_shape, output_sequence_length, english_vocab_size, german_vocab_size):
  
    learning_rate = 1e-3
    model = Sequential()
    model.add(GRU(128, input_shape = input_shape[1:], return_sequences = False))
    model.add(RepeatVector(output_sequence_length))
    model.add(GRU(128, return_sequences = True))
    model.add(TimeDistributed(Dense(german_vocab_size, activation = 'softmax')))
    
    model.compile(loss = sparse_categorical_crossentropy, 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])
    return model
#tests.test_encdec_model(encdec_model)
tmp_x = pad(preproc_english_sentences[:len(preproc_german_sentences)])
tmp_x = tmp_x.reshape((-1, preproc_english_sentences.shape[1], 1))
encodeco_model_de = encdec_modelde(
    tmp_x.shape,
    preproc_german_sentences.shape[1],
    len(english_tokenizer.word_index)+1,
    len(german_tokenizer.word_index)+1)
encodeco_model_de.fit(tmp_x, preproc_german_sentences, batch_size=1024, epochs=20, validation_split=0.2)
print(logits_to_text(encodeco_model_de.predict(tmp_x[:1])[0], german_tokenizer))
encodeco_model_de.save('encodeco_model_de.h5')

In [None]:
def encdec_modelit(input_shape, output_sequence_length, english_vocab_size, italian_vocab_size):
  
    learning_rate = 1e-3
    model = Sequential()
    model.add(GRU(128, input_shape = input_shape[1:], return_sequences = False))
    model.add(RepeatVector(output_sequence_length))
    model.add(GRU(128, return_sequences = True))
    model.add(TimeDistributed(Dense(italian_vocab_size, activation = 'softmax')))
    
    model.compile(loss = sparse_categorical_crossentropy, 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])
    return model
#tests.test_encdec_model(encdec_model)
tmp_x = pad(preproc_english_sentences[:len(preproc_italian_sentences)])
tmp_x = tmp_x.reshape((-1, preproc_english_sentences.shape[1], 1))
encodeco_model_it = encdec_modelit(
    tmp_x.shape,
    preproc_italian_sentences.shape[1],
    len(english_tokenizer.word_index)+1,
    len(italian_tokenizer.word_index)+1)
encodeco_model_it.fit(tmp_x, preproc_italian_sentences, batch_size=1024, epochs=20, validation_split=0.2)
print(logits_to_text(encodeco_model_it.predict(tmp_x[:1])[0], italian_tokenizer))
encodeco_model_it.save('encodeco_model_it.h5')

In [None]:
def model_finalfr(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
  
    model = Sequential()
    model.add(Embedding(input_dim=english_vocab_size,output_dim=128,input_length=input_shape[1]))
    model.add(Bidirectional(GRU(256,return_sequences=False)))
    model.add(RepeatVector(output_sequence_length))
    model.add(Bidirectional(GRU(256,return_sequences=True)))
    model.add(TimeDistributed(Dense(french_vocab_size,activation='softmax')))
    learning_rate = 0.005
    
    model.compile(loss = sparse_categorical_crossentropy, 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])
    
    return model
#tests.test_model_final(model_final)
print('Final Model Loaded')

Final Model Loaded


In [None]:
def model_finalde(input_shape, output_sequence_length, english_vocab_size, german_vocab_size):
  
    model = Sequential()
    model.add(Embedding(input_dim=english_vocab_size,output_dim=128,input_length=input_shape[1]))
    model.add(Bidirectional(GRU(256,return_sequences=False)))
    model.add(RepeatVector(output_sequence_length))
    model.add(Bidirectional(GRU(256,return_sequences=True)))
    model.add(TimeDistributed(Dense(german_vocab_size,activation='softmax')))
    learning_rate = 0.005
    
    model.compile(loss = sparse_categorical_crossentropy, 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])
    
    return model
#tests.test_model_final(model_final)
print('Final Model Loaded')

In [None]:
def model_finalsp(input_shape, output_sequence_length, english_vocab_size, spanish_vocab_size):
  
    model = Sequential()
    model.add(Embedding(input_dim=english_vocab_size,output_dim=128,input_length=input_shape[1]))
    model.add(Bidirectional(GRU(256,return_sequences=False)))
    model.add(RepeatVector(output_sequence_length))
    model.add(Bidirectional(GRU(256,return_sequences=True)))
    model.add(TimeDistributed(Dense(spanish_vocab_size,activation='softmax')))
    learning_rate = 0.005
    
    model.compile(loss = sparse_categorical_crossentropy, 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])
    
    return model
#tests.test_model_final(model_final)
print('Final Model Loaded')

In [None]:
def model_finalit(input_shape, output_sequence_length, english_vocab_size, italian_vocab_size):
  
    model = Sequential()
    model.add(Embedding(input_dim=english_vocab_size,output_dim=128,input_length=input_shape[1]))
    model.add(Bidirectional(GRU(256,return_sequences=False)))
    model.add(RepeatVector(output_sequence_length))
    model.add(Bidirectional(GRU(256,return_sequences=True)))
    model.add(TimeDistributed(Dense(italian_vocab_size,activation='softmax')))
    learning_rate = 0.005
    
    model.compile(loss = sparse_categorical_crossentropy, 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])
    
    return model
#tests.test_model_final(model_final)
print('Final Model Loaded')

In [None]:
def final_predictions(x, y, a, b, z, x_tk, y_tk, a_tk, b_tk, z_tk):
    tmp_X = pad(preproc_english_sentences[:len(preproc_french_sentences)])
    modelfr = model_finalfr(tmp_X.shape,
                        preproc_french_sentences.shape[1],
                        len(english_tokenizer.word_index)+1,
                        len(french_tokenizer.word_index)+1)
    modelde = model_finalde(tmp_X.shape,
                        preproc_german_sentences.shape[1],
                        len(english_tokenizer.word_index)+1,
                        len(german_tokenizer.word_index)+1)
    modelit = model_finalit(tmp_X.shape,
                        preproc_italian_sentences.shape[1],
                        len(english_tokenizer.word_index)+1,
                        len(italian_tokenizer.word_index)+1)
    modelsp = model_finalsp(tmp_X.shape,
                        preproc_spanish_sentences.shape[1],
                        len(english_tokenizer.word_index)+1,
                        len(spanish_tokenizer.word_index)+1)
    
    modelfr.fit(tmp_X, preproc_french_sentences, epochs = 2, validation_split = 0.2)
    modelde.fit(tmp_X, preproc_german_sentences, epochs = 2, validation_split = 0.2)
    modelit.fit(tmp_X, preproc_italian_sentences, epochs = 2, validation_split = 0.2)
    modelsp.fit(tmp_X, preproc_spanish_sentences, epochs = 2, validation_split = 0.2)
    modelfr.save('modelfr.h5')
    modelit.save('modelit.h5')
    modelde.save('modelde.h5')
    modelsp.save('modelsp.h5')
 
    y_id_to_word = {value: key for key, value in y_tk.word_index.items()}
    y_id_to_word[0] = '<PAD>'
    x_id_to_word = {value: key for key, value in x_tk.word_index.items()}
    x_id_to_word[0] = '<PAD>'
    sentence = 'he saw a old yellow truck'
    sentence = [x_tk.word_index[word] for word in sentence.split()]
    sentence = pad_sequences([sentence], maxlen=x.shape[-1], padding='post')
    sentences = np.array([sentence[0], x[0]])
    predictionsfr = modelfr.predict(sentences, len(sentences))
    predictionsit = modelit.predict(sentences, len(sentences))
    predictionssp = modelsp.predict(sentences, len(sentences))
    predictionsde = modelde.predict(sentences, len(sentences))
    print('Sample 1:')
    print(' '.join([y_id_to_word[np.argmax(x)] for x in predictionsfr[0]]))
    print(' '.join([y_id_to_word[np.argmax(x)] for x in predictionssp[0]]))
    print(' '.join([y_id_to_word[np.argmax(x)] for x in predictionsde[0]]))
    print(' '.join([y_id_to_word[np.argmax(x)] for x in predictionsit[0]]))
    print('Il a vu un vieux camion jaune')
    # print('Sample 2:')
    # print(' '.join([y_id_to_word[np.argmax(x)] for x in predictions[1]]))
    # print(' '.join([x_id_to_word[np.max(x)] for x in y[0]]))
    #print(predictions)

final_predictions(preproc_english_sentences, preproc_french_sentences, preproc_spanish_sentences, preproc_german_sentences, preproc_italian_sentences, english_tokenizer, french_tokenizer, spanish_tokenizer, german_tokenizer, italian_tokenizer)

Epoch 1/2
Epoch 2/2
Sample 1:
il a vu un camion camion camion <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Il a vu un vieux camion jaune
Sample 2:
new jersey est parfois calme pendant cours et il est il est est en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
june fall is sometimes quiet winter favorite france but it is yellow in freezing <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
