In [24]:
import re
import numpy as np
import pandas as pd
from string import digits
from sklearn.model_selection import train_test_split

from keras import Model
from keras.layers import Dropout, Input, Embedding, LSTM, Dense

In [3]:
def _read_sentences(path, nrows=15000):
    """Load the first `nrows` non-blank lines of `path`, one sentence per row.

    The previous call passed ``sep='/n'`` (a typo for a newline) purely so
    that pandas never split a line into several fields, and relied on
    ``error_bad_lines``, which newer pandas releases no longer accept.
    Reading the lines directly produces the same one-column frame without
    depending on either.

    Returns a DataFrame whose single column is labelled ``0``, which is what
    ``header=None`` produced and what the rename cell expects.
    """
    sentences = []
    with open(path, encoding='utf-8') as handle:
        for line in handle:
            if not line.strip():
                # read_csv skips blank lines by default; keep that behaviour.
                continue
            sentences.append(line.rstrip('\r\n'))
            if len(sentences) == nrows:
                break
    # Quote characters are kept verbatim here; the cleaning cell strips all
    # punctuation afterwards, so they do not survive into the vocabulary.
    return pd.DataFrame({0: sentences})


en_dataset = _read_sentences('small_vocab_en.csv')
fr_dataset = _read_sentences('small_vocab_fr.csv')

  """Entry point for launching an IPython kernel.
  


In [4]:
# Give the single unnamed column (label 0) the name 'text' in both corpora.
en_dataset = en_dataset.rename(columns={0: 'text'})
fr_dataset = fr_dataset.rename(columns={0: 'text'})


In [5]:
# Translation table that deletes every ASCII digit.
remove_digits = str.maketrans('', '', digits)


def _clean_sentence(text):
    """Normalise one sentence and wrap it in <SOS>/<EOS> markers.

    Steps, in order: lowercase, delete every character that is neither a
    word character nor whitespace, delete digits, trim surrounding
    whitespace, then add the start/end markers.
    """
    text = text.lower()
    # Raw string: "\w" and "\s" are invalid escape sequences in a plain literal.
    text = re.sub(r"[^\w\s]", "", text)
    text = text.translate(remove_digits)
    return '<SOS> ' + text.strip() + ' <EOS>'


# NOTE: this cell is not idempotent. Running it a second time lowercases and
# strips the existing markers and then wraps the sentences again.
en_dataset['text'] = en_dataset['text'].apply(_clean_sentence)
fr_dataset['text'] = fr_dataset['text'].apply(_clean_sentence)

In [6]:
# English vocabulary: each distinct token receives the next unused integer id,
# in order of first appearance across the corpus.
word_to_index_en = {}
for line in en_dataset['text'].values:
    for token in line.split():
        # len() is evaluated before insertion, so a new token gets the current size.
        word_to_index_en.setdefault(token, len(word_to_index_en))

# Reverse lookup, id -> token.
index_to_word_en = {idx: token for token, idx in word_to_index_en.items()}
word_index = len(word_to_index_en)


In [7]:
# Longest English sentence measured in tokens, because the one-hot arrays are
# indexed by token position; counting characters made the time axis several
# times longer than needed. The extra slot is required because the encoding
# cell writes an end marker one position past the last token.
max_len_en = max(len(sentence.split()) for sentence in en_dataset['text']) + 1

In [8]:
# French vocabulary: each distinct token receives the next unused integer id,
# in order of first appearance across the corpus.
word_to_index_fr = {}
for line in fr_dataset['text'].values:
    for token in line.split():
        # len() is evaluated before insertion, so a new token gets the current size.
        word_to_index_fr.setdefault(token, len(word_to_index_fr))

# Reverse lookup, id -> token.
index_to_word_fr = {idx: token for token, idx in word_to_index_fr.items()}
word_index = len(word_to_index_fr)

In [9]:
# Longest French sentence measured in tokens, because the decoder arrays are
# indexed by token position; counting characters made the time axis several
# times longer than needed.
max_len_fr = max(len(sentence.split()) for sentence in fr_dataset['text'])

In [10]:
# Vocabulary sizes; they set the width of the one-hot vectors on each side.
num_encoder_tokens = len(word_to_index_en)
num_decoder_tokens = len(word_to_index_fr)

In [11]:
# Parallel arrays of source (English) and target (French) sentences.
X = en_dataset['text'].values
Y = fr_dataset['text'].values

# Hold out 10% for testing. A fixed seed keeps the split, and everything
# trained on it, the same across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.1, random_state=42
)

In [14]:

# One-hot tensors of shape (training sentences, timesteps, vocabulary size).
encoder_input = np.zeros((X_train.shape[0], max_len_en, num_encoder_tokens), dtype='float32')
decoder_input = np.zeros((X_train.shape[0], max_len_fr, num_decoder_tokens), dtype='float32')
decoder_target = np.zeros((X_train.shape[0], max_len_fr, num_decoder_tokens), dtype='float32')

en_eos = word_to_index_en['<EOS>']
# The decoder arrays are indexed by the FRENCH vocabulary. The original used
# the English '<EOS>' id here, which marks an unrelated French word.
fr_eos = word_to_index_fr['<EOS>']

for j, (input_sentence, target_sentence) in enumerate(zip(X_train, Y_train)):
    for pos, word in enumerate(input_sentence.split()):
        encoder_input[j, pos, word_to_index_en[word]] = 1.0
    # Kept from the original: one extra end marker right after the last token.
    encoder_input[j, pos + 1, en_eos] = 1.0

    for pos, word in enumerate(target_sentence.split()):
        decoder_input[j, pos, word_to_index_fr[word]] = 1.0

        if pos > 0:
            # Teacher forcing: the target is the decoder input shifted one step left.
            decoder_target[j, pos - 1, word_to_index_fr[word]] = 1.0

    # The last token of every cleaned sentence is '<EOS>', so decoder_input
    # already ends with it; only the remaining target steps need padding.
    decoder_target[j, pos:, fr_eos] = 1.0

In [25]:
# Encoder-decoder LSTM for translation, trained with teacher forcing.
# Size of the LSTM hidden and cell state on both sides.
latent_dim=64
# Encoder: consumes a variable-length sequence of one-hot source tokens.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

# Only the final hidden/cell states are passed on; encoder_outputs is not used below.
encoder_states = [state_h, state_c]

# Decoder: consumes a variable-length sequence of one-hot target tokens.
decoder_inputs = Input(shape=(None, num_decoder_tokens))

# return_sequences=True yields one output per timestep; the returned states
# are discarded in this training graph.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# Per-timestep softmax over the target vocabulary.
decoder_dense = Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: (source sequence, target-so-far) -> next-token distribution.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [26]:
# Compile with RMSprop and categorical cross-entropy.
# NOTE(review): the training cell below compiles the model again (adding an
# accuracy metric), so this call is redundant.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [None]:
# Configure training: RMSprop on categorical cross-entropy, reporting accuracy.
model.compile(
    loss='categorical_crossentropy',
    optimizer="rmsprop",
    metrics=['accuracy'],
)

# Fit on the one-hot tensors; the last 20% of the samples are held out for
# validation.
model.fit(
    x=[encoder_input, decoder_input],
    y=decoder_target,
    epochs=100,
    batch_size=64,
    validation_split=0.2,
)

# Persist the trained model under the name "s2s".
model.save("s2s")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100

In [None]:
# Pull one item from `datagen` and unpack it into two names.
# NOTE(review): `datagen` is not defined in any cell shown above. Presumably it
# is a batch generator from a removed or out-of-order cell, so this will fail
# on a fresh kernel - TODO confirm and define it before this cell.
y, x = next(datagen)

In [None]:
# Display the first element unpacked from the generator item.
y

[array([[  0.,  22.,   3., ...,   0.,   0.,   0.],
        [  0.,  40.,   3., ...,   0.,   0.,   0.],
        [  0.,  98.,   3., ...,   0.,   0.,   0.],
        ...,
        [  0.,  14.,  68., ...,   0.,   0.,   0.],
        [  0., 187., 115., ...,   0.,   0.,   0.],
        [  0.,  72.,   3., ...,   0.,   0.,   0.]], dtype=float32),
 array([[  0.,  75.,   3., ...,   0.,   0.,   0.],
        [  0.,  43.,   3., ...,   0.,   0.,   0.],
        [  0., 116., 154., ...,   0.,   0.,   0.],
        ...,
        [  0.,  34.,  72., ...,   0.,   0.,   0.],
        [  0., 285., 144., ...,   0.,   0.,   0.],
        [  0.,  79.,   3., ...,   0.,   0.,   0.]], dtype=float32)]