link: https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('fra.txt', delimiter = '\t', names=['english', 'french', 'attribute'])

In [3]:
df.head()

Unnamed: 0,english,french,attribute
0,Go.,Va !,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Go.,Marche.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
2,Go.,Bouge !,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
3,Hi.,Salut !,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
4,Hi.,Salut.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...


In [4]:
df.drop(df.columns[-1], axis = 1, inplace = True)

In [5]:
df.head()

Unnamed: 0,english,french
0,Go.,Va !
1,Go.,Marche.
2,Go.,Bouge !
3,Hi.,Salut !
4,Hi.,Salut.


In [6]:
input_text = df['english'].iloc[:10000].values

In [7]:
df['french'] = df['french'].apply(lambda x : '\t'+x+'\n')

In [8]:
output_text = df['french'].iloc[:10000].values

In [9]:
len(input_text)

10000

In [10]:
input_characters = set()
output_characters= set()

for text in input_text:
    for char in text:
        input_characters.add(char)

for text in output_text:
    for char in text:
        output_characters.add(char)

In [11]:
len(input_characters), len(output_characters)

(71, 93)

In [12]:
num_encoder_length = len(input_characters)
num_decoder_length = len(output_characters)

max_encoder_seq_length = max([len(text) for text in input_text])
max_decoder_seq_length = max([len(text) for text in output_text])

In [13]:
print(num_encoder_length, num_decoder_length, max_encoder_seq_length, max_decoder_seq_length)

71 93 15 59


In [14]:
## Tokenizing the characters
input_token_index = dict([(char, i) for i, char in enumerate(sorted(input_characters))])
output_token_index = dict([(char, i) for i , char in enumerate(sorted(output_characters))])

In [15]:
# input_token_index, output_token_index

In [16]:
## encoder input data to be passed in the encoder
encoder_input_data = np.zeros((len(input_text), max_encoder_seq_length, num_encoder_length), dtype='float32')

In [17]:
# Decoder input data that is passed in decoder as context vector
decoder_input_data = np.zeros((len(input_text), max_decoder_seq_length, num_decoder_length), dtype
                              ='float32')

In [18]:
## decoder output data
decoder_output_data = np.zeros((len(output_text), max_decoder_seq_length, num_decoder_length), dtype ='float32')

In [19]:
## Creating vectors of data

In [20]:
for i, (input_data, output_data) in enumerate(zip(input_text, output_text)):
    ## For encoder input data
    for t, char in enumerate(input_data):
        encoder_input_data[i, t, input_token_index[char]] = 1
    encoder_input_data[i, t+1:, input_token_index[' ']] = 1
    
    ## For decoder input data and decoder output_data
    for i, char in enumerate(output_data):
        decoder_input_data[i, t, output_token_index[char]] = 1
        # decoder output is ahead of decoder input by  one timestamp
        if t > 0:
            # decoder target will be ahead by one timestamp
            # and will not include the satrt character
            decoder_output_data[i, t-1, output_token_index[char]] = 1
    decoder_input_data[i, t+1:, output_token_index[' ']] = 1
    decoder_output_data[i, t:, output_token_index[' ']] = 1  
    

In [21]:
encoder_input_data.shape, decoder_input_data.shape, decoder_output_data.shape

((10000, 15, 71), (10000, 59, 93), (10000, 59, 93))

In [22]:
## Now creating the encoder and decoder sequence
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense


batch_size = 64 
epochs = 100
latent_dim = 256 # Letent dimensionality of the encoding space
num_samples = len(input_text)

In [23]:
# Building the encoder
encoder_input = Input(shape=(None, num_encoder_length))
encoder = LSTM(latent_dim, return_state=True)
encoder_output , state_h,state_c = encoder(encoder_input)
encoder_state = [state_h, state_c]

In [24]:
## Building the decoder
decoder_input = Input(shape=(None, num_decoder_length))
# setup our decoder to give full output sequence
# and to return internal state as well. we don't use the return state
# in the training model but we will use them interface

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state = True)
decoder_output, _, _ = decoder_lstm(decoder_input, initial_state=encoder_state)
decoder_dense = Dense(num_decoder_length, activation='softmax')
decoder_output = decoder_dense(decoder_output)

In [None]:
model = Model([encoder_input, decoder_input], decoder_output)

model.compile(optimizer='rmsprop', loss = 'categorical_crossentropy', metrics=['accuracy'])

model.fit([encoder_input_data, decoder_input_data], decoder_output_data, 
        batch_size=batch_size,
        epochs=epochs, 
        validation_split=0.2
        )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
 18/125 [===>..........................] - ETA: 2:08 - loss: 0.1486 - accuracy: 0.0048