# Encoder Decoder 

### Imports

In [1]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.utils import shuffle

### Importing Data

In [2]:
# Load the English <-> ASL gloss sentence pairs.
# Forward slashes work on Windows, Linux and macOS alike; the previous
# backslash-separated literal contained the invalid escape sequence "\D"
# and could only be resolved on Windows.
df = pd.read_csv('../Data/Double_Dataset_EnglishToGloss.csv')
df.head()

Unnamed: 0,English,ASL Gloss
0,My sister is having another baby,ANOTHER BABY MY SISTER BORN-WILL
1,I love to cook hamburgers on the XXXXX,XXXXX HAMBURGER COOK ME LOVE
2,I like to ice skate on our XXXXX,OUR XXXXX ICE SKATE ME LIKE
3,I like to be active and not sit and watch TV a...,ALL DAY ME LIKE ACTIVE NOT SIT WATCH TV
4,Are you prepared for hurricane XXXXX,HURRICANE XXXXX YOU READY PREPARE


In [3]:
# (number of rows, number of columns) of the loaded DataFrame
df.shape

(3716, 2)

### Preprocessing

Note:
- Replace the numbers/digits
- Check regarding Finger spellings
- Check if it is required to add start and end tokens to target sequences

In [4]:
# Digit -> word mapping. The surrounding spaces keep adjacent digits ("11")
# from being glued together ("one one"); surplus spaces are collapsed below.
replacements = {'1': " one ", '2':" two ", '3':" three ", '4':" four ", '5':" five ", '6':" six ", '7':" seven ", '8':" eight ", '9':" nine ", '0':" zero "}

# Built once instead of once per character.
PUNCTUATION = set(string.punctuation)


def clean_sentence(text):
    """Normalise one sentence for word-level tokenisation.

    Steps: spell out ASCII digits, lowercase, drop every character listed in
    ``string.punctuation`` and finally collapse any run of whitespace to a
    single space (which also trims both ends).

    Whitespace is collapsed *after* punctuation removal because deleting a
    stand-alone punctuation mark ("a - b") would otherwise leave a double
    space behind.
    """
    # [0-9] rather than \d: \d also matches non-ASCII digits, which have no
    # entry in `replacements` and would raise KeyError.
    text = re.sub(r'[0-9]', lambda m: replacements[m.group()], text)
    text = text.lower()
    text = ''.join(ch for ch in text if ch not in PUNCTUATION)
    return ' '.join(text.split())


df['English'] = df['English'].apply(clean_sentence)
df['ASL Gloss'] = df['ASL Gloss'].apply(clean_sentence)

# Add tokens to target sequence
# NOTE: this cell is meant to run once on freshly loaded data. Running it again
# would lowercase the markers, strip their underscores and wrap the sentence a
# second time, so re-load the CSV before re-running.
df['English'] = df['English'].apply(lambda x : 'START_ ' + x + ' _END')

In [5]:
# Preview the first ten rows after preprocessing
df.head(10)

Unnamed: 0,English,ASL Gloss
0,START_ my sister is having another baby _END,another baby my sister bornwill
1,START_ i love to cook hamburgers on the xxxxx ...,xxxxx hamburger cook me love
2,START_ i like to ice skate on our xxxxx _END,our xxxxx ice skate me like
3,START_ i like to be active and not sit and wat...,all day me like active not sit watch tv
4,START_ are you prepared for hurricane xxxxx _END,hurricane xxxxx you ready prepare
5,START_ the dinner party was awkward because mo...,dinner party awkward why most us not know each...
6,START_ my favorite author is me _END,my favorite author who me
7,START_ i love anything that has chocolate in i...,me love anything have chocolate inside
8,START_ do you like to watch baseball _END,baseball watch you like
9,START_ my daughter gave me beautiful flowers _END,beautiful flowers my daughter giveme


In [8]:
# English vocabulary: every distinct whitespace-separated token
# (this includes the START_ / _END markers added during preprocessing).
all_eng_words = {
    token
    for sentence in df['English']
    for token in sentence.split()
}

# ASL gloss vocabulary, collected the same way.
all_ASL_words = {
    token
    for gloss in df['ASL Gloss']
    for token in gloss.split()
}

In [9]:
# Max length (in tokens) of the *target* sequences, i.e. the English sentences
# including their START_ / _END markers.
# split() without an argument matches the tokenisation used for the vocabulary
# and the batch generator; split(' ') would count an empty token for every
# doubled space.
lenght_list = [len(sentence.split()) for sentence in df['English']]
max_length_tar = np.max(lenght_list)
print("Max length target: ", max_length_tar)

Max length target:  41


In [10]:
# Max length (in tokens) of the *source* sequences, i.e. the ASL gloss sentences.
# split() without an argument matches the tokenisation used for the vocabulary
# and the batch generator; split(' ') would count an empty token for every
# doubled space.
lenght_list = [len(gloss.split()) for gloss in df['ASL Gloss']]
max_length_src = np.max(lenght_list)
print("Max length sorce: ", max_length_src)

Max length sorce:  24


In [11]:
# Alphabetically ordered vocabularies; a word's position fixes its token id.
input_words = sorted(all_ASL_words)
target_words = sorted(all_eng_words)

# Vocabulary sizes, each with one extra slot because index 0 is kept free.
num_encoder_tokens = 1 + len(input_words)
num_decoder_tokens = 1 + len(target_words)

(num_encoder_tokens, num_decoder_tokens)

(1639, 1727)

In [12]:
# Decoder output size: the English vocabulary plus two extra slots (index 0 is
# never assigned to a word and is used for zero padding).
# Computed from the vocabulary instead of `+= 1`, so re-running this cell does
# not keep growing the value; on a fresh run the result is the same as before.
num_decoder_tokens = len(all_eng_words) + 2
num_decoder_tokens

1728

In [13]:
# word -> token id, numbered from 1 so that 0 stays free for padding
input_token_index = {word: idx for idx, word in enumerate(input_words, start=1)}
target_token_index = {word: idx for idx, word in enumerate(target_words, start=1)}

# token id -> word (inverse lookups, used to turn predictions back into text)
reverse_input_char_index = {idx: word for word, idx in input_token_index.items()}
reverse_target_char_index = {idx: word for word, idx in target_token_index.items()}

In [14]:
def write_list_to_file(var_list):
    """Write ``str(var_list)`` to ``myVars.txt`` in the current working directory.

    An existing file is overwritten. The ``with`` block guarantees the file is
    flushed and closed even if building or writing the string raises.

    Args:
        var_list: any object; its ``str()`` representation is what gets stored.
    """
    with open("myVars.txt", "w") as output_file:
        output_file.write(str(var_list))

# Sequence lengths, vocabulary sizes and lookup tables, dumped to disk as text.
var_list = [
    max_length_src,
    max_length_tar,
    num_encoder_tokens,
    num_decoder_tokens,
    input_token_index,
    target_token_index,
    reverse_target_char_index,
]
write_list_to_file(var_list)

In [15]:
# Peek at ten rows of a shuffled copy; the result is only displayed, `df` is not reassigned.
shuffle(df).head(10)

Unnamed: 0,English,ASL Gloss
2434,START_ do you like to walk _END,you like walk
1467,START_ he ran _END,he ran
3449,START_ i collect stamps _END,stamps collect me
2345,START_ wow this week i am busy _END,wow this week me busy
2179,START_ where were you born _END,you born where
1520,START_ the trees are beautiful in japan _END,japan trees beautiful
3231,START_ she drives _END,she drives
158,START_ my neighbor has eight children and four...,eight children four dog my neighbor has
2692,START_ my moms favorite sport is golf _END,golf my mom favorite sport
2830,START_ veterans day is wednesday november one ...,veterans day when wednesday november one one


Make a 90–10 train/test split, then write a Python generator function that loads the data in batches:

In [16]:
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model
import tensorflow as tf

In [17]:
# Train - Test Split
# Source (encoder input) is the ASL gloss, target (decoder output) is English.
X, y = df['ASL Gloss'], df['English']
# Hold out 10% for validation. A fixed random_state makes the shuffled split
# identical on every run, so results can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)
X_train.shape, X_test.shape

((3344,), (372,))

Save the train and test dataframes for reproducing the results later, as they are shuffled.

In [19]:
def generate_batch(X = None, y = None, batch_size = 128):
    ''' Endlessly yield training batches for the encoder-decoder model.

    Args:
        X: pandas Series of source sentences (space-separated tokens).
           Defaults to the module-level ``X_train``, looked up when the
           generator starts running rather than when the function is defined.
        y: pandas Series of target sentences wrapped in ``START_`` / ``_END``.
           Defaults to the module-level ``y_train``.
        batch_size: maximum number of sentence pairs per batch.

    Yields:
        ``([encoder_input_data, decoder_input_data], decoder_target_data)``:
        * encoder_input_data  -- (n, max_length_src) token ids, zero padded
        * decoder_input_data  -- (n, max_length_tar) token ids of the target
          without its final token, zero padded
        * decoder_target_data -- (n, max_length_tar, num_decoder_tokens)
          one-hot targets, shifted one step ahead (no ``START_``)
        ``n`` equals ``batch_size`` except for the last batch of each pass,
        which only holds the remaining rows.

    Raises:
        ValueError: if ``X`` is empty (the generator could never yield).
        KeyError: if a word is missing from the token dictionaries.
    '''
    # Resolve defaults lazily so a re-created split is picked up.
    if X is None:
        X = X_train
    if y is None:
        y = y_train
    if len(X) == 0:
        raise ValueError("generate_batch needs at least one sample")

    while True:
        for j in range(0, len(X), batch_size):
            batch_inputs = X.iloc[j:j+batch_size]
            batch_targets = y.iloc[j:j+batch_size]
            # Size the arrays to the rows actually present, so the final partial
            # batch is not filled up with all-zero phantom samples.
            n_rows = len(batch_inputs)
            encoder_input_data = np.zeros((n_rows, max_length_src),dtype='float32')
            decoder_input_data = np.zeros((n_rows, max_length_tar),dtype='float32')
            decoder_target_data = np.zeros((n_rows, max_length_tar, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(batch_inputs, batch_targets)):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word] # encoder input seq
                target_tokens = target_text.split()  # tokenise once per sentence
                last_position = len(target_tokens) - 1
                for t, word in enumerate(target_tokens):
                    token_id = target_token_index[word]
                    if t < last_position:
                        # decoder input: everything except the final token
                        decoder_input_data[i, t] = token_id
                    if t > 0:
                        # decoder target (one hot encoded): everything except the
                        # START_ token, offset by one timestep
                        decoder_target_data[i, t - 1, token_id] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

Encoder - Decoder Model Architecture

In [20]:
# Shared width of the embedding vectors and of the LSTM hidden/cell states
# (used by both the encoder and the decoder below).
latent_dim = 50

In [21]:
# Encoder
# Variable-length sequence of source token ids.
encoder_inputs = Input(shape=(None,))
# mask_zero=True makes index 0 a padding value that downstream layers skip.
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
# return_state=True additionally returns the final hidden (h) and cell (c) state.
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# We discard `encoder_outputs` and only keep the states;
# they become the decoder's initial state.
encoder_states = [state_h, state_c]

In [22]:
# Set up the decoder, using `encoder_states` as initial state.
# Variable-length sequence of target token ids (teacher forcing input).
decoder_inputs = Input(shape=(None,))
# The layer object is kept in its own variable so the inference model can reuse it.
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

# Use a softmax to generate a probability distribution over the target vocabulary for each time step
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [23]:
# Model Summary: prints each layer with its output shape, parameter count and inputs
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 50)     81950       input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 50)     86400       input_2[0][0]                    
______________________________________________________________________________________________

In [24]:
# Compile the model
# Targets are one-hot vectors and the model outputs a softmax distribution, so
# accuracy must compare the arg-max class of both: CategoricalAccuracy.
# (tf.keras.metrics.Accuracy checks element-wise equality of y_true and y_pred,
# which is essentially never true for floating-point probabilities.)
# name='accuracy' keeps the key in the training logs/history unchanged.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=[
        tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
    ],
)

In [25]:
from keras import callbacks
# Stop training once the validation loss has not improved for 5 consecutive
# epochs, and roll the model back to the weights of its best epoch.
earlystopping = callbacks.EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

In [26]:
# Sample counts, used to derive the number of generator steps per epoch
train_samples = len(X_train)
val_samples = len(X_test)
# Number of sentence pairs per generated batch
batch_size = 128
# Upper bound on epochs; early stopping may end training sooner
epochs = 110

In [27]:
# Train on the training split and validate on the held-out split.
# (The training generator previously iterated over X_test / y_test, i.e. the
# model was fitted on the very data it was validated against.)
# `batch_size` is not passed to fit(): the generators already emit batches.
model.fit(generate_batch(X_train, y_train, batch_size = batch_size),
            steps_per_epoch = train_samples//batch_size,
            epochs=epochs,
            validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
            validation_steps = val_samples//batch_size, callbacks=[earlystopping])

Epoch 1/110
Epoch 2/110
Epoch 3/110
Epoch 4/110
Epoch 5/110
Epoch 6/110
Epoch 7/110
Epoch 8/110
Epoch 9/110
Epoch 10/110
Epoch 11/110
Epoch 12/110
Epoch 13/110
Epoch 14/110
Epoch 15/110
Epoch 16/110
Epoch 17/110
Epoch 18/110
Epoch 19/110
Epoch 20/110
Epoch 21/110
Epoch 22/110
Epoch 23/110
Epoch 24/110
Epoch 25/110
Epoch 26/110
Epoch 27/110
Epoch 28/110
Epoch 29/110
Epoch 30/110
Epoch 31/110
Epoch 32/110
Epoch 33/110
Epoch 34/110
Epoch 35/110
Epoch 36/110
Epoch 37/110
Epoch 38/110
Epoch 39/110
Epoch 40/110
Epoch 41/110
Epoch 42/110
Epoch 43/110
Epoch 44/110
Epoch 45/110
Epoch 46/110
Epoch 47/110
Epoch 48/110
Epoch 49/110
Epoch 50/110
Epoch 51/110
Epoch 52/110
Epoch 53/110
Epoch 54/110
Epoch 55/110
Epoch 56/110
Epoch 57/110
Epoch 58/110
Epoch 59/110
Epoch 60/110
Epoch 61/110
Epoch 62/110
Epoch 63/110
Epoch 64/110
Epoch 65/110
Epoch 66/110
Epoch 67/110
Epoch 68/110
Epoch 69/110
Epoch 70/110
Epoch 71/110
Epoch 72/110
Epoch 73/110
Epoch 74/110
Epoch 75/110
Epoch 76/110
Epoch 77/110
Epoch 78

<keras.callbacks.History at 0x7f75281b1d10>

Always remember to save the weights

In [28]:
# Persist the trained weights alone, then the full model, under Weights/.
# assumes the 'Weights' directory already exists — TODO confirm
model.save_weights('Weights/nmt_weights_v5.h5')
model.save('Weights/model_v5.h5')

# NOTE(review): this writes a second full copy to the working directory under a
# "v4" name although the files above are tagged "v5" — confirm this is intended.
model.save('model_v4.h5')

Load the saved weights if you closed the application and need to restore the trained model

In [29]:
#model.load_weights('Weights_ASL/nmt_weights_v5.h5')

Inference Setup

In [30]:
# Encode the input sequence to get the "thought vectors"
# (reuses the trained encoder graph: token ids in, [state_h, state_c] out)
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Same embedding layer object as in training, so the learned weights are shared.
dec_emb2= dec_emb_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
# The updated states are returned as outputs so the caller can feed them back in.
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model
# inputs:  [token ids, previous h, previous c]
# outputs: [softmax over target vocabulary, new h, new c]
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

Finally, we generate the output sequence by invoking the above setup in a loop as follows

Decode sample sequences

In [31]:
def decode_sequence(input_seq):
    """Greedily translate one encoded source sentence into target words.

    The encoder turns ``input_seq`` into its final LSTM states; the decoder is
    then run one token at a time, starting from ``START_`` and always feeding
    back the most probable word, until ``_END`` is produced or
    ``max_length_tar`` words have been generated.

    The length limit counts *words*. The earlier character-based limit
    (``len(decoded_sentence) > 50``) cut off translations after roughly fifty
    characters, although target sentences can be much longer than that.

    Args:
        input_seq: batch of exactly one padded source sequence of token ids,
            as yielded by ``generate_batch(..., batch_size=1)``.

    Returns:
        The predicted words, each preceded by a single space, e.g.
        ``' i like tea _END'``. ``_END`` is included when it was predicted.

    Raises:
        KeyError: if the arg-max index has no entry in
            ``reverse_target_char_index``.
    """
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Target sequence of length 1, holding the start token.
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop (batch of size 1).
    decoded_words = []
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Greedy choice: most probable token at the last time step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_target_char_index[sampled_token_index]
        decoded_words.append(sampled_word)

        # Exit condition: stop token found or word budget used up.
        if sampled_word == '_END' or len(decoded_words) >= max_length_tar:
            break

        # Feed the sampled token and the new states into the next step.
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ''.join(' ' + word for word in decoded_words)

Evaluation on Train Dataset

In [None]:
# One-sample-at-a-time generator over the training split; `k` tracks the
# position of the sample most recently drawn from it.
train_gen = generate_batch(X_train, y_train, batch_size = 1)
k = -1

# Draw the next sample, translate it and compare with the reference.
k = k + 1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input ASL sentence:', X_train.iloc[k])
# [6:-4] drops the 'START_' prefix and the '_END' suffix of the reference
print('Actual English Translation:', y_train.iloc[k][6:-4])
# [:-4] drops the trailing '_END' of the prediction
print('Predicted English Translation:', decoded_sentence[:-4])

In [None]:
# Source sentence, reference translation and prediction for the next ten
# training samples, collected first and printed afterwards.
asl_sentence = []
true_eng_trans = []
pred_eng_trans = []

for _step in range(10):
    k = k + 1
    (input_seq, actual_output), _ = next(train_gen)
    decoded_sentence = decode_sequence(input_seq)
    asl_sentence.append(X_train.iloc[k])
    # strip the 'START_' / '_END' markers from the reference
    true_eng_trans.append(y_train.iloc[k][6:-4])
    # strip the trailing '_END' from the prediction
    pred_eng_trans.append(decoded_sentence[:-4])

for source, reference, prediction in zip(asl_sentence, true_eng_trans, pred_eng_trans):
    print('Input ASL sentence:', source)
    print('Actual English Translation:', reference)
    print('Predicted English Translation:', prediction)
    print()