# Encoder Decoder 

### Imports

In [20]:
import pandas as pd
import numpy as np
import re
import string

### Importing Data

In [3]:
# Load the English / ASL-gloss sentence pairs from the local CSV file
# and preview the first rows.
df = pd.read_csv('ASL_English.csv')
df.head()

Unnamed: 0,English,ASL Gloss
0,Do you want a ride to the mall?,M-A-L-L RIDE WANT YOU Q
1,Yes I want to start buying Christmas gifts soon,YES SOON CHRISTMAS GIFTS START BUYING WANT ME
2,Please sit in this chair,THIS CHAIR PLEASE SIT
3,I like to fly small planes,SMALL PLANES FLY LIKE ME
4,He will go later,HE GO WILL


In [10]:
# Most frequent character length of an ASL gloss sentence, i.e. the mode of
# the per-row string lengths (this is the mode only, not an average).
# The recorded output of this cell is 18 characters.
df['ASL Gloss'].str.len().mode()

0    18
Name: ASL Gloss, dtype: int64

### Preprocessing

Note:
- Replace the numbers/digits
- Check how fingerspelled words (e.g. `M-A-L-L`) should be handled
- Check if it is required to add start and end tokens to target sequences

In [None]:
# Normalise both text columns in one pass per column: trim leading/trailing
# whitespace first, then fold every character to lowercase.
df['English'] = df['English'].apply(lambda sentence: sentence.strip().lower())
df['ASL Gloss'] = df['ASL Gloss'].apply(lambda gloss: gloss.strip().lower())

In [None]:
# Remove quotes (might not be needed)
# df['English'] = df['English'].apply(lambda x: re.sub (r"'", '', x))
# df['ASL Gloss'] = df['ASL Gloss'].apply(lambda x: re.sub (r"'", '', x))

In [None]:
# Remove every punctuation character (anything listed in string.punctuation).
# The surviving characters are re-joined with '' -- joining them with ' ' would
# put a space between every character ("h e l l o") and break the later
# whitespace-based word split.
# NOTE: '-' and "'" are punctuation too, so a hyphenated fingerspelling such as
# "m-a-l-l" collapses into the single word "mall".
df['English'] = df['English'].apply(lambda x: ''.join(ch for ch in x if ch not in string.punctuation))
df['ASL Gloss'] = df['ASL Gloss'].apply(lambda x: ''.join(ch for ch in x if ch not in string.punctuation))

**TODO: revisit this replacement later.** Remove all numbers/digits from both columns (below, each digit is replaced by its English word).

In [18]:
# Check whether the dataset still contains numbers: each line prints True
# when at least one row of that column matches the digit regex \d.
print(df['English'].str.contains(r'\d').any())
print(df['ASL Gloss'].str.contains(r'\d').any())

True
True


In [31]:
# Spell out every digit as its English word, one digit at a time
# (e.g. "1-7-1" becomes "one-seven-one"). Adjacent digits are replaced
# independently with no separator, so "42" becomes "fourtwo".
replacements = {'1': "one", '2':"two", '3':"three", '4':"four", '5':"five", '6':"six", '7':"seven", '8':"eight", '9':"nine", '0':"zero"}
# The pattern is a raw string: '\d' inside a normal string literal is an
# invalid escape sequence that Python warns about. No capture group is needed
# because m.group() already returns the whole match.
df['English'] = df['English'].apply(lambda x: re.sub(r'\d', lambda m: replacements[m.group()], x))
df['ASL Gloss'] = df['ASL Gloss'].apply(lambda x: re.sub(r'\d', lambda m: replacements[m.group()], x))

'one-seven-one A-N-Y-W-H-E-R-E L-A-N-E nine-eight-seven-six-five'

In [34]:
# Add tokens to target sequence: wrap every English sentence in the
# 'START_' / '_END' marker words (separated from the text by one space).
# NOTE(review): this cell is not idempotent -- re-running it wraps the
# already-wrapped sentences a second time.
df['English'] = df['English'].apply(lambda x : 'START_ ' + x + ' _END')

0            START_ Do you want a ride to the mall? _END
1      START_ Yes I want to start buying Christmas gi...
2                   START_ Please sit in this chair _END
3                 START_ I like to fly small planes _END
4                           START_ He will go later _END
                             ...                        
272                    START_ Sit in the wheelchair _END
273                        START_ You have a sprain _END
274          START_ You need to get to the hospital _END
275            START_ Im dizzy and my stomach hurts _END
276                        START_ Well go to the ER _END
Name: English, Length: 277, dtype: object

In [35]:
# Build the vocabulary of each language: the set of distinct
# whitespace-separated tokens found in the corresponding column.
all_eng_words = set()
for eng in df['English']:
    all_eng_words.update(eng.split())

all_ASL_words = set()
for asl in df['ASL Gloss']:
    all_ASL_words.update(asl.split())

In [39]:
# Report the vocabulary size (number of distinct whitespace-separated tokens)
# collected for each language.
print("English Words:", len(all_eng_words))
print("ASL Words:", len(all_ASL_words))

English Words: 575
ASL Words: 478


In [46]:
# Alphabetically sorted vocabularies: ASL gloss words are the model input,
# English words are the model target. The token counts are taken from the
# sorted lists, which hold exactly the elements of the vocabulary sets.
input_words = sorted(all_ASL_words)
target_words = sorted(all_eng_words)
num_encoder_tokens, num_decoder_tokens = len(input_words), len(target_words)
num_encoder_tokens, num_decoder_tokens

(478, 575)

In [52]:
# Words are numbered from 1 because index 0 is reserved for padding (both
# Embedding layers are created with mask_zero=True). Each vocabulary size must
# therefore be (number of words + 1) so that the highest word index still fits
# inside the embedding table -- this applies to the encoder vocabulary as well
# as the decoder one. Deriving the sizes from the word lists (instead of an
# in-place `+= 1`) keeps the cell idempotent when it is re-run.
num_encoder_tokens = len(input_words) + 1
num_decoder_tokens = len(target_words) + 1
print(num_decoder_tokens)

# word -> integer index (1-based)
input_token_index = {word: i for i, word in enumerate(input_words, start=1)}
target_token_index = {word: i for i, word in enumerate(target_words, start=1)}

# integer index -> word (inverse of the tables above)
reverse_input_char_index = {i: word for word, i in input_token_index.items()}
reverse_target_char_index = {i: word for word, i in target_token_index.items()}

576


### Split the data into train and test

In [53]:
from sklearn.model_selection import train_test_split

# ASL gloss sentences are the model input (x); English sentences are the target (y).
x, y = df['ASL Gloss'], df['English']
# Hold out 10% of the sentence pairs for testing. A fixed random_state makes
# the shuffle -- and therefore the split -- reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)
x_train.shape, x_test.shape

((249,), (28,))

### Encoder-Decoder Architecture

In [47]:
from keras.layers import Dropout
from keras.layers import Input, LSTM, Embedding, Dense

# Size shared by the word embeddings and the LSTM state vectors below.
latent_dim = 64

# Encoder
# Variable-length input (shape=(None,)); presumably sequences of integer word
# indices from input_token_index -- TODO confirm against the data generator.
encoder_inputs = Input(shape=(None, ) )
# mask_zero=True makes the embedding treat index 0 as padding to be masked.
# NOTE(review): the first Embedding argument must be larger than the highest
# word index fed in -- confirm num_encoder_tokens accounts for the reserved 0.
enc_emb = Embedding(num_encoder_tokens, latent_dim, mask_zero = True) (encoder_inputs)
# return_state=True: the LSTM returns its final hidden and cell state
# in addition to its output.
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# Discard 'encoder outputs' and only keep the states; they are handed to the
# decoder as its initial state.
encoder_states = [state_h, state_c]

In [49]:
from keras.models import Model

# Set up the decoder, using 'encoder states' as initial state.
decoder_inputs = Input(shape= (None, ) )
# The embedding layer is kept in its own variable so that it can be reused
# when the inference decoder is built.
dec_emb_layer = Embedding (num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer (decoder_inputs)

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM (latent_dim, return_sequences = True, return_state = True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,initial_state = encoder_states)
# Softmax over the target vocabulary for every decoder time step.
decoder_dense = Dense (num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# 'encoder input data' & 'decoder input data' into 'decoder target data'
model = Model([encoder_inputs, decoder_inputs] , decoder_outputs)

In [50]:
# Compile the training model. The loss identifier has to be spelled with an
# underscore ('categorical_crossentropy'); written with a space it is not a
# loss name Keras can resolve.
# NOTE(review): categorical cross-entropy expects one-hot encoded targets --
# confirm the decoder target data is one-hot (integer word ids would need
# 'sparse_categorical_crossentropy' instead).
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [51]:
# Inference-time models. They reuse the layers of the training model
# (dec_emb_layer, decoder_lstm, decoder_dense) rather than creating new ones.

# Encode the input sequence to get the "thought vectors" (the encoder states).
encoder_model = Model (encoder_inputs, encoder_states)

# Decoder setup
# The tensors below hold the hidden and cell state of the previous time step.
decoder_state_input_h = Input (shape= (latent_dim, ))
decoder_state_input_c = Input (shape= (latent_dim, ))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= dec_emb_layer (decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step.
decoder_outputs2, state_h2, state_c2 = decoder_lstm (dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense (decoder_outputs2) # A dense softmax layer to generate a probability distribution over the target vocabulary

# Final decoder model: takes the decoder input plus the previous states and
# returns the word probabilities plus the updated states.
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs2] + decoder_states2)

In [None]:
def decode_sequence(input_seq, max_decoded_chars=50):
    """Greedily decode one encoded input sequence into a sentence.

    The input is run through ``encoder_model`` to obtain the initial decoder
    states. ``decoder_model`` is then called one word at a time, always
    feeding back the most probable word, until a stop condition is met.

    Args:
        input_seq: Batch of size 1 passed unchanged to
            ``encoder_model.predict``.
        max_decoded_chars: Decoding stops once the decoded string is longer
            than this many characters (default 50).

    Returns:
        The decoded words joined into one string. Every word is preceded by
        a single space, and the ``'_END'`` marker is included when it was
        produced.

    Raises:
        KeyError: If ``'START_'`` is missing from ``target_token_index``.
    """
    # Encode the input as state vectors. list() guarantees the states can be
    # concatenated with [target_seq] below.
    states_value = list(encoder_model.predict(input_seq))

    # Generate an empty target sequence of length 1 and populate its first
    # position with the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop (to simplify, a batch of size 1 is assumed).
    decoded_sentence = ''
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample the most probable token of the last time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = reverse_target_char_index.get(sampled_token_index)
        if sampled_word is None:
            # No word is mapped to this index (e.g. the reserved padding
            # index 0): stop instead of failing with a KeyError.
            break
        decoded_sentence += ' ' + sampled_word

        # Exit condition: either find the stop token or hit the max length.
        if sampled_word == '_END' or len(decoded_sentence) > max_decoded_chars:
            break

        # Update the target sequence (of length 1) with the sampled token.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states for the next time step.
        states_value = [h, c]
    return decoded_sentence