# Encoder Decoder 

### Imports

In [1]:
import pandas as pd
import numpy as np
import re
import string

### Importing Data

In [2]:
# Load the English / ASL-gloss sentence pairs and preview the first rows.
DATA_PATH = 'ASL_English.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,English,ASL Gloss
0,Do you want a ride to the mall?,M-A-L-L RIDE WANT YOU Q
1,Yes I want to start buying Christmas gifts soon,YES SOON CHRISTMAS GIFTS START BUYING WANT ME
2,Please sit in this chair,THIS CHAIR PLEASE SIT
3,I like to fly small planes,SMALL PLANES FLY LIKE ME
4,He will go later,HE GO WILL


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(1670, 2)

In [4]:
# Character-length statistics for the ASL gloss column.
asl_gloss_char_len = df['ASL Gloss'].str.len()

# Most common length(s). A notebook only echoes a cell's last expression, so
# the mode has to be printed explicitly; mode() may return several values on
# a tie, hence the list. (Author's note from an earlier run: 18 characters.)
print('ASL Gloss mode length(s):', asl_gloss_char_len.mode().tolist())

# Longest gloss, in characters (echoed as the cell's value).
asl_gloss_char_len.max()

116

In [5]:
# Character-length statistics for the English column.
english_char_len = df['English'].str.len()

# Most common length(s). A notebook only echoes a cell's last expression, so
# the mode has to be printed explicitly; mode() may return several values on
# a tie, hence the list. (Author's note from an earlier run: 22 characters.)
print('English mode length(s):', english_char_len.mode().tolist())

# Longest sentence, in characters (echoed as the cell's value).
english_char_len.max()

120

### Preprocessing

Note:
- Replace the numbers/digits
- Check how fingerspelled words (e.g. `M-A-L-L`) should be handled
- Check if it is required to add start and end tokens to target sequences

In [6]:
# Spell out every digit as an English word so the vocabulary holds no numerals.
# Each replacement carries a trailing space so that adjacent digits become
# separate words ("42" -> "four two ").
replacements = {'1': "one ", '2':"two ", '3':"three ", '4':"four ", '5':"five ", '6':"six ", '7':"seven ", '8':"eight ", '9':"nine ", '0':"zero "}


def _spell_out_digits(text):
    """Return `text` with every ASCII digit replaced by its spelled-out word."""
    # Raw string and an explicit [0-9] class: '\d' inside a normal string literal
    # is an invalid escape sequence, and on str patterns \d also matches
    # non-ASCII digits, which have no entry in `replacements` (KeyError).
    return re.sub(r'[0-9]', lambda match: replacements[match.group()], text)


for column in ('English', 'ASL Gloss'):
    df[column] = df[column].apply(_spell_out_digits)

In [7]:
# Normalise both text columns in one pass per column:
# trim leading/trailing whitespace, then lowercase every character.
for column in ('English', 'ASL Gloss'):
    df[column] = df[column].apply(lambda text: text.strip().lower())

In [8]:
# Collapse every run of two or more spaces into a single space.
# A single str.replace('  ', ' ') pass is not enough: three consecutive spaces
# would still leave a double space behind.
for column in ('English', 'ASL Gloss'):
    df[column] = df[column].apply(lambda text: re.sub(r' {2,}', ' ', text))

In [9]:
# Preview the rows after digit replacement, trimming, lowercasing and space collapsing.
df.head()

Unnamed: 0,English,ASL Gloss
0,do you want a ride to the mall?,m-a-l-l ride want you q
1,yes i want to start buying christmas gifts soon,yes soon christmas gifts start buying want me
2,please sit in this chair,this chair please sit
3,i like to fly small planes,small planes fly like me
4,he will go later,he go will


In [10]:
# Remove quotes # Might not need this
# df['English'] = df['English'].apply(lambda x: re.sub (r"'", '', x))
# df['ASL Gloss'] = df['ASL Gloss'].apply(lambda x: re.sub (r"'", '', x))

In [11]:
# Remove all punctuation characters, then re-normalise whitespace.
# The translation table is built once instead of rebuilding
# set(string.punctuation) for every character of every sentence.
_punctuation_table = str.maketrans('', '', string.punctuation)


def _strip_punctuation(text):
    """Return `text` without punctuation and with single-space separated words."""
    # Deleting punctuation can leave double or trailing spaces behind
    # ("a - b" -> "a  b", "five ." -> "five "); split/join removes them so that
    # later length counts based on split(' ') do not see empty tokens.
    return ' '.join(text.translate(_punctuation_table).split())


for column in ('English', 'ASL Gloss'):
    df[column] = df[column].apply(_strip_punctuation)

In [12]:
# Preview the rows after punctuation removal.
df.head()

Unnamed: 0,English,ASL Gloss
0,do you want a ride to the mall,mall ride want you q
1,yes i want to start buying christmas gifts soon,yes soon christmas gifts start buying want me
2,please sit in this chair,this chair please sit
3,i like to fly small planes,small planes fly like me
4,he will go later,he go will


**TODO: revisit the digit replacement later.** For now, confirm that no numbers/digits remain in either column.

In [13]:
# Sanity check: print, per column (English first, then ASL Gloss),
# whether any row still contains a digit.
for column in ('English', 'ASL Gloss'):
    print(df[column].str.contains(r'\d').any())

False
False


In [14]:
# Add tokens to target sequence
def _add_boundary_tokens(sentence):
    """Wrap `sentence` in 'START_ ' / ' _END' unless it is already wrapped."""
    # The guard makes the cell idempotent: re-running it must not produce
    # 'START_ START_ ... _END _END'. Cleaned sentences are lowercase and free of
    # punctuation, so they cannot contain these markers by accident.
    if sentence.startswith('START_ ') and sentence.endswith(' _END'):
        return sentence
    return 'START_ ' + sentence + ' _END'


df['English'] = df['English'].apply(_add_boundary_tokens)

In [15]:
# Preview the rows after wrapping the English sentences in START_ / _END.
df.head()

Unnamed: 0,English,ASL Gloss
0,START_ do you want a ride to the mall _END,mall ride want you q
1,START_ yes i want to start buying christmas gi...,yes soon christmas gifts start buying want me
2,START_ please sit in this chair _END,this chair please sit
3,START_ i like to fly small planes _END,small planes fly like me
4,START_ he will go later _END,he go will


In [16]:
# Five longest ASL glosses by character count (row label -> length), longest first.
df['ASL Gloss'].str.len().sort_values(ascending=False).head()

455    108
558    105
918    104
712    104
531    103
Name: ASL Gloss, dtype: int64

In [17]:
# Five longest English sentences by character count (row label -> length), longest first.
# The counts include the 'START_ ' and ' _END' markers added above.
df['English'].str.len().sort_values(ascending=False).head()

632    144
921    130
455    125
918    118
285    117
Name: English, dtype: int64

In [18]:
# Collect the set of distinct whitespace-separated words on each side.
# English vocabulary (target side)
all_eng_words = {word for sentence in df['English'] for word in sentence.split()}

# ASL gloss vocabulary (source side)
all_ASL_words = {word for gloss in df['ASL Gloss'] for word in gloss.split()}

#max_length_src = 45
#max_length_tar = 53
#max_length_src = 108
#max_length_tar = 144


In [35]:
# Vocabulary of English (target side)
all_eng_words = {word for eng in df['English'] for word in eng.split()}

# Vocabulary of ASL (source side)
all_ASL_words = {word for asl in df['ASL Gloss'] for word in asl.split()}

# Max length, in tokens, of a target (English) sequence.
# split() without an argument is used (not split(' ')) so that stray double or
# trailing spaces do not count as empty tokens; this matches the tokenisation
# used by generate_batch.
max_length_tar = max(len(eng.split()) for eng in df['English'])
print("Max length target: ", max_length_tar)

# Max length, in tokens, of a source (ASL gloss) sequence.
max_length_src = max(len(asl.split()) for asl in df['ASL Gloss'])
print("Max length source: ", max_length_src)

# Sorted word lists give every word a stable position, and therefore a stable id.
input_words = sorted(all_ASL_words)
target_words = sorted(all_eng_words)

# Calculate Vocab size for both source and target
num_encoder_tokens = len(all_ASL_words)
num_decoder_tokens = len(all_eng_words)
num_decoder_tokens += 1 # For zero padding
print("Encoder token:", num_encoder_tokens)
print("Decoder token:", num_decoder_tokens)

# Create word to token dictionary for both source and target.
# Ids start at 1 because 0 is reserved for padding.
# NOTE(review): the largest source id therefore equals num_encoder_tokens, so
# an Embedding over source ids needs input_dim = num_encoder_tokens + 1.
input_token_index = {word: idx for idx, word in enumerate(input_words, start=1)}
target_token_index = {word: idx for idx, word in enumerate(target_words, start=1)}

# Create token to word dictionary for both source and target
reverse_input_char_index = {idx: word for word, idx in input_token_index.items()}
reverse_target_char_index = {idx: word for word, idx in target_token_index.items()}

Max length target:  25
Max length sorce:  18
Encoder token: 1947
Decoder token: 2002


In [19]:
# Report the number of distinct words on each side.
print(f"English Words: {len(all_eng_words)}")
print(f"ASL Words: {len(all_ASL_words)}")

English Words: 2001
ASL Words: 1947


In [20]:
# Alphabetically sorted word lists (source = ASL, target = English) and their sizes.
input_words, target_words = sorted(all_ASL_words), sorted(all_eng_words)
num_encoder_tokens, num_decoder_tokens = len(input_words), len(target_words)
(num_encoder_tokens, num_decoder_tokens)

(1947, 2001)

In [21]:
# One extra decoder class for the padding id 0.
# Assigned from the vocabulary size rather than `+= 1`, so re-running this
# cell does not keep growing the value.
num_decoder_tokens = len(all_eng_words) + 1
print(num_decoder_tokens)

# Word -> id maps; ids start at 1 because 0 is reserved for padding.
input_token_index = {word: idx for idx, word in enumerate(input_words, start=1)}
target_token_index = {word: idx for idx, word in enumerate(target_words, start=1)}

# Id -> word maps, the inverse of the dictionaries above.
reverse_input_char_index = {idx: word for word, idx in input_token_index.items()}
reverse_target_char_index = {idx: word for word, idx in target_token_index.items()}

2002


### Split the data into train and test

In [22]:
from sklearn.model_selection import train_test_split

# Source = ASL gloss, target = English (already wrapped in START_/_END).
x, y = df['ASL Gloss'], df['English']
# A fixed random_state makes the 95/5 split, and therefore the sentences shown
# in the Results section, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=42)
x_train.shape, x_test.shape

((1586,), (84,))

### Encoder-Decoder Architecture

In [23]:
from keras.layers import Dropout
from keras.layers import Input, LSTM, Embedding, Dense

# Size of both the embedding vectors and the LSTM state.
latent_dim = 64

# Encoder
encoder_inputs = Input(shape=(None,))
# Source token ids start at 1 (0 is the padding id masked by mask_zero), so the
# largest id equals num_encoder_tokens. Embedding needs input_dim = max id + 1,
# otherwise the last word of the vocabulary indexes out of range.
enc_emb = Embedding(num_encoder_tokens + 1, latent_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# Discard 'encoder_outputs' and only keep the states.
encoder_states = [state_h, state_c]

In [24]:
from keras.models import Model

# Set up the decoder, using 'encoder_states' as its initial state.
decoder_inputs = Input(shape= (None, ) )
# The embedding layer is kept in its own variable because the inference
# decoder reuses the same layer.
dec_emb_layer = Embedding (num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer (decoder_inputs)

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# returned states in the training model, but we will use them in inference.
decoder_lstm = LSTM (latent_dim, return_sequences = True, return_state = True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,initial_state = encoder_states)
# Softmax over the decoder vocabulary (including the padding slot) at every time step.
decoder_dense = Dense (num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# 'encoder input data' & 'decoder input data' into 'decoder target data'
model = Model([encoder_inputs, decoder_inputs] , decoder_outputs)

In [25]:
# The decoder ends in a softmax over the vocabulary and the targets are one-hot
# vectors, so the loss is categorical cross-entropy. Keras resolves losses by
# identifier, and the identifier uses an underscore: 'categorical crossentropy'
# (with a space) is not a known loss and makes compile() fail.
# NOTE(review): no cell visible in this notebook calls model.fit(...), so unless
# training happens elsewhere, the predictions in the Results section come from
# untrained weights.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [26]:
# Inference-time encoder: maps an input sequence to the LSTM states
# [state_h, state_c] (the "thought vectors").
encoder_model = Model (encoder_inputs, encoder_states)

# Decoder setup
# The tensors below hold the decoder states of the previous time step;
# they are fed in explicitly on every decoding step.
decoder_state_input_h = Input (shape= (latent_dim, ))
decoder_state_input_c = Input (shape= (latent_dim, ))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= dec_emb_layer (decoder_inputs) # Get the embeddings of the decoder sequence (same layer as in training)

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm (dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense (decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model:
# inputs  = [token ids, previous h, previous c]
# outputs = [word probabilities, new h, new c]
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs2] + decoder_states2)

In [27]:
def decode_sequence(input_seq, max_chars=50):
    """Greedily decode one encoded source sequence into a target sentence.

    Args:
        input_seq: batch of exactly one source sequence of token ids, as
            accepted by `encoder_model.predict`.
        max_chars: decoding stops once the decoded string is longer than this
            many characters (the original hard-coded limit was 50).

    Returns:
        The decoded words, each preceded by a single space. The '_END' marker
        is included when the decoder emitted it.
    """
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate an empty target sequence of length 1 and populate it with the
    # id of the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    decoded_sentence = ''
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Greedy choice: the most probable token at the last time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = reverse_target_char_index.get(sampled_token_index)

        # Id 0 is the padding class of the softmax and has no word attached.
        # Treat it as "nothing more to say" instead of raising a KeyError.
        if sampled_word is None:
            break
        decoded_sentence += ' ' + sampled_word

        # Exit condition: either hit the length limit or find the stop token.
        if sampled_word == "_END" or len(decoded_sentence) > max_chars:
            break

        # Feed the sampled token and the new states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]
    return decoded_sentence

In [28]:
def generate_batch(X=None, y=None, batch_size=128):
    '''Yield ([encoder_input, decoder_input], decoder_target) batches forever.

    Args:
        X: pandas Series of source sentences; defaults to the notebook's `x_train`.
        y: pandas Series of target sentences wrapped in START_/_END;
            defaults to the notebook's `y_train`.
        batch_size: number of rows allocated per batch. The final batch of a
            pass may hold fewer sentences; its remaining rows stay all-zero.

    Yields:
        ([encoder_input_data, decoder_input_data], decoder_target_data) where
        the inputs are zero-padded id matrices of width max_length_src /
        max_length_tar and the target is one-hot, shifted one step ahead of
        the decoder input.
    '''
    # Resolve the defaults at call time, so the generator follows the current
    # train split rather than whatever was bound when the def cell last ran.
    if X is None:
        X = x_train
    if y is None:
        y = y_train
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src), dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar), dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens), dtype='float32')
            pairs = zip(X.iloc[j:j + batch_size], y.iloc[j:j + batch_size])
            for i, (input_text, target_text) in enumerate(pairs):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word]  # encoder input seq
                target_words = target_text.split()
                last_position = len(target_words) - 1
                for t, word in enumerate(target_words):
                    # Decoder input is every token except the final _END.
                    # (The previous `t > len - 1` test was never true, which
                    # left the decoder input entirely zero.)
                    if t < last_position:
                        decoder_input_data[i, t] = target_token_index[word]  # decoder input seq
                    if t > 0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # offset by one timestep
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            yield ([encoder_input_data, decoder_input_data], decoder_target_data)

### Results

In [29]:
# One training sentence per batch, so every next(train_gen) yields a single example.
train_gen = generate_batch(x_train, y_train, batch_size = 1)
# Row position in x_train / y_train of the example most recently drawn from
# train_gen; starts at -1 because each results cell increments it before use.
k=-1

In [30]:
# Advance to the next training example and decode it.
k += 1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input ASL_sentence:', x_train.iloc[k])
# Slice off the leading 'START_ ' and the trailing ' _END' markers.
print('Actual English Translation:', y_train.iloc[k][len('START_ '):-len(' _END')])
print('Predicted English Translation:', decoded_sentence)

Input ASL_sentence: weather today what sunny
Actual English Translation: today the weather is sunny
Predicted English Translation:  boston george boston george gymnastics


In [31]:
# Advance to the next training example and decode it.
k += 1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input ASL_sentence:', x_train.iloc[k])
# Slice off the leading 'START_ ' and the trailing ' _END' markers.
print('Actual English Translation:', y_train.iloc[k][len('START_ '):-len(' _END')])
print('Predicted English Translation:', decoded_sentence)

Input ASL_sentence: which state you from
Actual English Translation: which state are you from
Predicted English Translation:  hamburgers smells dollars tend


In [33]:
# Advance to the next training example and decode it.
k += 1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input ASL_sentence:', x_train.iloc[k])
# Slice off the leading 'START_ ' and the trailing ' _END' markers.
print('Actual English Translation:', y_train.iloc[k][len('START_ '):-len(' _END')])
print('Predicted English Translation:', decoded_sentence)

Input ASL_sentence: good deals my brother always find why he thrifty
Actual English Translation: my brother always finds good deals because he is thrifty
Predicted English Translation:  grocery grocery watermelon monday
