In [1]:
import os, sys

from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding,Bidirectional,RepeatVector, Concatenate, Activation, Dot, Lambda
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt
import keras.backend as K

Using TensorFlow backend.


In [2]:
# Hyper-parameters and data limits shared by the cells below.
BATCH_SIZE = 64  # mini-batch size passed to model.fit
EPOCHS = 40  # number of training epochs passed to model.fit
LATENT_DIM = 512  # units per direction of the bidirectional encoder LSTM
LATENT_DIM_DECODER = 512 # units of the decoder LSTM; also the width of the s0/c0 state inputs
NUM_SAMPLES = 20000  # upper bound on the number of lines read from the corpus file
MAX_NUM_WORDS = 20000  # vocabulary cap given to both Tokenizer instances
MAX_SEQUENCE_LENGTH = 100  # NOTE(review): not referenced by any cell visible in this notebook
EMBEDDING_DIM = 200  # embedding width; the GloVe file loaded later is the 200-d variant

In [20]:
def softmax_over_time(x):
    """Softmax taken along axis 1 rather than the last axis.

    Used as the activation of the attention scoring layer so that the
    weights sum to one across axis 1 of the input.

    Args:
        x: backend tensor with at least three dimensions.

    Returns:
        Tensor of the same shape as ``x`` whose values sum to 1 along axis 1.
    """
    assert K.ndim(x) > 2
    # Subtracting the per-slice maximum keeps exp() from overflowing and
    # does not change the normalised result.
    shifted = x - K.max(x, axis=1, keepdims=True)
    exps = K.exp(shifted)
    return exps / K.sum(exps, axis=1, keepdims=True)

Data Loading

In [3]:
# Three parallel lists built from the tab-separated corpus file:
#   input_texts         - source sentence (first column)
#   target_texts        - translation followed by ' <eos>' (decoder target)
#   target_texts_inputs - translation preceded by '<sos> ' (decoder input)
input_texts=[]
target_texts=[]
target_texts_inputs=[]
t=0  # number of lines consumed from the file (including skipped ones)
# The context manager closes the file handle even if parsing raises.
with open('ben.txt',encoding="utf-8") as corpus_file:
    for line in corpus_file:
        t+=1
        if t>NUM_SAMPLES:
            break
        if '\t' not in line:
            continue
        # split up the input and translation; any further columns
        # (e.g. an attribution field) are ignored
        fields=line.rstrip().split('\t')
        if len(fields)<2:
            # rstrip() removed a trailing tab, so there is no translation
            continue
        input_text,translation=fields[0],fields[1]
        target_text=translation + ' <eos>'
        target_text_input='<sos> ' + translation

        input_texts.append(input_text)
        target_texts.append(target_text)
        target_texts_inputs.append(target_text_input)


# Quick sanity check of what was loaded.
print(input_texts[152])
print(target_texts[0])
print(target_texts_inputs[0])
print(t)
    
    
    

Have fun.
যাও। <eos>
<sos> যাও।
4349


Tokenize the inputs

In [4]:
# Fit a word-level tokenizer on the source sentences and convert every
# sentence to a list of integer word ids.
tokenizer_input=Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer_input.fit_on_texts(input_texts)
input_sequences=tokenizer_input.texts_to_sequences(input_texts)
# Spot check: ids of the sentence printed by the loading cell.
print(input_sequences[152])

[19, 421]


In [5]:
# Spot check: encode an ad-hoc sentence with the fitted source tokenizer.
tokenizer_input.texts_to_sequences(["She knows where we live"])

[[31, 243, 53, 32, 88]]

In [6]:
# word -> integer id mapping learned by the source tokenizer.
word2idx_inputs=tokenizer_input.word_index
print('Found %s unique input tokens.' % len(word2idx_inputs))
# Spot check of a single entry; raises KeyError if 'the' is not in the corpus.
word2idx_inputs['the']

Found 1875 unique input tokens.


6

In [7]:
# Longest source sequence (in tokens); used as the padded encoder length.
max_len_input = max(len(s) for s in input_sequences)
max_len_input

19

In [8]:
# Target-side tokenizer. filters='' disables punctuation stripping so the
# '<sos>' and '<eos>' markers are kept as tokens.
tokenizer_output=Tokenizer(num_words=MAX_NUM_WORDS,filters='')
# Fit on both variants so that '<sos>' and '<eos>' are both in the vocabulary.
tokenizer_output.fit_on_texts(target_texts+target_texts_inputs)
target_sequences=tokenizer_output.texts_to_sequences(target_texts)
target_inputs_sequences=tokenizer_output.texts_to_sequences(target_texts_inputs)
# Spot check: the same sentence as decoder target and as decoder input.
print(target_sequences[0])
print(target_inputs_sequences[0])

[167, 1]
[2, 167]


In [9]:
# word -> integer id mapping learned by the target tokenizer.
word2idx_output = tokenizer_output.word_index
print('Found %s unique output tokens.' % len(word2idx_output))

Found 3551 unique output tokens.


In [10]:
# Size of the output layer: one slot per vocabulary word plus one for id 0,
# which the one-hot cell below treats as padding.
num_words_output = len(word2idx_output) + 1
# Longest target sequence (in tokens); used as the padded decoder length.
max_len_target = max(len(s) for s in target_sequences)
max_len_target

19

Pad the sequences

In [11]:
# Pad every source sequence to max_len_input. No `padding` argument is
# given, so zeros are placed in front of the tokens (see the output below).
encoder_inputs=pad_sequences(input_sequences,max_len_input)
print("encoder_inputs.shape:", encoder_inputs.shape)
encoder_inputs[0]

encoder_inputs.shape: (4349, 19)


array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0, 28])

In [12]:
# Decoder-side sequences are padded at the end (padding='post') so the first
# token stays at time step 0.
decoder_inputs = pad_sequences(target_inputs_sequences, maxlen=max_len_target, padding='post')
print("decoder_inputs.shape:", decoder_inputs.shape)

# Same padding for the targets, so inputs and targets stay aligned per step.
decoder_targets = pad_sequences(target_sequences, maxlen=max_len_target, padding='post')

decoder_inputs.shape: (4349, 19)


In [13]:
# Read the pre-trained GloVe vectors into a dict: word -> float32 vector.
# Each line of the file is "<word> <v1> <v2> ...", separated by whitespace.
print('Loading word vectors...')
word2vec={}
with open(r'D:\udemy\glove.6B.200d.txt',encoding="utf-8") as glove_file:
    for row in glove_file:
        fields=row.split()
        # first field is the token, the remainder are its coefficients
        word2vec[fields[0]]=np.asarray(fields[1:],dtype='float32')
print('Found %s word vectors.' % len(word2vec))

Loading word vectors...
Found 400000 word vectors.


In [14]:
# Build the weight matrix for the encoder embedding layer: row i holds the
# pre-trained vector of the word whose tokenizer id is i.
print('Filling pre-trained embeddings...')
num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx_inputs.items():
  if i >= MAX_NUM_WORDS:
    # ids beyond the vocabulary cap have no row in the matrix
    continue
  embedding_vector = word2vec.get(word)
  if embedding_vector is None:
    # words not found in embedding index will be all zeros.
    continue
  embedding_matrix[i] = embedding_vector

Filling pre-trained embeddings...


Creation of the embedding layer

In [15]:
# Encoder embedding layer, initialised with the pre-trained matrix built above.
# `trainable` is not set here, so the Keras default applies.
embedding_layer=Embedding(num_words,EMBEDDING_DIM,weights=[embedding_matrix],input_length=max_len_input)


One hot encoding the targets

In [16]:
# One-hot version of the decoder targets, shape
# (num samples, max_len_target, num_words_output).
one_hot_shape = (len(input_texts), max_len_target, num_words_output)
decoder_targets_one_hot = np.zeros(one_hot_shape, dtype='float32')

# assign the values: every non-padding (non-zero) target id gets a 1 in its
# own slot; padded positions stay all-zero rows.
sample_idx, step_idx = np.nonzero(decoder_targets)
decoder_targets_one_hot[sample_idx, step_idx, decoder_targets[sample_idx, step_idx]] = 1

In [17]:
# Spot check of a single entry (sample 0, time step 3, word id 4).
decoder_targets_one_hot[0,3,4]

0.0

Model building

In [28]:
# Encoder: padded word ids -> pre-trained embeddings -> bidirectional LSTM.
# return_sequences=True keeps one output vector per input position, which the
# attention mechanism needs; each vector has 2 * LATENT_DIM features.
encoder_input=Input(shape=(max_len_input,))
x=embedding_layer(encoder_input)
encoder=Bidirectional(LSTM(LATENT_DIM,return_sequences=True,dropout=0.5))
encoder_outputs=encoder(x)

# Decoder input: the '<sos>'-prefixed target sequence, embedded with a
# separate embedding layer that is not given pre-trained weights.
decoder_input=Input(shape=(max_len_target,))
decoder_embedding=Embedding(num_words_output,EMBEDDING_DIM)
decoder_input_x=decoder_embedding(decoder_input)

Attention Layer

In [29]:
# Attention layers are created once at module level so that the same weights
# are reused by every call to one_step_attention.
attn_repeat_layer=RepeatVector(max_len_input)  # copy the decoder state once per encoder position
attn_concat_layer=Concatenate(axis=-1)  # join encoder output and decoder state feature-wise
attn_dense_1=Dense(16,activation='tanh')  # hidden layer of the scoring network
attn_dense_2=Dense(1,activation=softmax_over_time)  # one weight per position, normalised over axis 1
attn_dot=Dot(axes=1)  # weighted sum of encoder outputs over axis 1


In [30]:
def one_step_attention(h,st_1):
    """Compute the attention context for a single decoder step.

    Args:
        h: encoder output sequence (one vector per input position).
        st_1: decoder hidden state from the previous step.

    Returns:
        The context tensor produced by ``attn_dot``: the encoder outputs
        summed over axis 1, weighted by the attention weights.
    """
    # Repeat the decoder state so it can be paired with every encoder position.
    repeated_state=attn_repeat_layer(st_1)
    paired=attn_concat_layer([h,repeated_state])
    # Small scoring network; the last layer normalises the weights over axis 1.
    alphas=attn_dense_2(attn_dense_1(paired))
    return attn_dot([alphas,h])

Decoder Architecture

In [31]:
# Decoder layers are created once and reused at every time step (and again
# by the inference model further down).
decoder_lstm=LSTM(LATENT_DIM_DECODER,return_state=True)
decoder_dense=Dense(num_words_output,activation='softmax')

# Initial hidden/cell state are model inputs so they can be fed explicitly.
initial_s = Input(shape=(LATENT_DIM_DECODER,), name='s0')
initial_c = Input(shape=(LATENT_DIM_DECODER,), name='c0')
context_last_word_concat_layer = Concatenate(axis=2)


s=initial_s
c=initial_c

# One softmax distribution over the output vocabulary per time step.
outputs=[]

# The decoder is unrolled manually because the attention context for step t
# depends on the decoder state produced at step t-1.
for t in range(max_len_target):
    context=one_step_attention(encoder_outputs,s)
    # Bind the current t as a default argument: a plain closure would look
    # `t` up when the function runs, so any later re-invocation of this
    # layer would slice at the final loop value instead of this step.
    selector = Lambda(lambda x, t=t: x[:, t:t+1])
    # Teacher forcing: feed the embedded ground-truth token for step t.
    xt = selector(decoder_input_x)
    decoder_lstm_input=context_last_word_concat_layer([context,xt])
    o,s,c=decoder_lstm(decoder_lstm_input,initial_state=[s,c])
    decoder_outputs=decoder_dense(o)
    outputs.append(decoder_outputs)

In [32]:
def stack_and_transpose(x):
  """Stack a list of tensors along a new first axis, then swap axes 0 and 1.

  Args:
    x: list of backend tensors of identical rank 2 (here: one per time step).

  Returns:
    A rank-3 tensor in which the former list index is axis 1.
  """
  stacked = K.stack(x)
  return K.permute_dimensions(stacked, pattern=(1, 0, 2))

In [33]:
# Merge the per-step output list into a single tensor.
# NOTE: this rebinds `outputs` from a list to a tensor, so the cell is not
# idempotent - re-run the decoder cell before running this one again.
stacker=Lambda(stack_and_transpose)
outputs=stacker(outputs)

In [35]:
# Training model. Input order matters: model.fit must supply the arrays in
# exactly this order (encoder ids, decoder ids, initial s, initial c).
model = Model(
  inputs=[
    encoder_input,
    decoder_input,
    initial_s, 
    initial_c,
  ],
  outputs=outputs
)

In [36]:
# Print the layer-by-layer overview of the training model.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 19)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 19, 200)      375200      input_3[0][0]                    
__________________________________________________________________________________________________
s0 (InputLayer)                 (None, 512)          0                                            
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 19, 1024)     2920448     embedding_1[1][0]                
____________________________________________________________________________________________

In [37]:
def custom_loss(y_true, y_pred):
  """Cross-entropy summed over non-zero target entries, divided by their count.

  Padded time steps have an all-zero one-hot row, so they contribute nothing
  to either the numerator or the denominator.

  Args:
    y_true: one-hot targets, shape N x T x K.
    y_pred: predicted probabilities, shape N x T x K.

  Returns:
    Scalar tensor with the mean negative log-likelihood.
  """
  # both are of shape N x T x K
  mask = K.cast(y_true > 0, dtype='float32')
  # Clip before the log: a probability that underflows to exactly 0 gives
  # log(0) = -inf, and 0 * -inf at a masked position is NaN, which would
  # turn the whole loss into NaN.
  safe_pred = K.clip(y_pred, K.epsilon(), 1.0)
  out = mask * y_true * K.log(safe_pred)
  return -K.sum(out) / K.sum(mask)


def acc(y_true, y_pred):
  """Token-level accuracy that ignores padded time steps.

  Args:
    y_true: one-hot targets, shape N x T x K.
    y_pred: predicted probabilities, shape N x T x K.

  Returns:
    Scalar tensor: correctly predicted tokens divided by the number of
    positions whose target id is greater than 0.
  """
  true_ids = K.argmax(y_true, axis=-1)
  pred_ids = K.argmax(y_pred, axis=-1)

  # An all-zero target row has argmax 0, i.e. padding - leave it out.
  not_pad = K.cast(K.greater(true_ids, 0), dtype='float32')
  hits = K.cast(K.equal(true_ids, pred_ids), dtype='float32')
  return K.sum(not_pad * hits) / K.sum(not_pad)

In [38]:
# Use the padding-aware loss and accuracy defined above instead of the
# built-in categorical cross-entropy.
model.compile(optimizer='adam', loss=custom_loss, metrics=[acc])

In [39]:
# All-zero array fed as both the initial hidden state (s0) and the initial
# cell state (c0) of the decoder for every sample.
z = np.zeros((len(encoder_inputs), LATENT_DIM_DECODER))
model.fit([encoder_inputs, decoder_inputs,z,z], decoder_targets_one_hot,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.callbacks.History at 0x1cd002b6a90>

Make Translations

In [40]:
# Inference-time encoder: runs once per sentence and returns the full
# sequence of encoder outputs.
encoder_model=Model(encoder_input,encoder_outputs)


# Inference-time decoder: performs a single step. It takes the previously
# generated word, the encoder outputs and the previous states as inputs.
encoder_outputs_as_inputs=Input(shape=(max_len_input,LATENT_DIM*2,))
decoder_inputs_single=Input(shape=(1,))
decoder_inputs_single_x=decoder_embedding(decoder_inputs_single)


# The attention, LSTM and dense layers are the same objects used in
# training, so the trained weights are shared.
context=one_step_attention(encoder_outputs_as_inputs,initial_s)

decoder_lstm_input=context_last_word_concat_layer([context,decoder_inputs_single_x])

# NOTE: this rebinds the module-level names s, c, context and decoder_outputs
# that the training graph cell also used.
o,s,c=decoder_lstm(decoder_lstm_input,initial_state=[initial_s,initial_c])
decoder_outputs=decoder_dense(o)



# Returns the word distribution plus the new states, so the caller can feed
# the states back in on the next step.
decoder_model = Model(
  inputs=[decoder_inputs_single,encoder_outputs_as_inputs,initial_s,initial_c], 
  outputs=[decoder_outputs,s,c]
)


In [41]:
# Reverse lookups (integer id -> word) for the source and target vocabularies.
idx2word_eng = dict(zip(word2idx_inputs.values(), word2idx_inputs.keys()))
idx2word_trans = dict(zip(word2idx_output.values(), word2idx_output.keys()))

In [48]:

def decode_sequence(input_seq):
  """Greedily translate one padded input sequence.

  Args:
    input_seq: array of source word ids for a single sentence, as passed to
      ``encoder_model.predict``.

  Returns:
    The generated words joined by single spaces. Generation stops at the
    '<eos>' token or after ``max_len_target`` steps, whichever comes first.
  """
  # Encode once; the encoder outputs are reused at every decoding step.
  enc_out = encoder_model.predict(input_seq)

  # Decoding starts from '<sos>' with zero hidden and cell states.
  target_seq = np.zeros((1, 1))
  target_seq[0, 0] = word2idx_output['<sos>']
  eos = word2idx_output['<eos>']
  state_h = np.zeros((1, LATENT_DIM_DECODER))
  state_c = np.zeros((1, LATENT_DIM_DECODER))

  # Create the translation
  output_sentence = []
  for _ in range(max_len_target):
    probs, state_h, state_c = decoder_model.predict(
      [target_seq, enc_out, state_h, state_c]
    )

    # Greedy choice: take the most probable next word
    idx = np.argmax(probs.flatten())

    # End of sentence reached
    if idx == eos:
      break

    # id 0 is padding and has no word attached to it
    if idx > 0:
      output_sentence.append(idx2word_trans[idx])

    # The chosen word becomes the decoder input of the next step
    target_seq[0, 0] = idx

  return ' '.join(output_sentence)

In [90]:
# Translate one randomly chosen training sentence. No random seed is set in
# this notebook, so the selected sentence differs from run to run.
i = np.random.choice(len(input_texts))
# Slice (not index) to keep the leading batch dimension of size 1.
input_seq = encoder_inputs[i:i+1]
translation = decode_sequence(input_seq)
print('-')
print('Input:', input_texts[i])
print('Translation:', translation)

-
Input: Tom started yelling.
Translation: টম চেঁচানো আরম্ভ করলো।


In [84]:
# Translate an ad-hoc sentence: tokenize with the source tokenizer, pad to the
# encoder length, then decode.
# NOTE(review): the sentence is an empty string here, so the printed
# translation is not tied to any input words - put the text to translate
# inside the quotes.
sentence=[""]
sequence=tokenizer_input.texts_to_sequences(sentence)
input_seq=pad_sequences(sequence,max_len_input)
print(decode_sequence(input_seq))

আমার এটা আছে।
