In [45]:
import os, sys

from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt
import keras.backend as K

In [2]:
# --- Training hyperparameters ---
BATCH_SIZE = 64      # mini-batch size handed to model.fit
EPOCHS = 50          # number of training epochs handed to model.fit
LATENT_DIM = 256     # number of units in the encoder and decoder LSTMs

# --- Data / vocabulary limits ---
NUM_SAMPLES = 10000    # upper bound on the number of corpus lines read
MAX_NUM_WORDS = 20000  # `num_words` cap given to both tokenizers
EMBEDDING_DIM = 100    # width of the word-embedding vectors

Data Loading

In [69]:
# Read tab-separated sentence pairs from 'ben.txt' and build three parallel
# lists:
#   input_texts         - source sentences (fed to the encoder)
#   target_texts        - translations with ' <eos>' appended (decoder targets)
#   target_texts_inputs - translations with '<sos> ' prepended (decoder inputs,
#                         i.e. the targets shifted right by one token)
input_texts = []
target_texts = []
target_texts_inputs = []
t = 0  # number of lines consumed from the file, including skipped ones

# `with` closes the file handle even if parsing raises part-way through.
with open('ben.txt', encoding="utf-8") as corpus_file:
    for line in corpus_file:
        t += 1
        if t > NUM_SAMPLES:
            break
        if '\t' not in line:
            continue
        # split up the input and translation; any further columns
        # (e.g. a third metadata field) are ignored instead of being
        # required, so files with two or more columns both load.
        fields = line.rstrip().split('\t')
        if len(fields) < 2:
            # rstrip() ate a trailing tab: there is no translation on this line
            continue
        input_text, translation = fields[0], fields[1]
        target_text = translation + ' <eos>'
        target_text_input = '<sos> ' + translation

        input_texts.append(input_text)
        target_texts.append(target_text)
        target_texts_inputs.append(target_text_input)


print(input_texts[152])
print(target_texts[0])
print(target_texts_inputs[0])
print(t)
    
    
    

Have fun.
যাও। <eos>
<sos> যাও।
4349


Tokenize the inputs

In [68]:
# Fit a tokenizer on the source sentences, then turn every sentence into a
# list of integer word ids.
tokenizer_input = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer_input.fit_on_texts(input_texts)
input_sequences = tokenizer_input.texts_to_sequences(input_texts)

print(input_sequences[152])

[19, 421]


In [70]:
# Sanity check: encode one ad-hoc sentence with the fitted input tokenizer.
tokenizer_input.texts_to_sequences(["She knows where we live"])

[[31, 243, 53, 32, 88]]

In [18]:
# word -> integer id mapping learned by the input tokenizer
word2idx_inputs = tokenizer_input.word_index
print('Found %s unique input tokens.' % len(word2idx_inputs))

# spot-check the id assigned to one word
word2idx_inputs['the']

Found 1875 unique input tokens.


6

In [19]:
# Longest source sentence, in tokens; used as the encoder's padded length.
max_len_input = max(map(len, input_sequences))
max_len_input

19

In [21]:
# Tokenizer for the target language. filters='' turns off the tokenizer's
# character filtering (presumably so the '<sos>' / '<eos>' markers stay intact;
# both are looked up in word_index later in this notebook).
tokenizer_output = Tokenizer(num_words=MAX_NUM_WORDS, filters='')

# Fit on both variants so that '<sos>' and '<eos>' each receive an id.
tokenizer_output.fit_on_texts(target_texts + target_texts_inputs)

target_sequences = tokenizer_output.texts_to_sequences(target_texts)
target_inputs_sequences = tokenizer_output.texts_to_sequences(target_texts_inputs)

print(target_sequences[0])
print(target_inputs_sequences[0])

[167, 1]
[2, 167]


In [25]:
# word -> integer id mapping learned by the output tokenizer
word2idx_output = tokenizer_output.word_index

print('Found %s unique output tokens.' % len(word2idx_output))

Found 3551 unique output tokens.


In [27]:
# One extra slot on top of the vocabulary: index 0 is used for padding.
num_words_output = len(word2idx_output) + 1

# Longest target sentence, in tokens; used as the decoder's padded length.
max_len_target = max(map(len, target_sequences))
max_len_target

19

Pad the sequences

In [29]:
# Pad every source sequence to the same length. No `padding` argument is
# given, so the default is used (the printed row shows the zeros on the left).
encoder_inputs = pad_sequences(input_sequences, maxlen=max_len_input)
print("encoder_inputs.shape:", encoder_inputs.shape)

encoder_inputs[0]

encoder_inputs.shape: (4349, 19)


array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0, 28])

In [31]:
# Decoder-side sequences are padded at the end ('post') so that each sentence
# starts at timestep 0.
decoder_inputs = pad_sequences(
    target_inputs_sequences,
    maxlen=max_len_target,
    padding='post',
)
decoder_targets = pad_sequences(
    target_sequences,
    maxlen=max_len_target,
    padding='post',
)

print("decoder_inputs.shape:", decoder_inputs.shape)

decoder_inputs.shape: (4349, 19)


In [32]:
print('Loading word vectors...')

# Location of the pre-trained word-vector file. The original absolute path is
# kept as the default, but it can be overridden by setting the GLOVE_PATH
# environment variable, so the notebook also runs on other machines.
glove_path = os.environ.get('GLOVE_PATH', r'D:\udemy\glove.6B.100d.txt')

# word -> float32 vector, one entry per line of the file
word2vec = {}
with open(glove_path, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # blank line: nothing to parse (values[0] would raise IndexError)
            continue
        # first field is the word, the remaining fields are its vector
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))

Loading word vectors...
Found 400000 word vectors.


In [33]:
print('Filling pre-trained embeddings...')

# Row i of the matrix holds the pre-trained vector of the word whose id is i.
num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word2idx_inputs.items():
    if i >= MAX_NUM_WORDS:
        # id falls outside the capped vocabulary
        continue
    embedding_vector = word2vec.get(word)
    if embedding_vector is None:
        # words not found in embedding index will be all zeros.
        continue
    embedding_matrix[i] = embedding_vector

Filling pre-trained embeddings...


Creation of the embedding layer

In [34]:
# Encoder embedding layer, initialised from the pre-trained matrix built above.
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=max_len_input,
)


One-hot encoding the targets

In [35]:
# One-hot targets of shape (samples, timesteps, output vocabulary).
one_hot_shape = (len(input_texts), max_len_target, num_words_output)
decoder_targets_one_hot = np.zeros(one_hot_shape, dtype='float32')

# assign the values: every non-padding position (id != 0) gets a 1 in the
# slot of its word id; padding timesteps stay all-zero.
sample_idx, step_idx = np.nonzero(decoder_targets)
decoder_targets_one_hot[sample_idx, step_idx, decoder_targets[sample_idx, step_idx]] = 1

In [39]:
# Spot-check a single cell of the one-hot target tensor (sample 0, step 3, word id 4).
decoder_targets_one_hot[0,3,4]

0.0

Model building

In [41]:
# ---- Encoder -------------------------------------------------------------
# Embed the source tokens and run them through an LSTM; only the final
# hidden/cell states are passed on to the decoder.
encoder_input = Input(shape=(max_len_input,))
x = embedding_layer(encoder_input)
encoder = LSTM(LATENT_DIM, return_state=True, dropout=0.5)
encoder_outputs, h, c = encoder(x)
encoder_state = [h, c]

# ---- Decoder -------------------------------------------------------------
# Receives the '<sos>'-prefixed target sentence and starts from the
# encoder's final states; emits one output vector per timestep.
decoder_input = Input(shape=(max_len_target,))
decoder_embedding = Embedding(num_words_output, EMBEDDING_DIM)
decoder_input_x = decoder_embedding(decoder_input)
decoder = LSTM(
    LATENT_DIM,
    return_sequences=True,
    return_state=True,
    dropout=0.5,
)
decoder_outputs, _, _ = decoder(decoder_input_x, initial_state=encoder_state)

# Softmax over the output vocabulary at every timestep.
decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# ---- Training model --------------------------------------------------------
model = Model([encoder_input, decoder_input], decoder_outputs)

Instructions for updating:
Colocations handled automatically by placer.


In [42]:
# Print the layer-by-layer structure and parameter counts of the training model.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 19)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 19)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 19, 100)      187600      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 19, 100)      355200      input_2[0][0]                    
____________________________________________________________________________________________

In [43]:
def custom_loss(y_true, y_pred):
  """Categorical cross-entropy averaged over the non-zero target entries.

  Args:
    y_true: one-hot targets; timesteps that are padding are all-zero.
    y_pred: predicted per-class probabilities, same shape as `y_true`.

  Returns:
    Scalar: minus the summed log-probability of the true classes, divided
    by the number of non-zero entries of `y_true`.
  """
  # both are of shape N x T x K
  mask = K.cast(y_true > 0, dtype='float32')
  # Clip before taking the log: a probability that underflows to exactly 0
  # gives log(0) = -inf, and 0 * -inf is NaN, which would turn the whole
  # loss (and every gradient) into NaN.
  y_pred = K.clip(y_pred, K.epsilon(), 1.0)
  out = mask * y_true * K.log(y_pred)
  return -K.sum(out) / K.sum(mask)


def acc(y_true, y_pred):
  """Token-level accuracy that ignores padding positions.

  Args:
    y_true: one-hot targets.
    y_pred: predicted per-class scores, same shape as `y_true`.

  Returns:
    Scalar: fraction of non-padding positions whose arg-max prediction
    equals the target class.
  """
  # both are of shape N x T x K
  true_ids = K.argmax(y_true, axis=-1)
  pred_ids = K.argmax(y_pred, axis=-1)

  # 0 is padding, don't include those
  not_pad = K.cast(K.greater(true_ids, 0), dtype='float32')
  hits = K.cast(K.equal(true_ids, pred_ids), dtype='float32')

  return K.sum(not_pad * hits) / K.sum(not_pad)

In [46]:
# Train with Adam, using the custom loss and accuracy metric defined in this notebook.
model.compile(optimizer='adam', loss=custom_loss, metrics=[acc])

In [47]:
# Fit on (encoder input, decoder input) pairs against the one-hot targets.
# The decoder input is the '<sos>'-prefixed translation; the target is the
# same translation terminated with '<eos>'.
model.fit([encoder_inputs, decoder_inputs], decoder_targets_one_hot,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x169ab9b5710>

Make Translations

In [48]:
# Inference-time encoder: source sequence -> final [h, c] states.
encoder_model = Model(encoder_input, encoder_state)

# Inference-time decoder: consumes ONE token plus the previous [h, c] states
# and returns the next-token distribution together with the updated states.
# It reuses the trained decoder_embedding / decoder / decoder_dense layers.
decoder_state_input_h = Input(shape=(LATENT_DIM,))
decoder_state_input_c = Input(shape=(LATENT_DIM,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_inputs_single = Input(shape=(1,))
decoder_inputs_single_x = decoder_embedding(decoder_inputs_single)

decoder_outputs, h, c = decoder(
    decoder_inputs_single_x,
    initial_state=decoder_states_inputs,
)
decoder_states = [h, c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    [decoder_inputs_single] + decoder_states_inputs,
    [decoder_outputs] + decoder_states,
)


In [49]:
# Reverse lookups (integer id -> word) for both vocabularies.
idx2word_eng = {idx: word for word, idx in word2idx_inputs.items()}
idx2word_trans = {idx: word for word, idx in word2idx_output.items()}

In [50]:
def decode_sequence(input_seq):
    """Greedily decode one encoded source sentence into target-language text.

    Args:
        input_seq: array passed straight to ``encoder_model.predict``; the
            callers in this notebook pass a single padded sequence of shape
            (1, max_len_input).

    Returns:
        The predicted words joined by single spaces. Decoding stops at the
        first '<eos>' prediction or after ``max_len_target`` steps.
    """
    # The encoder's final [h, c] states seed the decoder.
    states_value = encoder_model.predict(input_seq)

    # The decoder is fed one token per step, starting with '<sos>'.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word2idx_output['<sos>']
    eos_id = word2idx_output['<eos>']

    decoded_words = []
    for _ in range(max_len_target):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Greedy choice: the most probable next token.
        next_id = np.argmax(output_tokens[0, 0, :])
        if next_id == eos_id:
            break

        # Id 0 is padding and has no word; it is skipped in the output but
        # still fed back into the decoder below.
        if next_id > 0:
            decoded_words.append(idx2word_trans[next_id])

        # The prediction and the new states become the next step's input.
        target_seq[0, 0] = next_id
        states_value = [h, c]

    return ' '.join(decoded_words)
        
            
          
         
    

In [92]:
# Spot check: translate one randomly chosen training sentence.
i = np.random.choice(len(input_texts))
input_seq = encoder_inputs[i:i+1]  # slice keeps the leading batch dimension

# Keep the result under its own name. Assigning it to `translation` would
# overwrite the `translation()` helper function defined in this notebook
# whenever this cell is run after the helper's cell.
translated_text = decode_sequence(input_seq)
print('-')
print('Input:', input_texts[i])
print('Translation:', translated_text)

-
Input: I found that book interesting.
Translation: আমার বইটা আকর্ষণীয় বলে মনে হয়েছিল।


In [75]:
def translation(input):
    """Translate one raw source sentence and print the result.

    The sentence is tokenized with ``tokenizer_input``, padded to
    ``max_len_input`` and handed to ``decode_sequence``.

    Args:
        input: the source sentence as a plain string.

    Returns:
        None; the decoded sentence is printed.
    """
    token_ids = tokenizer_input.texts_to_sequences([input])
    padded = pad_sequences(token_ids, max_len_input)
    print(decode_sequence(padded))



In [99]:
# Translate an ad-hoc (deliberately incomplete) input through the
# `translation()` helper instead of repeating its tokenize -> pad -> decode
# steps inline.
translation("I wor")

আমি জাপানে থাকি।
