In [214]:
import pandas as pd 
from sklearn.model_selection import train_test_split
import numpy as np
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
import re
import os
import tensorflow as tf
from sklearn.utils import shuffle
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model, load_model

In [2]:
!gdown --id 101S8yZESRK5YL0a886tTz1hnCFzhaRxe

Downloading...
From: https://drive.google.com/uc?id=101S8yZESRK5YL0a886tTz1hnCFzhaRxe
To: /content/en-fa_MT_dataset.csv
100% 55.4M/55.4M [00:00<00:00, 137MB/s]


In [3]:
en_fa_data = pd.read_csv('en-fa_MT_dataset.csv')

In [4]:
en_fa_data

Unnamed: 0,en,fa
0,raspy breathing .,صداي خر خر .
1,dad .,پدر .
2,maybe its the wind .,شايد صداي باد باشه .
3,no .,نه .
4,stop please stop .,دست نگه داريد خواهش ميکنم دست نگه داريد .
...,...,...
612081,zodiac .,دايره‌البروج .
612082,zodiacal light .,حمره مغربيه .
612083,zombi .,انسان زنده شد .
612084,zombiism .,مارخداگرائي .


In [5]:
en_fa_data = en_fa_data.sample(n=300000, random_state=50)

In [6]:
en_fa_data.shape

(300000, 2)

In [7]:
en_fa_data['en'] = en_fa_data['en'].apply(lambda x: x.lower())
en_fa_data['fa'] = en_fa_data['fa'].apply(lambda x: x.lower())

In [8]:
en_fa_data['en'] = en_fa_data['en'].apply(lambda x : '_start_ '+ x + ' _end_')
en_fa_data['fa'] = en_fa_data['fa'].apply(lambda x : '_start_ '+ x + ' _end_')

In [143]:
en_fa_data.head()

Unnamed: 0,en,fa
550281,_start_ there is a messenger from the imperial...,_start_ يك پيغام رسان از محل امپراطوري داريم _...
326616,_start_ could offer her what she wanted . _end_,_start_ ميتونست چيزي را كه اون ميخواست بده . _...
380457,_start_ throw away the tendons and membranes _...,_start_ بافتها و تاندونهایش را دور بریزید _end_
353927,_start_ oh what the hell . _end_,_start_ اوه ، اون ديگه چي بود . _end_
407398,"_start_ i've been hassled over a year, damn it...",_start_ بيشتر از يك ساله كه دارم وقتم رو تلف م...


In [10]:
len(en_fa_data)

300000

In [11]:
max_features1 = 5000
maxlen1 = 30

In [12]:
max_features2 = 5000
maxlen2 = 30

In [13]:
tok1 = tf.keras.preprocessing.text.Tokenizer(num_words=max_features1) 
tok1.fit_on_texts(list(en_fa_data['en']))
tf_train_english = tok1.texts_to_sequences(list(en_fa_data['en']))
tf_train_english = tf.keras.preprocessing.sequence.pad_sequences(tf_train_english,
                                                                 maxlen = maxlen1)

In [14]:
tok2 = tf.keras.preprocessing.text.Tokenizer(num_words=max_features2, filters='*') 
tok2.fit_on_texts(list(en_fa_data['fa']))
tf_train_fa = tok2.texts_to_sequences(list(en_fa_data['fa']))
tf_train_fa = tf.keras.preprocessing.sequence.pad_sequences(tf_train_fa,
                                                            maxlen = maxlen2,
                                                            padding = 'post') 

In [15]:
tf_train_english

array([[   0,    0,    0, ...,  606,  355,    2],
       [   0,    0,    0, ...,   61,  247,    2],
       [   0,    0,    0, ...,    4,    8,    2],
       ...,
       [   0,    0,    0, ...,  574,  651,    2],
       [   0,    0,    0, ...,   11,   16,    2],
       [   0,    0,    0, ...,    8, 3908,    2]], dtype=int32)

In [16]:
encoder_input_data = tf_train_english

In [17]:
encoder_input_data.shape

(300000, 30)

In [18]:
decoder_input_data = tf_train_fa[:, :-1]
decoder_target_data = tf_train_fa[:, 1:]

print(f'Shape of decoder input: {decoder_input_data.shape}')
print(f'Shape of decoder target: {decoder_target_data.shape}')

Shape of decoder input: (300000, 29)
Shape of decoder target: (300000, 29)


In [19]:
doc_length = encoder_input_data.shape[1]
doc_length

30

In [20]:
vocab_size_encoder = len(tok1.word_index) + 1 
vocab_size_decoder = len(tok2.word_index) + 1

In [21]:
latent_dim = 40

In [22]:
#### Encoder Model ####
encoder_inputs = tf.keras.Input(shape=(doc_length,), name='Encoder-Input')
x = tf.keras.layers.Embedding(vocab_size_encoder, latent_dim, name='Body-Word-Embedding',
                              mask_zero=False)(encoder_inputs)
x = tf.keras.layers.BatchNormalization(name='Encoder-Batchnorm-1')(x)
_, state_h = tf.keras.layers.GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)
encoder_model = tf.keras.Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')

seq2seq_encoder_out = encoder_model(encoder_inputs)

In [23]:
#### Decoder Model ####
decoder_inputs = tf.keras.Input(shape=(None,), name='Decoder-Input')

dec_emb = tf.keras.layers.Embedding(vocab_size_decoder, latent_dim, 
                                    name='Decoder-Word-Embedding', 
                                    mask_zero=False)(decoder_inputs)
dec_bn = tf.keras.layers.BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)
decoder_gru = tf.keras.layers.GRU(latent_dim, return_state=True,
                                  return_sequences=True, name='Decoder-GRU')
decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out) 
x = tf.keras.layers.BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)
decoder_dense = tf.keras.layers.Dense(vocab_size_decoder, activation='softmax',
                                      name='Final-Output-Dense')

decoder_outputs = decoder_dense(x)

In [24]:
seq2seq_Model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
seq2seq_Model.compile(optimizer=tf.keras.optimizers.Nadam(lr=0.001), 
                      loss='sparse_categorical_crossentropy')

  super(Nadam, self).__init__(name, **kwargs)


In [25]:
seq2seq_Model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Decoder-Input (InputLayer)     [(None, None)]       0           []                               
                                                                                                  
 Decoder-Word-Embedding (Embedd  (None, None, 40)    3965520     ['Decoder-Input[0][0]']          
 ing)                                                                                             
                                                                                                  
 Encoder-Input (InputLayer)     [(None, 30)]         0           []                               
                                                                                                  
 Decoder-Batchnorm-1 (BatchNorm  (None, None, 40)    160         ['Decoder-Word-Embedding[0][0

In [26]:
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, 
                             monitor='val_loss', 
                             save_best_only=True,
                             mode='min')

In [27]:
batch_size = 128
epochs = 20

In [28]:
history = seq2seq_Model.fit([encoder_input_data, decoder_input_data],
                            np.expand_dims(decoder_target_data,-1),
                            batch_size=batch_size,  epochs=epochs, 
                            validation_split=0.12,
                            callbacks=[checkpoint]) 

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 14/20
Epoch 15/20
Epoch 15/20
Epoch 16/20
Epoch 16/20
Epoch 17/20
Epoch 17/20
Epoch 18/20
Epoch 18/20
Epoch 19/20
Epoch 19/20
Epoch 20/20
Epoch 20/20


In [32]:
seq2seq_Model.save('MT_model.h5')

### **Load Model**

In [80]:
!gdown --id 1-2zYxj6wezfZSeOKyG1EgmP_Uj3iY3kt

Downloading...
From: https://drive.google.com/uc?id=1-2zYxj6wezfZSeOKyG1EgmP_Uj3iY3kt
To: /content/MT_model.h5
100% 123M/123M [00:00<00:00, 259MB/s]


In [83]:
seq2seq_Model = load_model('MT_model.h5')

In [84]:
seq2seq_Model

<keras.engine.functional.Functional at 0x7f21dd78c290>

### **Evaluate Model**

In [248]:
def translate(test_text):
  decoded_sentence = []
  stop_condition = False
  raw_tokenized = tok1.texts_to_sequences(test_text)
  raw_tokenized = tf.keras.preprocessing.sequence.pad_sequences(raw_tokenized, maxlen=maxlen1)
  body_encoding = encoder_model.predict(raw_tokenized)
  latent_dim = seq2seq_Model.get_layer('Decoder-Word-Embedding').output_shape[-1]

  decoder_inputs = seq2seq_Model.get_layer('Decoder-Input').input
  dec_emb = seq2seq_Model.get_layer('Decoder-Word-Embedding')(decoder_inputs)
  dec_bn = seq2seq_Model.get_layer('Decoder-Batchnorm-1')(dec_emb)
  gru_inference_state_input = tf.keras.Input(shape=(latent_dim,), name='hidden_state_input')
  gru_out, gru_state_out = seq2seq_Model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input])
  dec_bn2 = seq2seq_Model.get_layer('Decoder-Batchnorm-2')(gru_out)
  dense_out = seq2seq_Model.get_layer('Final-Output-Dense')(dec_bn2)
  decoder_model = tf.keras.Model([decoder_inputs, gru_inference_state_input],[dense_out, gru_state_out])
  original_body_encoding = body_encoding
  state_value = np.array(tok2.word_index['_start_']).reshape(1, 1)
  vocabulary_inv = dict((v, k) for k, v in tok2.word_index.items())
  while not stop_condition:
    preds, st = decoder_model.predict([state_value, body_encoding])
    pred_idx = np.argmax(preds[:, :, 2:]) + 2
    pred_word_str = vocabulary_inv[pred_idx]
    print(pred_word_str)
    if pred_word_str == '_end_' or len(decoded_sentence) >= maxlen2:
        stop_condition = True
        break
    decoded_sentence.append(pred_word_str)
    body_encoding = st
    state_value = np.array(pred_idx).reshape(1, 1)
    return decoded_sentence

In [252]:
test_text = 'hello'

In [253]:
translate(test_text)

ValueError: ignored