<a href="https://colab.research.google.com/github/Arijit02/Machine-Learning-and-Deep-Learning/blob/master/Spanish_English.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%tensorflow_version 1.x
import tensorflow as tf
import numpy as np
import keras
from keras.models import Model
from keras.layers import Embedding, GRU, Dense, Input
from keras.optimizers import RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from os import path
from google.colab import files

TensorFlow 1.x selected.


Using TensorFlow backend.


In [2]:
print(tf.__version__)

1.15.2


In [3]:
# Loading Data #
srcLangPath = list(files.upload().keys())[0]
destLangPath = list(files.upload().keys())[0]

Saving spanish.txt to spanish.txt


Saving english.txt to english.txt


In [4]:
mark_start = 'ssss'
mark_end = 'eeee'

def load_data(path, start="", end=""):
    with open(path, encoding="utf-8") as file:
        texts = [start + line.strip() + end for line in file]

    return texts


data_src = load_data(srcLangPath)
data_dest = load_data(destLangPath, mark_start, mark_end)

In [5]:
# Tokenizer #
num_words = 10000

class TokenizerWrap(Tokenizer):
    def __init__(self, texts, padding, reverse=False, num_words=None):
        Tokenizer.__init__(self, num_words=num_words)
        self.fit_on_texts(texts)
        self.index_to_word = dict(zip(self.word_index.values(),
                                      self.word_index.keys()))
        self.tokens = self.texts_to_sequences(texts)

        if reverse:
            self.tokens = [list(reversed(x)) for x in self.tokens]
            truncating = 'pre'
        else:
            truncating = 'post'

        self.num_tokens = [len(x) for x in self.tokens]
        self.max_tokens = np.mean(self.num_tokens) + \
            2 * np.std(self.num_tokens)
        self.max_tokens = int(self.max_tokens)
        self.tokens_padded = pad_sequences(
            self.tokens, maxlen=self.max_tokens, padding=padding, truncating=truncating)

    def token_to_words(self, token):
        word = " " if token == 0 else self.index_to_word[token]
        return word

    def tokens_to_text(self, tokens):
        words = [self.index_to_word(token) for token in tokens if token != 0]
        text = " ".join(words)
        return text

    def text_to_tokens(self, text, reverse=False, padding=False):
        tokens = self.texts_to_sequences([text])
        tokens = np.array(tokens)

        if reverse:
            tokens = np.flip(tokens, axis=1)
            truncating = 'pre'
        else:
            truncating = 'post'

        tokens = pad_sequences(
            tokens, maxlen=self.max_tokens, padding='pre', truncating=truncating)

        return tokens

In [6]:
tokenizer_src = TokenizerWrap(
    texts=data_src, padding='pre', reverse=True, num_words=num_words)

tokenizer_dest = TokenizerWrap(
    texts=data_dest, padding='post', reverse=False, num_words=num_words)

tokens_src = tokenizer_src.tokens_padded
tokens_dest = tokenizer_dest.tokens_padded

In [7]:
print(tokens_src.shape, tokenizer_src.max_tokens)
print(tokens_dest.shape, tokenizer_dest.max_tokens)

(10999, 59) 59
(10999, 57) 57


In [8]:
token_start = tokenizer_dest.word_index[mark_start.strip()]
token_end = tokenizer_dest.word_index[mark_end.strip()]
print(token_start, token_end)

204 2


In [9]:
# Training Data #
encoder_input_data = tokens_src

decoder_input_data = tokens_dest[:, :-1]
decoder_output_data = tokens_dest[:, 1:]

In [10]:
idx = 2

In [11]:
decoder_input_data[idx]

array([1009,   21,   37,   23,   18,  673,    1, 7324, 7325, 7326, 1862,
          4, 5722,  185,    1,   98,    6,    9,  259,    3,  119, 1943,
          9, 1703,    3, 1224,  972,    8, 1339,  113, 4725,    2,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0], dtype=int32)

In [12]:
decoder_output_data[idx+1]

array([  18, 1438,    9,  124,   14,   11,  340,    6,    1,  171,    3,
          1,  276,  269,  973,  375,   11,  215, 1223,    2,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0], dtype=int32)

In [13]:
# Creating Neural Network #

# Creating Encoder
encoder_input = Input(shape=(None, ), name='encoder_input')

embedding_size = 128
encoder_embedding = Embedding(
    input_dim=num_words, output_dim=embedding_size, name='encoder_embedding')

state_size = 512
encoder_gru1 = GRU(state_size, name='encoder_gru1', return_sequences=True)
encoder_gru2 = GRU(state_size, name='encoder_gru2', return_sequences=True)
encoder_gru3 = GRU(state_size, name='encoder_gru3', return_sequences=False)


def connect_encoder():
    net = encoder_input
    net = encoder_embedding(net)
    net = encoder_gru1(net)
    net = encoder_gru2(net)
    net = encoder_gru3(net)
    encoder_output = net

    return net


encoder_output = connect_encoder()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [14]:
# Creating Decoder
decoder_initial_state = Input(
    shape=(state_size, ), name='decoder_initial_state')

decoder_input = Input(shape=(None, ), name='decoder_input')

decoder_embedding = Embedding(
    input_dim=num_words, output_dim=embedding_size, name='decoder_embedding')


decoder_gru1 = GRU(state_size, name='decoder_gru1', return_sequences=True)
decoder_gru2 = GRU(state_size, name='decoder_gru2', return_sequences=True)
decoder_gru3 = GRU(state_size, name='decoder_gru3', return_sequences=True)

decoder_dense = Dense(num_words, activation='linear', name='decoder_output')


def connect_decoder(initial_state):
    net = decoder_input
    net = decoder_embedding(net)
    net = decoder_gru1(net, initial_state=initial_state)
    net = decoder_gru2(net, initial_state=initial_state)
    net = decoder_gru3(net, initial_state=initial_state)
    decoder_output = decoder_dense(net)

    return decoder_output

In [15]:
# Connect And Create The Models #
decoder_output = connect_decoder(initial_state=encoder_output)
model_train = Model(
    inputs=[encoder_input, decoder_input], outputs=[decoder_output])

model_encoder = Model(inputs=[encoder_input], outputs=[encoder_output])

decoder_output = connect_decoder(initial_state=decoder_initial_state)
model_decoder = Model(
    inputs=[decoder_input, decoder_initial_state], outputs=[decoder_output])

In [16]:
model_train.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
encoder_embedding (Embedding)   (None, None, 128)    1280000     encoder_input[0][0]              
__________________________________________________________________________________________________
encoder_gru1 (GRU)              (None, None, 512)    984576      encoder_embedding[0][0]          
__________________________________________________________________________________________________
decoder_input (InputLayer)      (None, None)         0                                            
____________________________________________________________________________________________

In [17]:
# Compile The Model #
'''
model_train.compile(optimizer=RMSprop(lr=1e-3),
                    loss='sparse_categorical_crossentropy')'''

def sparse_cross_entropy(y_true, y_pred):
  loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true, logits=y_pred)
  loss_mean = tf.reduce_mean(loss)

  return loss_mean


decoder_target = tf.placeholder(dtype='int32', shape=(None, None))
model_train.compile(optimizer=RMSprop(lr=1e-3), loss=sparse_cross_entropy, target_tensors=[decoder_target])

In [26]:
# Callback Functions #
path_checkpoint = '21_checkpoint.keras'
callback_checkpoint = ModelCheckpoint(
    filepath=path_checkpoint, verbose=1, save_best_only=True, save_weights_only=True)

callback_early_stopping = EarlyStopping(patience=3, verbose=1)

callback_tensorboard = TensorBoard(log_dir='./21_logs/', write_graph=False)

callbacks = [callback_checkpoint,
             callback_early_stopping, callback_tensorboard]

In [32]:
# Load Checkpoint #
try:
    model_train.load_weights(path_checkpoint)
except Exception as error:
    print("Error trying to load checkpoint")
    print(error)

In [42]:
# Train The Model #

x_data = \
    {
        'encoder_input': encoder_input_data,
        'decoder_input': decoder_input_data
    }

y_data = \
    {
        'decoder_output': decoder_output_data
    }

validation_split = 1000 / len(encoder_input_data)

model_train.fit(x=x_data, y=y_data, epochs=10, batch_size=64,
                callbacks=callbacks, validation_split=validation_split)

Train on 9999 samples, validate on 1000 samples
Epoch 1/10

Epoch 00001: val_loss did not improve from 2.25424
Epoch 2/10

Epoch 00002: val_loss did not improve from 2.25424
Epoch 3/10

Epoch 00003: val_loss did not improve from 2.25424
Epoch 4/10

Epoch 00004: val_loss did not improve from 2.25424
Epoch 00004: early stopping


<keras.callbacks.callbacks.History at 0x7f43603c6320>

In [43]:
def translate(input_text, true_output_text=None):
  input_tokens = tokenizer_src.text_to_tokens(text=input_text, padding=True, reverse=True)
  initial_state = model_encoder.predict(input_tokens)
  max_tokens = tokenizer_dest.max_tokens
  shape = (1, max_tokens)
  decoder_input_data = np.zeros(shape=shape, dtype=np.int)
  token_int = token_start
  output_text = ''
  count_tokens = 0
  while token_int != token_end and count_tokens < max_tokens:
    decoder_input_data[0, count_tokens] = token_int
    x_data = \
      {
          'decoder_initial_state' : initial_state,
          'decoder_input' : decoder_input_data  
      }
    decoder_output = model_decoder.predict(x_data)
    token_onehot = decoder_output[0, count_tokens, :]
    token_int = np.argmax(token_onehot)
    sampled_word = tokenizer_dest.token_to_words(token_int)
    output_text += ' ' + sampled_word
    count_tokens += 1

  output_tokens = decoder_input_data[0]

  print("Input text : ")
  print(input_text)
  print()

  print("Output text : ")
  print(output_text)
  print()

  if true_output_text is not None:
    print("True output text : ")
    print(true_output_text)
    print()

In [44]:
idx = 2111
translate(input_text=data_src[idx], true_output_text=data_dest[idx])
input_text = 'Es un milagro que se salvaran.'
true_output_text = 'It is a miracle that they were saved.'
translate(input_text=input_text, true_output_text=true_output_text)

Input text : 
Conque vuelta a empezar como hace veinte años con el Amoco, un golpe en la costa septentrional, otro en la costa meridional, otro en la costa occidental.

Output text : 
 of the council were not been aware of the fpö of the commission and the fpö of the austrian people' s party and the commission has been aware of the austrian government eeee

True output text : 
ssssSo we are now experiencing the same thing that happened 20 years ago with the Amoco Cadiz; a wreck on the North coast, a wreck on the South coast, a wreck on the West coast.eeee

Input text : 
Es un milagro que se salvaran.

Output text : 
 it is not eeee

True output text : 
It is a miracle that they were saved.

