In [1]:
# The Kaggle API client expects this file to be in ~/.kaggle,
# so move it there.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# This permissions change avoids a warning on Kaggle tool startup.
!chmod 600 ~/.kaggle/kaggle.json

cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory


In [2]:
!kaggle datasets download -d lonnieqin/englishspanish-translation-dataset

Dataset URL: https://www.kaggle.com/datasets/lonnieqin/englishspanish-translation-dataset
License(s): unknown
Downloading englishspanish-translation-dataset.zip to /content
 74% 2.00M/2.72M [00:01<00:00, 2.18MB/s]
100% 2.72M/2.72M [00:01<00:00, 2.44MB/s]


In [3]:
!unzip /content/englishspanish-translation-dataset.zip

Archive:  /content/englishspanish-translation-dataset.zip
  inflating: data.csv                


# Machine Translation

In [4]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import model_selection
import tqdm
import re

In [5]:
data_df=pd.read_csv("/content/data.csv")

In [6]:
data_df

Unnamed: 0,english,spanish
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.
...,...,...
118959,There are four main causes of alcohol-related ...,Hay cuatro causas principales de muertes relac...
118960,There are mothers and fathers who will lie awa...,Hay madres y padres que se quedan despiertos d...
118961,A carbon footprint is the amount of carbon dio...,Una huella de carbono es la cantidad de contam...
118962,Since there are usually multiple websites on a...,Como suele haber varias páginas web sobre cual...


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


In [7]:
def clean_text(text):
  text=text.lower()
  text=re.sub("\[.*?\]","",text)
  text=re.sub("https?://\s+|www\.\s+","",text)
  text=re.sub("<.*?>+","",text)  #code html
  text=re.sub("\n"," ",text)
  text=re.sub(r"[^\w]"," ",text)
  text=re.sub("\w*\d\w*","",text)
  return text

data_df.english= data_df.english.map(clean_text)
data_df.spanish= data_df.spanish.map(clean_text)

In [8]:
data_df

Unnamed: 0,english,spanish
0,go,ve
1,go,vete
2,go,vaya
3,go,váyase
4,hi,hola
...,...,...
118959,there are four main causes of alcohol related ...,hay cuatro causas principales de muertes relac...
118960,there are mothers and fathers who will lie awa...,hay madres y padres que se quedan despiertos d...
118961,a carbon footprint is the amount of carbon dio...,una huella de carbono es la cantidad de contam...
118962,since there are usually multiple websites on a...,como suele haber varias páginas web sobre cual...




In [9]:
#start and End word
def add_start_end(text):
  text= f"<start> {text} <end>"
  return text

data_df.english= data_df.english.map(add_start_end)
data_df.spanish= data_df.spanish.map(add_start_end)

In [10]:
#Tokenization
def tokenize(lang):
  long_tokenizer= tf.keras.preprocessing.text.Tokenizer( filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',oov_token="<oov>")
  long_tokenizer.fit_on_texts(lang)
  tensor= long_tokenizer.texts_to_sequences(lang)
  tensor= tf.keras.preprocessing.sequence.pad_sequences(tensor,padding="post")
  return tensor, long_tokenizer

eng_sequence,eng_tokenizer =tokenize(data_df.english)
spn_sequence,spn_tokenizer =tokenize(data_df.spanish)

In [11]:
#split
x_train,x_test,y_train,y_test=model_selection.train_test_split(eng_sequence,spn_sequence,test_size=0.1,random_state=42)
print(f"X_train Shape: {x_train.shape}")
print(f"X_test Shape: {x_test.shape}")
print(f"y_train Shape: {y_train.shape}")
print(f"y_test Shape: {y_test.shape}")

X_train Shape: (107067, 49)
X_test Shape: (11897, 49)
y_train Shape: (107067, 51)
y_test Shape: (11897, 51)


In [12]:
# show tokenize ----> text
def convert(lang, tensor):
  for t in tensor:
    if t!=0:
      print("%d ----> %s" % (t, lang.index_word[t]))

print("Endlish")
convert(eng_tokenizer,x_train[0])
print("\n")
print("Spanish")
convert(spn_tokenizer,y_train[0])


Endlish
2 ----> <start>
45 ----> there
74 ----> were
140 ----> two
1461 ----> pieces
17 ----> of
477 ----> cake
3 ----> <end>


Spanish
2 ----> <start>
103 ----> había
81 ----> dos
4003 ----> pedazos
4 ----> de
919 ----> torta
3 ----> <end>


In [13]:
vocab_inp_size=len(eng_tokenizer.word_index)+1    #1--> padding
vocab_tar_size=len(spn_tokenizer.word_index)+1
batch_size=32
units=1024
embedding_dims=256  #hidden layer


In [14]:
# pipline model
def create_dataset(x,y,batch_size=32):
  data=tf.data.Dataset.from_tensor_slices((x,y))
  data=data.shuffle(1028)
  data=data.batch(batch_size,drop_remainder=True)
  data=data.prefetch(tf.data.experimental.AUTOTUNE)
  return data
train_dataset=create_dataset(x_train,y_train)
test_dataset=create_dataset(x_test,y_test)

In [15]:
for eng, span in train_dataset.take(1):
  print(f"Endlish: {eng.shape}")
  print(f"spanish: {span.shape}")


Endlish: (32, 49)
spanish: (32, 51)


In [16]:
# Encoding
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size,embedding_dims,encoder_units,batch_size):
        super(Encoder, self).__init__()

        self.batch_size = batch_size
        self.encoder_units=encoder_units

        self.embedding= tf.keras.layers.Embedding(vocab_size,embedding_dims,mask_zero=True)
        self.gru= tf.keras.layers.GRU(encoder_units, return_sequences=True, return_state=True, recurrent_initializer="glorot_uniform")

    def call(self, x, hidden):
      x=self.embedding(x)
      output, state= self.gru(x, initial_state=hidden)
      return output, state


    def initilaize_hidden_state(self):
      return tf.zeros((self.batch_size, self.encoder_units))



In [17]:
# Decoder
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, decoder_units, batch_size):
      super(Decoder, self).__init__()

      self.batch_size = batch_size
      self.decoder_units = decoder_units
      self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, mask_zero=True)
      self.gru = tf.keras.layers.GRU(self.decoder_units,
                                           return_sequences=True,
                                           return_state=True,
                                           recurrent_initializer = 'glorot_uniform')

      self.fc = tf.keras.layers.Dense(vocab_size)


  def call(self, x, hidden):
    x = self.embedding(x)
    output, hidden = self.gru(x, initial_state = hidden)
    output = tf.reshape(output, (-1, output.shape[2]))
    x =  tf.nn.softmax(self.fc(output))
    return x, hidden


In [18]:
encoder=Encoder(vocab_inp_size, embedding_dims, units, batch_size )
sample_hidden= encoder.initilaize_hidden_state()
sample_output, sample_hidden= encoder(eng, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (32, 49, 1024)
Encoder Hidden state shape: (batch size, units) (32, 1024)


In [19]:
decoder = Decoder(vocab_tar_size, embedding_dims, units, batch_size)

sample_decoder_output, _ = decoder(tf.random.uniform((batch_size, 1)), sample_hidden)

print ('Decoder output shape: (batch size, vocab_size) {}'.format(sample_decoder_output.shape))


Decoder output shape: (batch size, vocab_size) (32, 25768)


In [20]:
# create the optimizer using the Adam optimizer
optimizer = tf.keras.optimizers.Adam()
# create the loss function
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False, reduction='none')

# define the loss function for the training
def loss_function(real, pred):
  # create the mask to ignore the padding tokens
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  # mask shape == (batch_size, sequence_length)
  # calculate the loss
  loss_ = loss_object(real, pred)
  # mask the loss
  # how the mask works:
  # if the value is 1, the loss is calculated
  # if the value is 0, the loss is ignored
    #[1,1,1,1,1,1,0,0,0,0,0] mask
    # *
    #[2,6,2,1,6,3,2,1,5,7,9] input
    # =
    #[2,6,2,1,6,3,0,0,0,0,0] output
  mask = tf.cast(mask, dtype=loss_.dtype)
  # mask shape == (batch_size, sequence_length)

  loss_ *= mask
  # calculate the average loss per batch
  return tf.reduce_mean(loss_)



In [21]:
# create the training metric
train_loss = tf.metrics.Mean(name='train loss')
# create the testing metric
test_loss =tf.metrics.Mean(name='test loss')

In [22]:
# # create the training step
# # using the tf.function decorator to speed up the training process by converting the training function to a TensorFlow graph
# @tf.function
# # define the training step
# def train_step(inputs, target, enc_hidden):
#   # the encoder_hidden is the initial hidden state of the encoder
#   # enc_hidden shape == (batch_size, hidden_size)

#   # inilaize the loss to zero
#   loss = 0
#   # create the gradient tape to record the gradient of the loss with respect to the weights

#   with tf.GradientTape() as tape:
#     # pass the input to the encoder
#     # enc_output shape == (batch_size, 49, hidden_size)
#     # enc_hidden shape == (batch_size, hidden_size)
#     # using the encoder to get the encoder_output and the encoder_hidden
#     # using the encoder_hidden as the initial hidden state of the decoder
#     enc_output, enc_hidden = encoder(inputs, enc_hidden)
#     # set the initial decoder hidden state to the encoder hidden state
#     dec_hidden = enc_hidden

#     # create the start token
#     # start_token shape == (batch_size, 1)
#     # repeat the start token for the batch size times
#     dec_input = tf.expand_dims([spn_tokenizer.word_index['']] * inputs.shape[0], 1)

#     # Teacher forcing - feeding the target as the next input

#     for t in range(1, target.shape[1]):
#       # passing enc_output to the decoder
#       predictions, dec_hidden = decoder(dec_input, dec_hidden)
#       # calculate the loss for the current time step using the loss function
#       loss += loss_function(target[:, t], predictions)

#       # using teacher forcing
#       dec_input = tf.expand_dims(target[:, t], 1)
#   # calculate the loss for the current batch
#   batch_loss = (loss / int(target.shape[1]))

#   # get the trainable variables
#   variables = encoder.trainable_variables + decoder.trainable_variables
#   # calculate the gradients using the tape
#   gradients = tape.gradient(loss, variables)
#   # update the trainable variables
#   optimizer.apply_gradients(zip(gradients, variables))
#   # add the loss to the training loss metric
#   train_loss(batch_loss)
#   return batch_loss





# ensure that the spn_tokenizer.word_index dictionary contains the empty string
spn_tokenizer.word_index[''] = 0

# create the training step
@tf.function
def train_step(inputs, target, enc_hidden):
  # the encoder_hidden is the initial hidden state of the encoder
  # enc_hidden shape == (batch_size, hidden_size)

  # inilaize the loss to zero
  loss = 0
  # create the gradient tape to record the gradient of the loss with respect to the weights

  with tf.GradientTape() as tape:
    # pass the input to the encoder
    # enc_output shape == (batch_size, 49, hidden_size)
    # enc_hidden shape == (batch_size, hidden_size)
    # using the encoder to get the encoder_output and the encoder_hidden
    # using the encoder_hidden as the initial hidden state of the decoder
    enc_output, enc_hidden = encoder(inputs, enc_hidden)
    # set the initial decoder hidden state to the encoder hidden state
    dec_hidden = enc_hidden

    # create the start token
    # start_token shape == (batch_size, 1)
    # repeat the start token for the batch size times
    dec_input = tf.expand_dims([spn_tokenizer.word_index['']] * inputs.shape[0], 1)

    # Teacher forcing - feeding the target as the next input

    for t in range(1, target.shape[1]):
      predictions, dec_hidden = decoder(dec_input, dec_hidden)
      # calculate the loss for the current time step using the loss function
      loss += loss_function(target[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(target[:, t], 1)
  # calculate the loss for the current batch
  batch_loss = (loss / int(target.shape[1]))

  # get the trainable variables
  variables = encoder.trainable_variables + decoder.trainable_variables
  # calculate the gradients using the tape
  gradients = tape.gradient(loss, variables)
  # update the trainable variables
  optimizer.apply_gradients(zip(gradients, variables))
  # add the loss to the training loss metric
  train_loss(batch_loss)
  return batch_loss




In [23]:
# create the training step
# using the tf.function decorator to speed up the training process by converting the training function to a TensorFlow graph
@tf.function
def test_step(inputs, target, enc_hidden):
    # the encoder_hidden is the initial hidden state of the encoder
    # enc_hidden shape == (batch_size, hidden_size)
    # inilaize the loss to zero
    loss = 0
    # pass the input to the encoder
    # enc_output shape == (batch_size, 49, hidden_size)
    # enc_hidden shape == (batch_size, hidden_size)
    # using the encoder to get the encoder_output and the encoder_hidden
    enc_output, enc_hidden = encoder(inputs, enc_hidden)
    # set the initial decoder hidden state to the encoder hidden state
    dec_hidden = enc_hidden
    # create the start token
    # start_token shape == (batch_size, 1)
    # repeat the start token for the batch size times
    dec_input = tf.expand_dims([spn_tokenizer.word_index['']] * inputs.shape[0], 1)
    for t in range(1, target.shape[1]):
        # passing enc_output to the decoder with dec_hidden as the initial hidden state
        predictions, dec_hidden = decoder(dec_input, dec_hidden)
        # calculate the loss for the current time step using the loss function
        loss += loss_function(target[:, t], predictions)

        # using teacher forcing
        dec_input = tf.expand_dims(target[:, t], 1)
    # calculate the loss for the current batch
    batch_loss = (loss / int(target.shape[1]))
    # add the batch loss to the test loss metric
    test_loss(batch_loss)

In [68]:
save_best_model = tf.keras.callbacks.ModelCheckpoint('/content/drive/MyDrive/AI project', save_best_only=True)

In [24]:
# set the epochs to 10
EPOCHS = 3
# set the old test loss to high number

old_test_loss=1000000
# create the training loop
for epoch in range(EPOCHS):
    # reset the training loss metric
    train_loss.reset_states()
    # reset the testing loss metric
    test_loss.reset_states()

    # initalize the hidden state of the encoder to zeros
    enc_hidden = encoder.initilaize_hidden_state()
    # create the training progress bar set the total number of batches to the length of the training dataset and the batch size to the test size
    steps_per_epoch =eng_sequence.shape[0]//batch_size #=> 4356 batch in the dataset
    bar = tf.keras.utils.Progbar(target=steps_per_epoch)

    count=0
    # iterate over the training dataset
    for (batch, (inputs, target)) in enumerate(train_dataset):
        # update the progress bar
        count += 1
        # run the training step
        batch_loss = train_step(inputs, target, enc_hidden)
        bar.update(count)  # manually update the progress bar




    # iterate over the testing dataset
    for (batch, (inputs, target)) in enumerate(test_dataset):
        count += 1
        # run the testing step
        batch_loss = test_step(inputs, target, enc_hidden)
        bar.update(count)

    # save the best performance model on the test dataset
    if old_test_loss> test_loss.result():
        # set the old test loss to the test loss
        old_test_loss= test_loss.result()
        encoder.save(filepath='/content/models/encoder')
        decoder.save(filepath='/content/models/decoder')
        print('Model is saved')
    # print the training and testing loss
    print('#' * 50)
    print(f'Epoch #{epoch + 1}')
    print(f'Training Loss {train_loss.result()}')
    print(f'Testing Loss {test_loss.result()}')
    print('#' * 50)


##################################################
Epoch #1
Training Loss 0.6609793901443481
Testing Loss 0.5006110072135925
##################################################
##################################################
Epoch #2
Training Loss 0.3923034071922302
Testing Loss 0.4223499894142151
##################################################
##################################################
Epoch #3
Training Loss 0.2636062502861023
Testing Loss 0.4110017716884613
##################################################


In [63]:
# create the translate function
# the translate function takes in the question as input and answers the input sentence
def translate(sentence):

  # clean the input question sentence
  sentence = clean_text(sentence)
  # add the start token to the sentence
  sentence =add_start_end(sentence)
  # tokenize the sentence
  inputs = eng_tokenizer.texts_to_sequences([sentence])
  # pad the sentence
  inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                         maxlen=29,
                                                         padding='post')

  # initalize the hidden state of the encoder to zeros
  hidden = [tf.zeros((1, units))]

  # pass the sentence to the encoder with the hidden state as the initial hidden state
  enc_out, enc_hidden = encoder(inputs, hidden)

  # set the initial decoder hidden state to the encoder hidden state
  dec_hidden = enc_hidden

  # create the start token
  # start_token shape == (batch_size, 1)
  # repeat the start token for the batch size times
  dec_input = tf.expand_dims([spn_tokenizer.word_index['<start>']], 0)

  # create the result string
  result = ''

  # loop over the length of the sentence (32)
  for t in range(32):
    # passing the encoder output and the decoder hidden state to the decoder make sure the decoder input is the previous predicted word
    predictions, dec_hidden = decoder(dec_input, dec_hidden)

    # getting the predicted word index
    predicted_id = tf.argmax(predictions[0]).numpy()

    # getting the predicted word using the predicted index
    # add the predicted word to the result string
    result += spn_tokenizer.index_word[predicted_id] + ' '

    # if the predicted word is the  token then stop the loop
    if spn_tokenizer.index_word[predicted_id] == '<end>':
      # remove the  and  tokens from the result string
      result = result.replace('<start>', '')
      result = result.replace('<end>','')
      # remove the  and  tokens from the sentence string
      sentence = sentence.replace('<start>', '')
      sentence = sentence.replace('<end>', '')
      return  sentence, result

    # using the predicted word as the next decoder input
    dec_input = tf.expand_dims([predicted_id], 0)

  # remove the  and  tokens from the result string
  result = result.replace('<start>', '')
  result = result.replace('<end>','')
  # remove the  and  tokens from the sentence string
  sentence = sentence.replace('<start>', '')
  sentence = sentence.replace('<end>', '')

  # return the result string and the original sentence
  return sentence, result


In [66]:
translate("there are four main causes of alcohol")

(' there are four main causes of alcohol ',
 'cuatro causas principales de muertes  ')

In [67]:
translate("there are mothers and fathers who wil")

(' there are mothers and fathers who wil ',
 'y elige ambas propios responsabilidades  ')

In [57]:
data_df.english

0                                         <start> go  <end>
1                                         <start> go  <end>
2                                         <start> go  <end>
3                                         <start> go  <end>
4                                         <start> hi  <end>
                                ...                        
118959    <start> there are four main causes of alcohol ...
118960    <start> there are mothers and fathers who will...
118961    <start> a carbon footprint is the amount of ca...
118962    <start> since there are usually multiple websi...
118963    <start> if you want to sound like a native spe...
Name: english, Length: 118964, dtype: object