# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, Attention
from tensorflow.keras.layers import TextVectorization
import pathlib
import random
import string
import re
from tensorflow import keras
from tensorflow.keras import layers
import sklearn
from sklearn import model_selection
from sklearn.model_selection import train_test_split

# Data

In [2]:
!kaggle datasets download -d lonnieqin/englishspanish-translation-dataset

Dataset URL: https://www.kaggle.com/datasets/lonnieqin/englishspanish-translation-dataset
License(s): unknown
Downloading englishspanish-translation-dataset.zip to /content
100% 2.72M/2.72M [00:00<00:00, 4.98MB/s]
100% 2.72M/2.72M [00:00<00:00, 4.26MB/s]


In [3]:
!unzip englishspanish-translation-dataset.zip

Archive:  englishspanish-translation-dataset.zip
  inflating: data.csv                


## Preprocessing

In [4]:
# Load the English/Spanish sentence pairs from the CSV extracted by the unzip step.
# NOTE(review): the path is absolute (/content/...), which matches the download
# location printed above — confirm it when running anywhere else.
data = pd.read_csv('/content/data.csv')
data #DataFrame

Unnamed: 0,english,spanish
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.
...,...,...
118959,There are four main causes of alcohol-related ...,Hay cuatro causas principales de muertes relac...
118960,There are mothers and fathers who will lie awa...,Hay madres y padres que se quedan despiertos d...
118961,A carbon footprint is the amount of carbon dio...,Una huella de carbono es la cantidad de contam...
118962,Since there are usually multiple websites on a...,Como suele haber varias páginas web sobre cual...


In [5]:
def clean_text(text):
    """Normalize a raw sentence before tokenization.

    Steps, in order: lowercase, drop ``[...]`` spans, drop URLs, drop
    ``<...>`` tags, turn newlines into spaces, turn every remaining
    non-word character into a space, and delete any word that contains a
    digit.

    Runs of spaces are NOT collapsed and the result is not stripped, so the
    output may contain leading, trailing or doubled spaces.

    Args:
        text: The sentence to clean (``str``).

    Returns:
        The cleaned, lowercase string.
    """
    text = text.lower()  # Convert text to lowercase
    # Raw strings keep the regex escapes (\[, \S, \w, \d) out of Python's own
    # string-escape processing, which warns on unrecognized escapes.
    text = re.sub(r'\[.*?\]', '', text)  # Remove text within square brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>+', '', text)  # Remove HTML tags
    # Replace (not delete) newlines so words on adjacent lines stay separate.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'[^\w]', ' ', text)  # Non-word characters become spaces
    text = re.sub(r'\w*\d\w*', '', text)  # Remove words containing digits
    return text

# Apply clean_text to every sentence of both language columns.
# The cleaned text overwrites the original columns, so the raw sentences are
# no longer available from `data` after this cell runs.
data.english = data.english.map(clean_text)
data.spanish = data.spanish.map(clean_text)

In [6]:
def add_start_end(text):
  """Return ``text`` wrapped in the ``<start>`` and ``<end>`` marker tokens."""
  return '<start> {} <end>'.format(text)

# Wrap every sentence of both columns in <start>/<end> markers.
# NOTE: the columns are overwritten in place, so running this cell a second
# time wraps the already-wrapped text again.
data.english = data.english.map(add_start_end)
data.spanish = data.spanish.map(add_start_end)

In [7]:
data

Unnamed: 0,english,spanish
0,<start> go <end>,<start> ve <end>
1,<start> go <end>,<start> vete <end>
2,<start> go <end>,<start> vaya <end>
3,<start> go <end>,<start> váyase <end>
4,<start> hi <end>,<start> hola <end>
...,...,...
118959,<start> there are four main causes of alcohol ...,<start> hay cuatro causas principales de muert...
118960,<start> there are mothers and fathers who will...,<start> hay madres y padres que se quedan desp...
118961,<start> a carbon footprint is the amount of ca...,<start> una huella de carbono es la cantidad d...
118962,<start> since there are usually multiple websi...,<start> como suele haber varias páginas web so...


In [8]:
def tokenize(lang):
  """Fit a word-level tokenizer on ``lang`` and encode it as padded id rows.

  Args:
    lang: Iterable of sentences (strings) in one language.

  Returns:
    A ``(tensor, lang_tokenizer)`` tuple: the post-padded array returned by
    ``pad_sequences`` and the fitted ``Tokenizer``.
  """
  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
      # '<' and '>' are deliberately absent from the filter set so that the
      # <start>/<end> markers are kept as tokens. The backslash is written
      # as '\\' (same runtime character) to avoid the invalid '\]' escape.
      filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n', oov_token=''
  )
  lang_tokenizer.fit_on_texts(lang)
  tensor = lang_tokenizer.texts_to_sequences(lang)
  # padding='post' appends the filler after each sentence's tokens.
  tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
  return tensor, lang_tokenizer

In [9]:
# Build one tokenizer per language; each call returns the padded id matrix
# together with the tokenizer that produced it.
eng_sequence, eng_tokenizer = tokenize(data.english)
spn_sequence, spn_tokenizer = tokenize(data.spanish)

In [10]:
# Hold out 10% of the sentence pairs for testing; random_state pins the
# split so it is repeatable across runs.
x_train, x_test, y_train, y_test = model_selection.train_test_split(eng_sequence,
                spn_sequence, test_size = 0.1, random_state=42)

# Show the shapes of the four arrays as the cell output.
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((107067, 49), (11897, 49), (107067, 51), (11897, 51))

# Converter

In [11]:
def convert(lang, tensor):
  """Print each non-zero token id in ``tensor`` alongside its word.

  ``lang`` must expose an ``index_word`` mapping from id to word. Ids equal
  to 0 are skipped.
  """
  for token_id in tensor:
    if token_id == 0:
      continue
    word = lang.index_word[token_id]
    print('%d---> %s' % (token_id, word))

# Sanity check: decode the first training pair back into words, one
# "id---> word" line per non-padding token.
print('English')
convert(eng_tokenizer, x_train[0])
print()
print('Spanish')
convert(spn_tokenizer, y_train[0])

English
2---> <start>
45---> there
74---> were
140---> two
1461---> pieces
17---> of
477---> cake
3---> <end>

Spanish
2---> <start>
103---> había
81---> dos
4003---> pedazos
4---> de
919---> torta
3---> <end>


In [12]:
# Vocabulary sizes for the embedding/output layers. The +1 makes room for
# id 0, which appears only as the padding value in the padded sequences.
vocab_inp_size = len(eng_tokenizer.word_index)+1
vocab_tar_size =  len(spn_tokenizer.word_index)+1
# Model and training hyperparameters used by the cells below.
embedding_dim = 256
units = 1024
batch_size=32

# Creating the Dataset

In [13]:
def create_dataset(x, y, batch_size=32, shuffle_buffer=1028):
  """Build a shuffled, batched and prefetched ``tf.data.Dataset`` of pairs.

  Args:
    x: Source-language id sequences.
    y: Target-language id sequences, aligned row-for-row with ``x``.
    batch_size: Number of pairs per batch. A final short batch is dropped,
      so every batch has exactly this many rows.
    shuffle_buffer: Size of the shuffle buffer (defaults to the value that
      was previously hard-coded).

  Returns:
    A dataset yielding ``(x_batch, y_batch)`` tuples.
  """
  dataset = tf.data.Dataset.from_tensor_slices((x, y))
  dataset = dataset.shuffle(shuffle_buffer)
  dataset = dataset.batch(batch_size, drop_remainder=True)
  # tf.data.AUTOTUNE is the non-experimental name already used by the
  # training cell of this notebook.
  return dataset.prefetch(tf.data.AUTOTUNE)

# Both splits go through the same pipeline, so the test split is also
# shuffled and also drops its final incomplete batch.
train_dataset = create_dataset(x_train, y_train)
test_dataset = create_dataset(x_test, y_test)

In [14]:
# Pull a single batch and print its shape and contents to verify the pipeline.
for eng, spn in train_dataset.take(1):
  print(f'English:{eng.shape}\n{eng}')

  print(f'Spanish:{spn.shape}\n{spn}')

English:(32, 49)
[[   2   12  225 ...    0    0    0]
 [   2   46   51 ...    0    0    0]
 [   2   12   75 ...    0    0    0]
 ...
 [   2    4  512 ...    0    0    0]
 [   2    8 1157 ...    0    0    0]
 [   2   27   10 ...    0    0    0]]
Spanish:(32, 51)
[[   2   16   14 ...    0    0    0]
 [   2   15 9165 ...    0    0    0]
 [   2    7  287 ...    0    0    0]
 ...
 [   2    7   53 ...    0    0    0]
 [   2    8 1767 ...    0    0    0]
 [   2  158   12 ...    0    0    0]]


# Encoder and Attention

In [15]:
# Encoder class
class Encoder(tf.keras.Model):
  """Embeds source token ids and runs them through a single LSTM.

  ``call`` returns the LSTM's per-timestep outputs together with its final
  hidden and cell states.
  """

  def __init__(self, vocab_size, embedding_dim, enc_units, batch_size):
    super().__init__()
    # Stored for reference only; not read by call().
    self.batch_size = batch_size
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.lstm = tf.keras.layers.LSTM(
        self.enc_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform',
    )

  def call(self, x):
    embedded = self.embedding(x)
    sequence_out, final_h, final_c = self.lstm(embedded)
    return sequence_out, final_h, final_c

# Attention class
class BahdanauAttention(tf.keras.layers.Layer):
  """Additive attention: scores ``values`` against a single ``query``.

  ``call`` returns the attention-weighted sum of ``values`` over axis 1 and
  the attention weights themselves.
  """

  def __init__(self, units):
    super().__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    # Insert an axis at position 1 so the query broadcasts against values.
    query_expanded = tf.expand_dims(query, 1)
    energy = tf.nn.tanh(self.W1(values) + self.W2(query_expanded))
    score = self.V(energy)
    # Normalize the scores along axis 1.
    attention_weights = tf.nn.softmax(score, axis=1)
    context_vector = tf.reduce_sum(attention_weights * values, axis=1)
    return context_vector, attention_weights

# Decoder

In [16]:
# Decoder class
class Decoder(tf.keras.Model):
  """Attention decoder: one call consumes the token(s) in ``x`` and returns logits.

  ``call`` returns ``(logits, state_h, state_c, attention_weights)``.
  """

  def __init__(self, vocab_size, embedding_dim, dec_units, batch_size):
    super().__init__()
    # Stored for reference only; not read by call().
    self.batch_size = batch_size
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.lstm = tf.keras.layers.LSTM(
        self.dec_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform',
    )
    self.fc = tf.keras.layers.Dense(vocab_size)

    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    # `hidden` is only used as the attention query.
    context_vector, attention_weights = self.attention(hidden, enc_output)
    embedded = self.embedding(x)
    # Prepend the context vector to the token embedding along the feature axis.
    lstm_in = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)
    # NOTE(review): the LSTM is called without an initial_state, so `hidden`
    # does not seed the recurrent state here — confirm this is intended.
    output, state_h, state_c = self.lstm(lstm_in)
    # Merge the leading axes so the Dense layer sees one row per timestep.
    flat_output = tf.reshape(output, (-1, output.shape[2]))
    logits = self.fc(flat_output)
    return logits, state_h, state_c, attention_weights

In [17]:
# Define Encoder, Decoder, and Optimizer
# Encoder and decoder share embedding_dim/units; only the vocabulary differs.
encoder = Encoder(vocab_inp_size, embedding_dim, units, batch_size)
decoder = Decoder(vocab_tar_size, embedding_dim, units, batch_size)
optimizer = tf.keras.optimizers.Adam()
# from_logits=True: the decoder's final Dense layer has no activation.
# reduction='none' keeps one loss value per example so loss_function can
# zero out padded positions before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
  """Cross-entropy between ``real`` ids and ``pred`` logits, ignoring padding.

  Positions where ``real`` is 0 contribute zero loss; the result is the mean
  over all positions (padded ones included in the denominator).
  """
  per_example = loss_object(real, pred)
  is_real_token = tf.math.not_equal(real, 0)
  weights = tf.cast(is_real_token, dtype=per_example.dtype)
  return tf.reduce_mean(per_example * weights)

# Training Loop

In [18]:
import tensorflow as tf
import tqdm

# Set up mixed precision policy for faster training
# NOTE(review): `encoder` and `decoder` were instantiated in an earlier cell,
# before this policy is set; Keras layers presumably pick up the dtype policy
# when they are constructed, so this may not affect them — confirm.
# NOTE(review): train_step applies gradients without any loss scaling —
# verify that is acceptable before relying on float16 compute.
policy = tf.keras.mixed_precision.Policy('mixed_float16')
tf.keras.mixed_precision.set_global_policy(policy)

@tf.function
def train_step(inp, targ, enc_hidden):
  """Run one teacher-forced optimisation step on a single batch.

  Args:
    inp: Batch of source-language token ids.
    targ: Batch of target-language token ids. Columns 1..end are used both
      as the prediction targets and as the next decoder inputs.
    enc_hidden: Not read by this function; kept so existing callers that
      pass it keep working.

  Returns:
    The loss summed over decoding steps, divided by the target width.
  """
  loss = 0
  start_id = spn_tokenizer.word_index['<start>']

  with tf.GradientTape() as tape:
    enc_output, enc_hidden_h, enc_hidden_c = encoder(inp)
    dec_hidden_h, dec_hidden_c = enc_hidden_h, enc_hidden_c
    # Size the <start> column from the batch itself rather than the global
    # batch_size, so the step does not depend on that global matching the
    # dataset's actual batch size.
    dec_input = tf.fill([tf.shape(targ)[0], 1], start_id)

    for t in range(1, targ.shape[1]):
      predictions, dec_hidden_h, dec_hidden_c, _ = decoder(dec_input, dec_hidden_h, enc_output)
      loss += loss_function(targ[:, t], predictions)
      # Teacher forcing: feed the ground-truth token as the next input.
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = loss / int(targ.shape[1])
  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

# Number of full passes over the training set.
EPOCHS = 4

# Prefetch data for faster training
# (create_dataset already ends with a prefetch; this adds a second one.)
train_dataset = train_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

for epoch in range(EPOCHS):
  total_loss = 0
  num_batches = 0
  # Passed to train_step to satisfy its signature; train_step does not read it.
  enc_hidden = [tf.zeros((batch_size, units)) for _ in range(2)]

  # Wrap the train dataset with tqdm for progress monitoring
  with tqdm.tqdm(total=len(train_dataset), desc=f'Epoch {epoch + 1}') as pbar:
    for (batch, (inp, targ)) in enumerate(train_dataset):
      batch_loss = train_step(inp, targ, enc_hidden)
      total_loss += batch_loss
      num_batches += 1
      pbar.update(1)

    # Report the mean of the per-batch losses for this epoch.
    avg_loss = total_loss / num_batches
    print(f'Epoch {epoch + 1} Loss {avg_loss.numpy():.4f}')

Epoch 1: 100%|██████████| 3345/3345 [35:34<00:00,  1.57it/s]


Epoch 1 Loss 0.6869


Epoch 2: 100%|██████████| 3345/3345 [35:03<00:00,  1.59it/s]


Epoch 2 Loss 0.4252


Epoch 3: 100%|██████████| 3345/3345 [35:03<00:00,  1.59it/s]


Epoch 3 Loss 0.2672


Epoch 4: 100%|██████████| 3345/3345 [35:03<00:00,  1.59it/s]

Epoch 4 Loss 0.1841





# Translation Process

In [47]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define the maximum lengths
# max_len_input: width the encoded input sentence is padded/truncated to.
# max_len_target: upper bound on the number of decoding steps.
# NOTE(review): the training arrays printed earlier are 49 (English) and
# 51 (Spanish) columns wide, so inference uses a different padded width
# than training — confirm 20 is intended.
max_len_input = 20  # Adjust based on your dataset
max_len_target = 20  # Adjust based on your dataset

def evaluate(sentence):
  """Greedily translate one English sentence into Spanish.

  The sentence is cleaned and wrapped in <start>/<end> exactly like the
  training data, encoded with the fitted English tokenizer, and decoded one
  token at a time (always taking the arg-max) until <end> is produced or
  ``max_len_target`` steps have run.

  Args:
    sentence: Raw English text.

  Returns:
    The predicted Spanish words joined by single spaces.
  """
  sentence = clean_text(sentence)
  sentence = '<start> ' + sentence + ' <end>'
  # Encode with the tokenizer itself so inference matches training:
  # splitting on ' ' by hand produced empty tokens for the extra spaces
  # clean_text leaves behind (which hit the '' out-of-vocabulary entry), and
  # sent unknown words to the padding id 0 instead of the OOV id.
  inputs = eng_tokenizer.texts_to_sequences([sentence])
  inputs = pad_sequences(inputs, maxlen=max_len_input, padding='post')
  inputs = tf.convert_to_tensor(inputs)

  enc_output, enc_hidden_h, enc_hidden_c = encoder(inputs)
  dec_hidden_h, dec_hidden_c = enc_hidden_h, enc_hidden_c
  dec_input = tf.expand_dims([spn_tokenizer.word_index['<start>']], 0)

  end_id = spn_tokenizer.word_index['<end>']
  words = []
  for step in range(max_len_target):
    predictions, dec_hidden_h, dec_hidden_c, _ = decoder(dec_input, dec_hidden_h, enc_output)
    predicted_id = int(tf.argmax(predictions[0]).numpy())

    if predicted_id == end_id:
      break

    # Id 0 (padding) has no index_word entry and the OOV word is '';
    # neither should raise or add blank output.
    word = spn_tokenizer.index_word.get(predicted_id, '')
    if word:
      words.append(word)

    # Feed the prediction back in as the next decoder input.
    dec_input = tf.expand_dims([predicted_id], 0)

  return ' '.join(words)

def translate(num_sentences=10):
    """Interactively translate sentences typed by the user.

    Prompts for a sentence, prints its translation from ``evaluate`` and a
    separator line, and repeats.

    Args:
        num_sentences: How many sentences to prompt for (default 10, the
            previously hard-coded count).
    """
    for _ in range(num_sentences):
        sentence = input("Enter a sentence: ")
        result = evaluate(sentence)
        print(f'Translated: {result}')
        print("-" * 50)

translate()

Enter a sentence: hi
Translated: hola
--------------------------------------------------
Enter a sentence: how are you
Translated: cómo estás
--------------------------------------------------
Enter a sentence: I want cheese
Translated: quiero queso
--------------------------------------------------
Enter a sentence: I love you
Translated: te amo
--------------------------------------------------
Enter a sentence: I know how to speak english
Translated: sé cómo hablar inglés
--------------------------------------------------
Enter a sentence: I am hungry
Translated: tengo hambre
--------------------------------------------------
Enter a sentence: the sky is blue
Translated: el cielo es azul
--------------------------------------------------
Enter a sentence: have a great day
Translated: hay un gran día
--------------------------------------------------
Enter a sentence: what is your name?
Translated: qué impresión tienes tu nombre
--------------------------------------------------
Ente