In [1]:
import gc
import pathlib
import random
import re
import string
from string import digits

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization

In [2]:
!unzip data.zip -d trans

Archive:  data.zip
  inflating: trans/Hindi_English_Truncated_Corpus.csv  


In [3]:
# Read from the same relative directory the archive is extracted into
# (`-d trans`) instead of a hard-coded absolute path, so the notebook also
# works when the working directory is not /content.
DATA_DIR = pathlib.Path("trans")
df = pd.read_csv(DATA_DIR / "Hindi_English_Truncated_Corpus.csv")

In [4]:
# How many sentence pairs each sub-corpus contributes.
source_counts = df["source"].value_counts()
source_counts

tides        50000
ted          39881
indic2012    37726
Name: source, dtype: int64

In [5]:
# Keep only the rows whose `source` column is 'ted', then preview five of them.
is_ted = df["source"].eq("ted")
df = df.loc[is_ted]
df.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
7,ted,"And who are we to say, even, that they are wrong",और हम होते कौन हैं यह कहने भी वाले कि वे गलत हैं
13,ted,So there is some sort of justice,तो वहाँ न्याय है


In [6]:
# Raw sentence columns; both come from the same frame, so they stay row-aligned.
english_data = df["english_sentence"].values
hindi_data = df["hindi_sentence"].values

In [7]:
# Translation table that deletes ASCII digits, and the set of ASCII
# punctuation characters; both are shared by the two cleaning functions.
remove_digits = str.maketrans('', '', digits)
exclude = set(string.punctuation)


def clean_english(text_data):
  """Normalize English sentences.

  Each sentence is lower-cased, stripped of apostrophes, ASCII punctuation
  and ASCII digits, trimmed, and has runs of spaces collapsed to one space.

  Args:
    text_data: Iterable of strings.

  Returns:
    A list with one cleaned string per input sentence, in input order.
  """
  def _normalize(sentence):
    lowered = sentence.lower().replace("'", "")
    without_punct = "".join(filter(lambda ch: ch not in exclude, lowered))
    without_digits = without_punct.translate(remove_digits)
    # Trim first, then squeeze inner runs of spaces left by the removals.
    return re.sub(" +", " ", without_digits.strip())

  return [_normalize(sentence) for sentence in text_data]

def clean_hindi(text_data):
  """Normalize Hindi sentences and wrap them in start/end markers.

  Each sentence is lower-cased, stripped of apostrophes, ASCII punctuation,
  ASCII digits and Devanagari digits, trimmed, and has runs of spaces
  collapsed. The result is wrapped as 'START_ <sentence> _END'.

  Args:
    text_data: Iterable of strings.

  Returns:
    A list with one cleaned, marker-wrapped string per input sentence.
  """
  def _normalize(sentence):
    lowered = sentence.lower().replace("'", "")
    kept = "".join(filter(lambda ch: ch not in exclude, lowered))
    kept = kept.translate(remove_digits)
    # Devanagari digits are not covered by the ASCII translation table.
    kept = re.sub("[२३०८१५७९४६]", "", kept)
    kept = re.sub(" +", " ", kept.strip())
    return 'START_ ' + kept + " _END"

  return [_normalize(sentence) for sentence in text_data]
# Clean both columns; each output list keeps the order of its input array.
clean_data_eng = clean_english(english_data)
clean_data_hin = clean_hindi(hindi_data)

In [8]:
# Force a garbage-collection pass. `gc` is imported with the other modules
# in the first cell rather than mid-notebook.
gc.collect()

163

In [9]:
# The two cleaned lists must be row-aligned. Fail loudly if they are not,
# because zip() would otherwise silently drop the unmatched tail.
assert len(clean_data_eng) == len(clean_data_hin), "sentence lists are misaligned"

# One (english, hindi) tuple per sentence pair.
text_pairs = list(zip(clean_data_eng, clean_data_hin))

In [10]:
# Shuffle with a fixed seed into a *new* list: the split is reproducible
# across kernel restarts, and re-running this cell gives the same partition
# because `text_pairs` itself is no longer mutated.
SPLIT_SEED = 42
shuffled_pairs = random.Random(SPLIT_SEED).sample(text_pairs, k=len(text_pairs))

# 15% validation, 15% test, and the remainder for training.
num_val_samples = int(0.15 * len(shuffled_pairs))
num_train_samples = len(shuffled_pairs) - 2 * num_val_samples
train_pairs = shuffled_pairs[:num_train_samples]
val_pairs = shuffled_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = shuffled_pairs[num_train_samples + num_val_samples :]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")


39881 total pairs
27917 training pairs
5982 validation pairs
5982 test pairs


In [11]:
# Hyperparameters shared by the vectorizers, the input pipeline and the model.
vocab_size = 10_000     # vocabulary cap (max_tokens) of each TextVectorization layer
sequence_length = 20    # token positions kept per English sentence
batch_size = 64         # sentence pairs per dataset batch

In [12]:
# Separate the training tuples back into parallel English / Hindi lists.
train_eng_texts = [eng for eng, _ in train_pairs]
train_hin_texts = [hin for _, hin in train_pairs]

In [13]:
# English side: integer token ids, padded/truncated to `sequence_length`.
eng_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# Hindi side: one extra position, because format_dataset slices the ids into
# a decoder input (all but the last) and a target (all but the first).
hin_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
)



In [14]:
# Fit each vocabulary on the training split only.
for vectorizer, corpus in (
    (eng_vectorization, train_eng_texts),
    (hin_vectorization, train_hin_texts),
):
    vectorizer.adapt(corpus)


In [15]:
def format_dataset(eng,hin):
  """Vectorize a batch of sentence pairs into (model inputs, targets).

  Args:
    eng: Batch of English strings, passed to `eng_vectorization`.
    hin: Batch of Hindi strings, passed to `hin_vectorization`.

  Returns:
    A tuple `(inputs, targets)`. `inputs` is a dict with keys
    "encoder_inputs" (English ids) and "decoder_inputs" (Hindi ids without
    the last position); `targets` is the Hindi ids without the first
    position, i.e. the decoder input shifted one step ahead.
  """
  encoder_ids = eng_vectorization(eng)
  target_ids = hin_vectorization(hin)
  decoder_ids = target_ids[:, :-1]
  next_token_ids = target_ids[:, 1:]
  model_inputs = {"encoder_inputs": encoder_ids, "decoder_inputs": decoder_ids}
  return model_inputs, next_token_ids


#eng ,hin = format_dataset(clean_data_eng ,clean_data_hin)

def make_dataset(pairs):
    """Build a batched, vectorized tf.data pipeline from sentence pairs.

    Args:
        pairs: Non-empty sequence of (english, hindi) string tuples.

    Returns:
        A `tf.data.Dataset` yielding `format_dataset` outputs for batches of
        `batch_size` pairs, cached after vectorization, then shuffled at
        batch level and prefetched.
    """
    eng_texts, hin_texts = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices((list(eng_texts), list(hin_texts)))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    # cache() must precede shuffle(): a cache placed after shuffle() stores one
    # shuffled order and replays exactly that order on every later epoch.
    return dataset.cache().shuffle(2048).prefetch(16)

In [16]:
# Input pipelines for the training and validation splits.
train_ds, val_ds = (make_dataset(split) for split in (train_pairs, val_pairs))

In [17]:
# for i, t in train_ds.take(1):
#   print(i)
#   print(t)
#   break

In [18]:
class TransformerEncoder(layers.Layer):
  """Transformer encoder block.

  Multi-head self-attention followed by a two-layer feed-forward projection,
  each wrapped in a residual connection and layer normalization.

  Args:
    embed_dim: Output width of the feed-forward projection; also used as
      `key_dim` of the attention layer.
    dense_dim: Width of the hidden ReLU layer of the projection.
    num_heads: Number of attention heads.
    **kwargs: Forwarded to `layers.Layer`.
  """

  def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
    super().__init__(**kwargs)

    self.embed_dim = embed_dim
    self.dense_dim = dense_dim
    self.num_heads = num_heads

    self.attention = layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_dim
    )
    self.dense_proj = keras.Sequential([
        layers.Dense(dense_dim, activation="relu"),
        layers.Dense(embed_dim),
    ])
    self.layernorm1 = layers.LayerNormalization()
    self.layernorm2 = layers.LayerNormalization()
    # Keras looks for `supports_masking`; the previous spelling
    # `support_masking` was just an unused attribute, so the incoming mask
    # was not propagated to this layer's output.
    self.supports_masking = True

  def call(self, inputs, mask=None):
    """Apply self-attention and the feed-forward projection.

    Args:
      inputs: Embedded sequence; assumed (batch, seq, embed_dim) so the
        residual additions line up — TODO confirm against callers.
      mask: Optional (batch, seq) padding mask supplied by Keras.

    Returns:
      Tensor with the same shape as `inputs`.
    """
    # Default to no mask so the layer also works on unmasked inputs instead
    # of failing on an unbound `padding_mask`.
    padding_mask = None
    if mask is not None:
      # (batch, seq) -> (batch, 1, seq): broadcast over query positions.
      # MultiHeadAttention adds the head axis itself.
      padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")

    attention_output = self.attention(
        query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
    )
    proj_inp = self.layernorm1(inputs + attention_output)

    proj_out = self.dense_proj(proj_inp)
    return self.layernorm2(proj_inp + proj_out)

In [19]:
class PositionalEmbedding(layers.Layer):
  """Sum of a token embedding and a learned position embedding.

  Args:
    sequence_length: Number of distinct positions the position table holds.
    vocab_size: Number of rows in the token embedding table.
    embed_dim: Width of both embeddings.
    **kwargs: Forwarded to `layers.Layer`.
  """

  def __init__(self,sequence_length ,  vocab_size , embed_dim , **kwargs):
    super().__init__(**kwargs)

    self.sequence_length = sequence_length
    self.vocab_size = vocab_size
    self.embed_dim = embed_dim

    # Token table is created before the position table.
    self.token_embedding = layers.Embedding(
        input_dim=vocab_size, output_dim=embed_dim
    )
    self.positional_embedding = layers.Embedding(
        input_dim=sequence_length, output_dim=embed_dim
    )

  def call(self,inputs):
    """Embed token ids and add the embedding of each position index."""
    num_positions = tf.shape(inputs)[-1]
    position_ids = tf.range(start=0, limit=num_positions, delta=1)

    token_vectors = self.token_embedding(inputs)
    position_vectors = self.positional_embedding(position_ids)
    return token_vectors + position_vectors

  def compute_mask(self,inputs , mask=None):
    """Mark every non-zero token id as a real (non-padding) position."""
    return tf.math.not_equal(inputs,0)


In [20]:
class TransformerDecoder(layers.Layer):
  """Transformer decoder block.

  Causal self-attention over the target sequence, attention over the encoder
  outputs, and a two-layer feed-forward projection; each sub-layer is
  followed by a residual connection and layer normalization.

  Args:
    embed_dim: Output width of the feed-forward projection; also used as
      `key_dim` of both attention layers.
    latent_dim: Width of the hidden ReLU layer of the projection.
    num_heads: Number of attention heads.
    **kwargs: Forwarded to `layers.Layer`.
  """

  def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
    super().__init__(**kwargs)

    self.embed_dim = embed_dim
    self.latent_dim = latent_dim
    self.num_heads = num_heads
    self.attention_1 = keras.layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_dim
    )
    self.attention_2 = keras.layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_dim
    )
    self.dense_proj = keras.Sequential(
        [
            layers.Dense(latent_dim, activation='relu'),
            layers.Dense(embed_dim),
        ]
    )
    self.layernorm1 = layers.LayerNormalization()
    self.layernorm2 = layers.LayerNormalization()
    self.layernorm3 = layers.LayerNormalization()
    # Keras looks for `supports_masking`; the previous spelling
    # `support_masking` was just an unused attribute, so the target padding
    # mask was not propagated to this layer's output.
    self.supports_masking = True

  def call(self, inputs, encoder_outputs, mask=None):
    """Run the decoder block.

    Args:
      inputs: Embedded target sequence; assumed (batch, seq, embed_dim) —
        TODO confirm against callers.
      encoder_outputs: Encoder output attended to by the second attention
        layer.
      mask: Optional (batch, seq) padding mask of `inputs`, supplied by Keras.

    Returns:
      Tensor with the same shape as `inputs`.
    """
    casual_mask = self.get_casual_attention_mask(inputs)

    # Default to no mask so the layer also works on unmasked inputs instead
    # of failing on an unbound `padding_mask`.
    padding_mask = None
    if mask is not None:
      padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
      padding_mask = tf.minimum(padding_mask, casual_mask)

    attention_output_1 = self.attention_1(
        query=inputs, value=inputs, key=inputs, attention_mask=casual_mask
    )
    out_1 = self.layernorm1(inputs + attention_output_1)

    # NOTE(review): this mask is built from the *target* padding and causal
    # masks, yet it is applied to attention over the encoder outputs, which
    # only lines up when both sequences have the same length. Kept as in the
    # original; confirm this is intended.
    attention_output_2 = self.attention_2(
        query=out_1, key=encoder_outputs, value=encoder_outputs,
        attention_mask=padding_mask
    )
    out_2 = self.layernorm2(out_1 + attention_output_2)

    proj_out = self.dense_proj(out_2)
    return self.layernorm3(out_2 + proj_out)

  def get_casual_attention_mask(self, inputs):
    """Build a lower-triangular attention mask.

    Returns:
      int32 tensor of shape (batch, seq, seq) whose entry [b, i, j] is 1
      when j <= i and 0 otherwise, where batch and seq are the first two
      dimensions of `inputs`.
    """
    input_shape = tf.shape(inputs)
    batch_size, sequence_length = input_shape[0], input_shape[1]

    i = tf.range(sequence_length)[:, tf.newaxis]
    j = tf.range(sequence_length)
    mask = tf.cast(i >= j, dtype="int32")

    # Add a leading axis, then repeat the (seq, seq) mask once per batch item.
    mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
    mult = tf.concat(
        [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
        axis=0,
    )
    return tf.tile(mask, mult)



In [21]:
# Model hyperparameters: embedding width, feed-forward hidden width, and
# number of attention heads used by both the encoder and the decoder block.
embed_dim = 256
latent_dim = 2048
num_heads = 8

# Encoder: token ids -> positional embedding -> one encoder block.
# The input name matches the "encoder_inputs" key produced by format_dataset.
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

# Decoder: its own positional embedding, one decoder block that also receives
# the encoded source sequence, dropout, then a softmax over the vocabulary.
# The input name matches the "decoder_inputs" key produced by format_dataset.
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

# End-to-end model: feed the encoder output into the decoder sub-model.
# `decoder_outputs` is rebound here to the output of that combined graph.
decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)

In [22]:
# NOTE(review): `epoch` is not read by any other cell visible in this notebook.
epoch = 1

# Print the layer-by-layer overview of the assembled model.
transformer.summary()

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 positional_embedding (Position  (None, None, 256)   2565120     ['encoder_inputs[0][0]']         
 alEmbedding)                                                                                     
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 transformer_encoder (Transform  (None, None, 256)   3155456     ['positional_embedding[

In [26]:
# Targets are integer token ids and the model ends in a softmax, hence the
# sparse categorical cross-entropy loss.
transformer.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

transformer.fit(train_ds, epochs=50, validation_data=val_ds)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50

KeyboardInterrupt: ignored

In [27]:
# Map token ids back to Hindi vocabulary strings for decoding.
hin_vocab = hin_vectorization.get_vocabulary()
hin_index_lookup = dict(enumerate(hin_vocab))
# Upper bound on the number of tokens generated per sentence.
max_decoded_sentence_length = 20


def decode_sequence(input_sentence):
    """Greedily translate one English sentence into Hindi.

    At each step the sentence decoded so far is re-vectorized, the model is
    run on it, and the most probable token at the current position is
    appended. Decoding stops at the end marker or after
    `max_decoded_sentence_length` tokens.

    Args:
        input_sentence: English sentence as a single string.

    Returns:
        The decoded string, starting with "[start]" and, when the model
        emitted it, ending with the end marker "end".
    """
    # clean_hindi() wraps every target as 'START_ ... _END'. With the default
    # TextVectorization standardization (lower-case, strip punctuation) the
    # vocabulary holds the end marker as the plain token "end", and the
    # "[start]" seed below is tokenized as "start". The previous comparison
    # against "[end]" could never match, so decoding always ran to the
    # maximum length and appended padding tokens after "end".
    end_token = "end"

    tokenized_input_sentence = eng_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        # Drop the last position so the decoder input has the same length it
        # had during training.
        tokenized_target_sentence = hin_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])

        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = hin_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token

        if sampled_token == end_token:
            break
    return decoded_sentence


# Spot-check the model on 30 test sentences drawn at random with replacement.
test_eng_texts = [eng for eng, _ in test_pairs]
for _ in range(30):
    input_sentence = random.choice(test_eng_texts)
    translated = decode_sequence(input_sentence)
    print(input_sentence, "----->", translated)

so i used to embark on these imaginary journeys -----> [start] तो मैंने इन [UNK] [UNK] को [UNK] [UNK] end           
i was mesmerized -----> [start] मैं [UNK] था end                
“oh god im in such trouble ive got children to feed -----> [start] “ओह “अपने मैं तो कुछ तो साल के बच्चे को बच्चों को ही नहीं करता । end   
and i told you earlier -----> [start] और मैंने पहले कहा end               
but thats the media impression and thats like what you get -----> [start] लेकिन यह मीडिया बहुत ही [UNK] है और आप को यह कर सकते है end     
it was in the books it was inside the teachers head -----> [start] यह था [UNK] पर आक्रमण के समय में था end          
employing about people in africa -----> [start] अफ़्रीका में लोगों को end               
they are multinational as i say -----> [start] वे [UNK] हैं जो कहते हैं end             
agriculture grew at better than eight percent -----> [start] [UNK] से सबसे बड़ा प्रतिशत है end             
why did they create a system like that -----> [start] उन्होंने