In [1]:
import random
import re
import string
from string import digits

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras import layers


In [2]:
# Using positional encoding to re-inject order information

class PositionalEmbedding(layers.Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        sequence_length: Size of the position-embedding table, i.e. the
            number of distinct positions that can be looked up.
        input_dim: Size of the token-embedding table (vocabulary size).
        output_dim: Width of both embedding vectors.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        # One lookup table for token ids ...
        self.token_embeddings = layers.Embedding(
            input_dim=input_dim, output_dim=output_dim)
        # ... and one for position indices.
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim)
        # Constructor arguments are kept so get_config() can report them.
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim

    def call(self, inputs):
        """Embed ``inputs`` and add the embedding of each position index."""
        # Positions run over the last axis of the input: 0, 1, ..., length - 1.
        num_positions = tf.shape(inputs)[-1]
        position_ids = tf.range(num_positions)
        return (self.token_embeddings(inputs)
                + self.position_embeddings(position_ids))

    def compute_mask(self, inputs, mask=None):
        """Return a boolean mask that is False wherever the token id is 0."""
        padding_id = 0
        return tf.math.not_equal(inputs, padding_id)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        return {
            **super().get_config(),
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
        }

In [3]:
class TransformerEncoder(layers.Layer):
    """Encoder block: self-attention followed by a two-layer dense projection.

    Each of the two sub-blocks is wrapped in a residual connection and
    followed by layer normalization.

    Args:
        embed_dim: Width of the embeddings; also used as the attention
            ``key_dim`` and as the output width of the dense projection.
        dense_dim: Number of units in the inner (ReLU) dense layer.
        num_heads: Number of attention heads.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        # Hyper-parameters are stored so get_config() can report them.
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        # Feed-forward part: expand to dense_dim, then project back to embed_dim.
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"),
             layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        """Apply self-attention and the dense projection to ``inputs``.

        When ``mask`` is given, an axis is inserted at position 1 before it
        is handed to the attention layer as ``attention_mask``.
        """
        attention_mask = None if mask is None else mask[:, tf.newaxis, :]
        attended = self.attention(
            inputs, inputs, attention_mask=attention_mask)

        # Residual connection + normalization around the attention output.
        normalized = self.layernorm_1(inputs + attended)
        # Residual connection + normalization around the dense projection.
        return self.layernorm_2(normalized + self.dense_proj(normalized))

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        return {
            **super().get_config(),
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        }

In [4]:
class TransformerDecoder(layers.Layer):
    """Decoder block: causal self-attention, cross-attention, dense projection.

    Every sub-block is wrapped in a residual connection and followed by
    layer normalization.

    Args:
        embed_dim: Width of the embeddings; also used as the attention
            ``key_dim`` and as the output width of the dense projection.
        dense_dim: Number of units in the inner (ReLU) dense layer.
        num_heads: Number of attention heads in both attention layers.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        # Hyper-parameters are stored so get_config() can report them.
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        # attention_1 attends over the decoder's own inputs (self-attention);
        # attention_2 attends over the encoder outputs (cross-attention).
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        # Feed-forward part: expand to dense_dim, then project back to embed_dim.
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"),
             layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask_=None):
        """Run the decoder block.

        Args:
            inputs: Decoder-side sequence representation.
            encoder_outputs: Output of the encoder, used as key and value of
                the cross-attention.
            mask_: Accepted for call-site compatibility but never read, so
                no padding mask is applied inside this layer.
        """
        causal_mask = self.get_causal_attention_mask(inputs)

        # Masked self-attention: query, key and value are all the decoder inputs.
        self_attended = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=causal_mask)
        self_attended = self.layernorm_1(inputs + self_attended)

        # Cross-attention: decoder states query the encoder outputs (no mask).
        cross_attended = self.attention_2(
            query=self_attended,
            value=encoder_outputs,
            key=encoder_outputs
        )
        cross_attended = self.layernorm_2(self_attended + cross_attended)

        return self.layernorm_3(
            cross_attended + self.dense_proj(cross_attended))

    def get_causal_attention_mask(self, inputs):
        """Build an int32 lower-triangular mask, repeated once per batch item.

        Entry ``[b, i, j]`` is 1 when ``i >= j`` and 0 otherwise; the result
        has shape ``(shape(inputs)[0], shape(inputs)[1], shape(inputs)[1])``.
        """
        dims = tf.shape(inputs)
        batch_size, sequence_length = dims[0], dims[1]
        row_ids = tf.range(sequence_length)[:, tf.newaxis]
        col_ids = tf.range(sequence_length)
        lower_triangle = tf.cast(row_ids >= col_ids, dtype="int32")
        # Add a leading axis of size 1 so the mask can be tiled over the batch.
        lower_triangle = tf.reshape(
            lower_triangle, (1, sequence_length, sequence_length))
        # Tile multiples are [batch_size, 1, 1].
        repeats = tf.concat(
            [tf.expand_dims(batch_size, -1),
             tf.constant([1, 1], dtype=tf.int32)], axis=0)
        return tf.tile(lower_triangle, repeats)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        return {
            **super().get_config(),
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        }

In [5]:
# Hyper-parameters shared by the model and the data pipeline.
embed_dim = 512          # width of token/position embeddings
dense_dim = 2048         # units in the inner dense layer of each block
num_heads = 8            # attention heads per MultiHeadAttention layer
vocab_size = 20000       # embedding table size and width of the output softmax
sequence_length = 30     # size of the position-embedding table
batch_size = 64

# Encoder side: English token ids -> embeddings -> one encoder block.
# The input names ("english" / "hindi") are the keys of the input dict
# produced by format_dataset.
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="english")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)

# Decoder side: Hindi token ids -> embeddings -> one decoder block that also
# receives the encoder outputs. `mask_` is passed explicitly as None.
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="hindi")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, dense_dim, num_heads)(x, encoder_outputs,mask_=None)

# Dropout, then a softmax over the vocabulary at every decoder position.
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
transformer = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
transformer.summary()

2023-12-04 10:21:51.333379: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 english (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 hindi (InputLayer)          [(None, None)]               0         []                            
                                                                                                  
 positional_embedding (Posi  (None, None, 512)            1025536   ['english[0][0]']             
 tionalEmbedding)                                         0                                       
                                                                                                  
 positional_embedding_1 (Po  (None, None, 512)            1025536   ['hindi[0][0]']           

## Preparing the data

In [36]:
# Load the parallel corpus and drop every row that has a missing value.
df = pd.read_csv("Hindi_English_Truncated_Corpus.csv", encoding='utf-8').dropna()

In [37]:
def clean_dataset_rows(df):
    """Return a cleaned copy of ``df`` with per-sentence word counts added.

    For both ``english_sentence`` and ``hindi_sentence`` the text is
    lower-cased, ASCII punctuation and ASCII digits are removed, runs of
    whitespace are collapsed to a single space and leading/trailing
    whitespace is stripped. Devanagari digits are additionally removed from
    ``hindi_sentence``.

    Args:
        df: DataFrame with string columns ``english_sentence`` and
            ``hindi_sentence``. It is not modified.

    Returns:
        A new DataFrame with the two sentence columns cleaned and two added
        integer columns, ``length_eng_sentence`` and ``length_hin_sentence``,
        holding the number of whitespace-separated words (0 for a sentence
        that is empty after cleaning).
    """
    # Work on a copy so the caller's frame is left untouched and the cell
    # that calls this function can be re-run safely.
    cleaned = df.copy()

    # A single translation table deletes ASCII punctuation and ASCII digits.
    ascii_noise = str.maketrans('', '', string.punctuation + digits)
    devanagari_digits = "[२३०८१५७९४६]"

    english = cleaned['english_sentence'].str.lower().str.translate(ascii_noise)
    hindi = (
        cleaned['hindi_sentence']
        .str.lower()
        .str.translate(ascii_noise)
        .str.replace(devanagari_digits, "", regex=True)
    )

    # Remove extra spaces: deleting characters above can leave runs of
    # spaces inside a sentence, so collapse them before stripping the ends.
    for column, text in (('english_sentence', english), ('hindi_sentence', hindi)):
        cleaned[column] = text.str.replace(r"\s+", " ", regex=True).str.strip()

    # split() with no argument ignores repeated whitespace and yields an
    # empty list for an empty string, so the count is a true word count.
    cleaned['length_eng_sentence'] = cleaned['english_sentence'].str.split().str.len()
    cleaned['length_hin_sentence'] = cleaned['hindi_sentence'].str.split().str.len()
    return cleaned

In [38]:
# Replace the raw frame with its cleaned version (adds the two length columns).
df = clean_dataset_rows(df)

In [39]:
# (rows, columns) of the cleaned frame.
df.shape

(127605, 5)

In [40]:
# Peek at the first two cleaned rows.
df.head(2)

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
0,ted,politicians do not have permission to do what ...,राजनीतिज्ञों के पास जो कार्य करना चाहिए वह करन...,12,13
1,ted,id like to tell you about one such child,मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी,9,11


In [41]:
# Longest Hindi sentence, in words as counted by clean_dataset_rows.
df['length_hin_sentence'].max()

418

In [42]:
# Longest English sentence, in words as counted by clean_dataset_rows.
df['length_eng_sentence'].max()

398

In [46]:
# Longest English sentence among the rows whose source is 'indic2012'.
# NOTE(review): `x` rebinds the name the model-building cell uses for its
# intermediate Keras tensors; a more specific name would avoid the clash.
x = df[df['source']=='indic2012']
x['length_eng_sentence'].max()

398

In [47]:
# Longest Hindi sentence in the same 'indic2012' subset.
x['length_hin_sentence'].max()

418

In [16]:
# Pre-processing: build (english, hindi) pairs, wrapping every Hindi sentence
# in "[start]" / "[end]" markers.
# Zipping the two columns avoids the per-row Series that iterrows() creates.
# NOTE(review): the recorded output below reports 39881 pairs while the
# `df.shape` cell reports 127605 rows, so a row filter seems to have been
# applied in a cell that is no longer in the notebook -- confirm and restore it.
text_pairs = [
    (english, "[start] " + hindi + " [end]")
    for english, hindi in zip(df['english_sentence'], df['hindi_sentence'])
]

# `random` is imported in the notebook's import cell.
print(random.choice(text_pairs))
print(f"no. of pairs: {len(text_pairs)}")

('all powered on that coal', '[start] साब कोयले की ताकत पे खड़े हुए [end]')
no. of pairs: 39881


In [17]:
# Split the pairs into 70% train / 15% validation / 15% test.
# A dedicated, seeded generator makes the shuffle -- and therefore the split --
# identical on every Restart & Run All, independent of earlier random calls.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
# Validation and test each take num_val_samples; training takes the rest.
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples:]

In [18]:
# Show the ASCII punctuation characters that string.punctuation contains.
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [19]:
# Vectorizing the English and Hindi text pairs

# Characters to delete from the Hindi text: all ASCII punctuation except the
# square brackets, which are kept so "[start]" and "[end]" stay intact.
strip_chars = string.punctuation.replace("[", "").replace("]", "")


def custom_standardization(input_string):
    """Lower-case ``input_string`` and delete every character in ``strip_chars``.

    Used as the ``standardize`` callable of the Hindi (target) vectorizer.
    """
    # Character class built from the escaped punctuation characters.
    pattern = f"[{re.escape(strip_chars)}]"
    lowered = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowered, pattern, "")

# English (source) vectorizer: no `standardize` argument is given, so the
# layer's default standardization applies.
source_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)
# Hindi (target) vectorizer: one token longer than the source because
# format_dataset slices it into an input ([:, :-1]) and a target ([:, 1:]),
# each of sequence_length tokens.
target_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
)
# Both vocabularies are learned from the training pairs only.
train_english_texts = [pair[0] for pair in train_pairs]
train_hindi_texts = [pair[1] for pair in train_pairs]
source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_hindi_texts)


In [21]:
def format_dataset(eng, hindi):
    """Vectorize a batch of sentence pairs into ``(inputs, targets)``.

    Args:
        eng: English sentences.
        hindi: Hindi sentences wrapped in "[start]" / "[end]".

    Returns:
        A tuple ``(inputs, targets)``. ``inputs`` is a dict whose keys
        "english" and "hindi" match the model's input names; ``targets`` is
        the Hindi token sequence shifted one step ahead of ``inputs["hindi"]``.
    """
    source_tokens = source_vectorization(eng)
    target_tokens = target_vectorization(hindi)
    # make_dataset batches before mapping, so axis 0 is the batch axis and
    # the slices below act on the token axis.
    model_inputs = {
        "english": source_tokens,           # encoder input
        "hindi": target_tokens[:, :-1],     # decoder input: last token dropped
    }
    targets = target_tokens[:, 1:]          # decoder output: first token dropped
    return model_inputs, targets


def make_dataset(pairs):
    """Build a batched, vectorized ``tf.data.Dataset`` from sentence pairs.

    Args:
        pairs: Non-empty sequence of ``(english, hindi)`` string tuples.

    Returns:
        A dataset of ``(inputs, targets)`` batches as produced by
        ``format_dataset``, cached in memory, shuffled and prefetched.
    """
    eng_texts, hindi_texts = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices(
        (list(eng_texts), list(hindi_texts)))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)
    # Cache first so the vectorized batches are computed once, then shuffle.
    # cache() replays exactly the elements it stored, so shuffling before it
    # would freeze the first epoch's order for every later epoch. Prefetch
    # goes last so it overlaps input preparation with training.
    return dataset.cache().shuffle(2048).prefetch(16)


# Input pipelines for the training and validation splits.
train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [22]:
# Sanity check on a single training batch: print the shapes of both model
# inputs and of the targets, plus one example (index 3) of target and
# English token ids.
for inputs, targets in train_ds.take(1):
    print(f"inputs['english'].shape: {inputs['english'].shape}")
    print(f"inputs['hindi'].shape: {inputs['hindi'].shape}")
    print(f"targets.shape: {targets.shape}")
    print(targets[3])
    print(inputs['english'][3])

inputs['english'].shape: (64, 30)
inputs['hindi'].shape: (64, 30)
targets.shape: (64, 30)
tf.Tensor(
[    5    21   144  3615 14815     3     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0], shape=(30,), dtype=int64)
tf.Tensor(
[  95    9   91    6 4171    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0], shape=(30,), dtype=int64)


## Training and evaluating the model

In [None]:
# The targets are integer token ids and the model ends in a softmax over the
# vocabulary, hence the sparse categorical cross-entropy loss.
transformer.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"])

# Train for 10 epochs, evaluating on the validation split after each one.
transformer.fit(train_ds, epochs=10, validation_data=val_ds)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7feea4aa2290>

### Inference

In [25]:
# Map each token id back to its Hindi vocabulary entry.
hindi_vocab = target_vectorization.get_vocabulary()
hindi_index_lookup = dict(enumerate(hindi_vocab))
# Upper bound on the number of tokens generated per sentence.
max_decoded_sentence_length = 30

def decode_sequence(input_sentence):
    """Translate ``input_sentence`` greedily, one token at a time.

    Decoding starts from "[start]"; at every step the sentence generated so
    far is fed back to the model and the highest-scoring token at the current
    position is appended. It stops once "[end]" is produced or after
    ``max_decoded_sentence_length`` steps.

    Returns:
        The decoded sentence, including the leading "[start]" and, when it
        was generated, the trailing "[end]".
    """
    source_tokens = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for step in range(max_decoded_sentence_length):
        # Drop the last column so the decoder input has the same length as
        # the one used during training.
        target_tokens = target_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([source_tokens, target_tokens])
        # Greedy choice: most probable token at the position being filled.
        next_token_index = np.argmax(predictions[0, step, :])
        next_token = hindi_index_lookup[next_token_index]
        decoded_sentence = decoded_sentence + " " + next_token
        if next_token == "[end]":
            break
    return decoded_sentence

# Translate four randomly chosen English sentences from the test split and
# print each one followed by the model's output.
test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(4):
    input_sentence = random.choice(test_eng_texts)
    print("-")
    print(input_sentence)
    print(decode_sequence(input_sentence))

-
only after meeting your physiological needs and other needs
[start] ही जो हमारे दूसरे होने के बाद और अपनी जरूरत थी [end]
-
laughter
[start] हँसी [end]
-
but this shows you that the government of uganda
[start] लेकिन यह पता चलता है कि सरकार के और सरकार के [end]
-
of these two enormous disciplines that humanity has
[start] दो बहुत सारे आम तौर पर इन दोनों को [end]
