In [None]:
import pandas as pd
import numpy as np
import random

In [None]:
# Spreadsheet holding the parallel corpus; the cells below read its
# 'english' and 'telugu' columns.
corpus_path = '/content/drive/MyDrive/Datasets/AI Project/language_data.xlsx'
df_full = pd.read_excel(corpus_path)
# Show the first rows as a quick sanity check of what was loaded.
df_full.head()

Unnamed: 0,english,telugu
0,Hello,హలో
1,Are you good?,మీరు బాగున్నారా?
2,I am happy,నేను సంతోషంగా ఉన్నాను
3,how are you?,మీరు ఎలా ఉన్నారు?
4,I am good,నేను భాగున్నాను


In [None]:
# Whitespace-token counts per row for each language. These helper columns
# stay on df_full; they are removed only from the filtered frame `df`.
df_full['word_count_en'] = df_full['english'].str.split().str.len()
df_full['word_count_te'] = df_full['telugu'].str.split().str.len()

# Keep a row only when BOTH sides have at most 20 tokens. Rows whose count is
# missing (NaN) compare as False and are therefore dropped as well.
count_columns = ['word_count_en', 'word_count_te']
within_limit = df_full[count_columns].le(20).all(axis=1)
df = df_full.loc[within_limit].drop(columns=count_columns)

In [None]:
# Preview the length-filtered frame (the word-count helper columns were dropped from it).
df.head()

Unnamed: 0,english,telugu
0,Hello,హలో
1,Are you good?,మీరు బాగున్నారా?
2,I am happy,నేను సంతోషంగా ఉన్నాను
3,how are you?,మీరు ఎలా ఉన్నారు?
4,I am good,నేను భాగున్నాను


In [None]:
# Find the Telugu sentence with the most whitespace-separated tokens.
# The comparison is strict, so on a tie the earliest sentence is kept.
max_len, word = 0, ""
for sentence in df['telugu'].values:
    token_count = len(sentence.split())
    if token_count > max_len:
        max_len, word = token_count, sentence

print(max_len)
print(word)

20
బాక్టీరియల్ జీవితం కొన్నిసార్లు వృక్షజాలం, [71] [72] లో చేర్చబడుతుంది మరియు కొన్ని వర్గీకరణలు మొక్కల వృక్షజాలం నుండి విడిగా బ్యాక్టీరియా వృక్షజాలం అనే పదాన్ని ఉపయోగిస్తాయి.


In [None]:
# Build (english, telugu) tuples in row order. Each Telugu sentence is wrapped
# in the literal "[start]" / "[end]" markers used as sequence delimiters by
# the decoding code further down.
text_pairs = [
    (english_sentence, "[start] " + telugu_sentence + " [end]")
    for english_sentence, telugu_sentence in zip(df['english'], df['telugu'])
]

In [None]:
# Number of sentence pairs available after the length filter.
len(text_pairs)

9331

In [None]:
# Show one arbitrary pair to eyeball the "[start] ... [end]" wrapping.
print(random.choice(text_pairs))

# Shuffle in place with a dedicated, seeded generator so that a fresh
# top-to-bottom run always yields the same train/val/test membership. The
# module-level `random` state is left untouched.
# Note: re-running only this cell shuffles the already-shuffled list again.
random.Random(42).shuffle(text_pairs)

# 70% train, 15% validation, and whatever remains (about 15%) for testing.
num_train_samples = int(0.7 * len(text_pairs))
num_val_samples = int(0.15 * len(text_pairs))
print(num_train_samples, num_val_samples)
val_end = num_train_samples + num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:val_end]
test_pairs = text_pairs[val_end:]

# The three slices must partition the data exactly (no overlap, nothing lost).
assert len(train_pairs) + len(val_pairs) + len(test_pairs) == len(text_pairs)

('Joe made the sugar cookies; Susan decorated them.', '[start] జో చక్కెర కుకీలను తయారు చేశాడు; సుసాన్ వారిని అలంకరించాడు. [end]')
6531 1399


In [None]:
import tensorflow as tf
import string
import re
from tensorflow.keras import layers

In [None]:
# ASCII punctuation to delete during standardization. The square brackets are
# kept out of the set so the "[start]" / "[end]" markers are not mangled.
strip_chars = "".join(ch for ch in string.punctuation if ch not in "[]")
strip_chars

'!"#$%&\'()*+,-./:;<=>?@\\^_`{|}~'

In [None]:

def custom_standardization(input_string):
    """Lowercase the text and delete every character listed in `strip_chars`.

    Used as the `standardize` callable of both TextVectorization layers.
    Square brackets are not part of `strip_chars`, so "[start]" and "[end]"
    pass through unchanged.
    """
    # Character class built from the (regex-escaped) punctuation set.
    punctuation_pattern = f"[{re.escape(strip_chars)}]"
    lowered = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowered, punctuation_pattern, "")

vocab_size = 15000
sequence_length = 20

# Options common to both vectorizers; only the padded output length differs.
shared_vectorizer_kwargs = dict(
    max_tokens=vocab_size,
    output_mode="int",
    standardize=custom_standardization,
)
source_vectorization = layers.TextVectorization(
    output_sequence_length=sequence_length, **shared_vectorizer_kwargs)
# The target side gets one extra position: format_dataset later slices it into
# a decoder input (all but the last token) and a label (all but the first).
target_vectorization = layers.TextVectorization(
    output_sequence_length=sequence_length + 1, **shared_vectorizer_kwargs)

# Vocabularies are learned from the training split only.
train_english_texts = [english for english, _ in train_pairs]
train_telugu_texts = [telugu for _, telugu in train_pairs]
source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_telugu_texts)

In [None]:
# Vocabulary sizes actually learned: source first, then target.
for vectorizer in (source_vectorization, target_vectorization):
    print(len(vectorizer.get_vocabulary()))

7412
10802


In [None]:
# First entries of each vocabulary (15 source tokens, 10 target tokens).
source_vocab_head = source_vectorization.get_vocabulary()[:15]
target_vocab_head = target_vectorization.get_vocabulary()[:10]
print(source_vocab_head)
print(target_vocab_head)

['', '[UNK]', 'the', 'a', 'and', 'to', 'of', 'i', 'in', 'he', 'my', 'with', 'was', 'she', 'on']
['', '[UNK]', '[start]', '[end]', 'మరియు', 'నేను', 'అతను', 'ఆమె', 'నా', 'ఒక']


In [None]:
# Pick one training sentence at random, print it, and display its integer
# encoding as produced by the source vectorizer.
samp = random.choice(train_english_texts)
print(samp)
source_vectorization(samp)

He is good at eating pickles and telling women about his emotional problems.


<tf.Tensor: shape=(20,), dtype=int64, numpy=
array([   9,   15,   93,   22,  123, 1288,    4,  350, 1095,   30,   19,
        428,  514,    0,    0,    0,    0,    0,    0,    0])>

In [None]:
# Preparing dataset
# Number of sentence pairs per batch; read by make_dataset when batching.
batch_size = 64

def format_dataset(eng, tel):
    """Turn a batch of raw sentence pairs into model inputs and labels.

    Returns a `(features, labels)` tuple where `features` is a dict with the
    keys "english" (vectorized source) and "telugu" (vectorized target minus
    its last position), and `labels` is the vectorized target minus its first
    position, i.e. the decoder input shifted one step to the left.
    """
    source_ids = source_vectorization(eng)
    target_ids = target_vectorization(tel)
    decoder_input = target_ids[:, :-1]
    next_tokens = target_ids[:, 1:]
    features = {"english": source_ids, "telugu": decoder_input}
    return features, next_tokens

def make_dataset(pairs):
    """Build a batched, vectorized `tf.data.Dataset` from sentence pairs.

    Args:
        pairs: Non-empty sequence of `(english, telugu)` string tuples.

    Returns:
        A dataset yielding `(features, labels)` batches as produced by
        `format_dataset`.

    Raises:
        ValueError: If `pairs` is empty (the unzip below cannot be unpacked).
    """
    eng_texts, tel_texts = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices(
        (list(eng_texts), list(tel_texts)))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)
    # Cache BEFORE shuffling: caching after the shuffle would store the first
    # epoch's shuffled order and replay it unchanged on every later epoch.
    # With this order the vectorized batches are computed once and the batch
    # order is reshuffled each time the dataset is iterated.
    return dataset.cache().shuffle(2048).prefetch(16)


In [None]:
# Same pipeline for both splits; the test split stays as raw text pairs.
train_ds, val_ds = map(make_dataset, (train_pairs, val_pairs))

In [None]:
# Pull a single batch and report the shape of each tensor the model will see.
# `inputs` and `targets` stay bound afterwards and are printed in the next cell.
for inputs, targets in train_ds.take(1):
    english_batch, telugu_batch = inputs["english"], inputs["telugu"]
    print(f"english shape: {english_batch.shape}")
    print(f"telugu shape: {telugu_batch.shape}")
    print(f"target shape: {targets.shape}")

english shape: (64, 20)
telugu shape: (64, 20)
target shape: (64, 20)


In [None]:
# First example of the batch: source ids, decoder input ids, then the labels
# (the decoder input shifted left by one position).
for first_row in (inputs['english'][0], inputs['telugu'][0], targets[0]):
    print(first_row)

tf.Tensor(
[  7 260  10 234   8   2  87   4 114   2 393  14  10 220   0   0   0   0
   0   0], shape=(20,), dtype=int64)
tf.Tensor(
[   2    5    8  343  177 1048    4    8  321   47  102 5878    3    0
    0    0    0    0    0    0], shape=(20,), dtype=int64)
tf.Tensor(
[   5    8  343  177 1048    4    8  321   47  102 5878    3    0    0
    0    0    0    0    0    0], shape=(20,), dtype=int64)


In [None]:
class TransformerEncoder(layers.Layer):
    """One Transformer encoder block.

    Self-attention followed by a two-layer feed-forward network; each
    sub-layer is wrapped in a residual connection and layer normalization.

    Args:
        embed_dim: Width of the token representations. Also used as the
            per-head `key_dim` of the attention layer and as the output width
            of the feed-forward network.
        dense_dim: Hidden width of the feed-forward network.
        num_heads: Number of attention heads.
        **kwargs: Forwarded to `layers.Layer`.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.dense_layers = tf.keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"),
             layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        """Apply the block to `inputs`.

        `mask`, when given, is the padding mask; a middle axis is inserted so
        it can be broadcast over the query positions by the attention layer.
        """
        if mask is not None:
            mask = mask[:, tf.newaxis, :]
        attention_output = self.attention(
            inputs, inputs, attention_mask=mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_layers(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized
        (needed for saving or cloning a model that contains it)."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "dense_dim": self.dense_dim,
            "num_heads": self.num_heads,
        })
        return config

In [None]:
class TransformerDecoder(layers.Layer):
    """One Transformer decoder block.

    Causal self-attention over the target sequence, cross-attention over the
    encoder output, then a two-layer feed-forward network; each sub-layer is
    wrapped in a residual connection and layer normalization.

    Args:
        embed_dim: Width of the token representations. Also used as the
            per-head `key_dim` of both attention layers and as the output
            width of the feed-forward network.
        dense_dim: Hidden width of the feed-forward network.
        num_heads: Number of attention heads.
        **kwargs: Forwarded to `layers.Layer`.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.dense_layers = tf.keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"),
             layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def masked_attention(self, inputs):
        """Build the causal mask for `inputs`.

        Returns an int32 tensor of shape (batch, seq, seq) whose entry
        [b, i, j] is 1 when j <= i (position i may attend to position j) and
        0 otherwise.
        """
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        # Repeat the single (1, seq, seq) mask once per batch element.
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1),
             tf.constant([1, 1], dtype=tf.int32)], axis=0)
        return tf.tile(mask, mult)

    def call(self, inputs, encoder_outputs, mask=None):
        """Apply the block to the target representations `inputs`, attending
        over `encoder_outputs`.

        NOTE(review): `mask` is accepted but not used; only the causal mask
        is applied, and the cross-attention runs without any padding mask.
        """
        causal_mask = self.masked_attention(inputs)
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=causal_mask)
        attention_output_1 = self.layernorm_1(inputs + attention_output_1)
        attention_output_2 = self.attention_2(
            query=attention_output_1,
            value=encoder_outputs,
            key=encoder_outputs
        )
        attention_output_2 = self.layernorm_2(
            attention_output_1 + attention_output_2)
        proj_output = self.dense_layers(attention_output_2)
        return self.layernorm_3(attention_output_2 + proj_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized
        (needed for saving or cloning a model that contains it)."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "dense_dim": self.dense_dim,
            "num_heads": self.num_heads,
        })
        return config

In [None]:
class PositionalEmbedding(layers.Layer):
    """Token embedding plus a learned embedding of each token's position.

    Args:
        sequence_length: Number of distinct positions that can be embedded;
            inputs must not be longer than this.
        input_dim: Size of the token vocabulary.
        output_dim: Width of both embeddings (they are summed).
        **kwargs: Forwarded to `layers.Layer`.
    """

    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=input_dim, output_dim=output_dim)
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim)
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim

    def call(self, inputs):
        """Embed integer token ids and add the embedding of their positions."""
        # Positions 0..length-1 along the last axis of `inputs`; the position
        # embeddings broadcast across the batch when added.
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        """Mask that is True wherever the token id is non-zero, i.e. token
        id 0 is treated as padding."""
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized
        (needed for saving or cloning a model that contains it)."""
        config = super().get_config()
        config.update({
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
            "output_dim": self.output_dim,
        })
        return config

In [None]:
embed_dim = 512
dense_dim = 2048
num_heads = 6

# Encoder branch: English token ids -> embeddings -> one encoder block.
encoder_inputs = tf.keras.Input(shape=(None,), dtype="int64", name="english")
encoder_embedded = PositionalEmbedding(
    sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(
    embed_dim, dense_dim, num_heads)(encoder_embedded)

# Decoder branch: Telugu token ids -> embeddings -> one decoder block that
# also attends over the encoder output -> dropout -> per-position softmax
# over `vocab_size` classes.
decoder_inputs = tf.keras.Input(shape=(None,), dtype="int64", name="telugu")
decoder_embedded = PositionalEmbedding(
    sequence_length, vocab_size, embed_dim)(decoder_inputs)
decoder_hidden = TransformerDecoder(
    embed_dim, dense_dim, num_heads)(decoder_embedded, encoder_outputs)
decoder_regularized = layers.Dropout(0.5)(decoder_hidden)
decoder_outputs = layers.Dense(
    vocab_size, activation="softmax")(decoder_regularized)

transformer = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
# Print the layer-by-layer overview (output shapes, parameter counts, wiring).
transformer.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 english (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 telugu (InputLayer)            [(None, None)]       0           []                               
                                                                                                  
 positional_embedding (Position  (None, None, 256)   3845120     ['english[0][0]']                
 alEmbedding)                                                                                     
                                                                                                  
 positional_embedding_1 (Positi  (None, None, 256)   3845120     ['telugu[0][0]']             

In [None]:
# The labels are integer token ids and the model emits a softmax
# distribution, hence the sparse categorical cross-entropy loss.
compile_options = dict(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
transformer.compile(**compile_options)
transformer.fit(train_ds, validation_data=val_ds, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f8e100dcd00>

In [None]:
# Index -> token table for turning predicted class ids back into Telugu words.
tel_vocab = target_vectorization.get_vocabulary()
tel_index_lookup = dict(enumerate(tel_vocab))
# Upper bound on the number of decoding steps in decode_sequence.
max_decoded_sentence_length = sequence_length

def decode_sequence(input_sentence, transformer_model):
    """Greedily translate one English sentence into Telugu.

    Starting from "[start]", the model is re-run on the sentence decoded so
    far and the most probable token at the current position is appended.
    Decoding stops when "[end]" is produced or after
    `max_decoded_sentence_length` steps.

    Args:
        input_sentence: Raw English sentence (a single string).
        transformer_model: Model called with
            `[source_token_ids, target_token_ids]` that returns per-position
            probabilities over the output classes.

    Returns:
        The decoded string, beginning with "[start]" and ending with "[end]"
        unless the step limit was reached first.
    """
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        # Drop the last position so the decoder input length matches training.
        tokenized_target_sentence = target_vectorization(
            [decoded_sentence])[:, :-1]
        predictions = transformer_model(
            [tokenized_input_sentence, tokenized_target_sentence])
        sampled_token_index = int(np.argmax(predictions[0, i, :]))
        # The output layer may have more classes than the learned vocabulary
        # has entries (`max_tokens` is only an upper bound), so an index can
        # fall outside the lookup table; fall back to the unknown-token marker
        # instead of raising KeyError.
        sampled_token = tel_index_lookup.get(sampled_token_index, "[UNK]")
        decoded_sentence += " " + sampled_token
        if sampled_token == "[end]":
            break
    return decoded_sentence

# Split the held-out pairs into parallel lists (same index = same pair).
test_eng_texts = [english for english, _ in test_pairs]
test_tel_texts = [telugu for _, telugu in test_pairs]


In [None]:
# Translate four randomly chosen test sentences and keep the tokenized
# reference / prediction pairs for scoring.
actual = []
predictions = []
marker_tokens = ("[start]", "[end]")
for sample_number in range(4):
    # randrange excludes its upper bound. randint(0, len(...)) is inclusive
    # and could return len(...), which is one past the last valid index.
    r = random.randrange(len(test_eng_texts))
    input_sentence = test_eng_texts[r]
    actual_sentence = test_tel_texts[r]
    # References are always wrapped in "[start] ... [end]", so slicing is safe.
    actual.append(actual_sentence.split()[1:-1])
    print("-")
    print("Input:", input_sentence)
    predicted_sentence = decode_sequence(input_sentence, transformer)
    # A prediction may hit the step limit without emitting "[end]"; filter the
    # markers by value so the last real token is not cut off in that case.
    predictions.append(
        [token for token in predicted_sentence.split()
         if token not in marker_tokens])
    print("Prediction:", predicted_sentence)
    print("Actual:", actual_sentence)

-
Input: They went on a hike in the mountains.
Prediction: [start] వారు ప్రకృతిని ఆస్వాదించడానికి పర్వతాలలో పాదయాత్రకు వెళుతున్నాను [end]
Actual: [start] వారు పర్వతాలలో పాదయాత్రకు వెళ్ళారు. [end]
-
Input: She did her best to help him.
Prediction: [start] అతనికి సహాయం చేయడానికి ఆమె తన వంతు కృషి చేసింది [end]
Actual: [start] అతనికి సహాయం చేయడానికి ఆమె తన వంతు కృషి చేసింది. [end]
-
Input: The mysterious diary records the voice.
Prediction: [start] మర్మమైన డైరీ వాయిస్ రికార్డ్ చేస్తుంది [end]
Actual: [start] మర్మమైన డైరీ వాయిస్ రికార్డ్ చేస్తుంది. [end]
-
Input: He felt a sense of accomplishment after completing a challenging project.
Prediction: [start] అతను సవాలు చేసే వ్యాయామం పూర్తి చేయడంతో అతను సాఫల్య భావాన్ని అనుభవించాడు [end]
Actual: [start] అతను సవాలు చేసే ప్రాజెక్టును పూర్తి చేసిన తర్వాత సాధించిన భావాన్ని అనుభవించాడు. [end]


In [None]:
# Sanity check of what is passed to sentence_bleu: the first value is the
# number of references (actual[0] wrapped in a list, so always 1), the second
# is the number of tokens in the first prediction.
print(len([actual[0]]))
print(len(predictions[0]))

1
9


In [None]:
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu, sentence_bleu

# Sentence-level BLEU for the first sampled pair. Short sentences frequently
# have no matching 3-grams or 4-grams; without smoothing the score then
# collapses to roughly zero and NLTK emits a warning. Smoothing method 1
# replaces zero n-gram match counts with a small epsilon so the lower-order
# overlaps still contribute to the score.
score = sentence_bleu(
    [actual[0]],
    predictions[0],
    smoothing_function=SmoothingFunction().method1,
)
print(score)

8.38826642100846e-155


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [None]:
# Translate one hand-written sentence with the trained model and print the
# decoded string (including its "[start]" / "[end]" markers).
text_input = "Hello, how are you today?"
predicted_output = decode_sequence(text_input, transformer)
print(predicted_output)

[start] హలో మీరు ఈ రోజు ఎలా [end]
