In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, MultiHeadAttention, LayerNormalization, Dense, Dropout
from tensorflow.keras.models import Model
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load dataset
df = pd.read_csv('/kaggle/input/pali-to-english/output_file.csv')

# Separate the input and target text
input_texts = df['input_text'].values
target_texts = df['target_text'].values

# Tokenization and padding (you can adjust max_length based on your data)
tokenizer_pali = tf.keras.preprocessing.text.Tokenizer()
tokenizer_english = tf.keras.preprocessing.text.Tokenizer()

tokenizer_pali.fit_on_texts(input_texts)
tokenizer_english.fit_on_texts(target_texts)

input_sequences = tokenizer_pali.texts_to_sequences(input_texts)
target_sequences = tokenizer_english.texts_to_sequences(target_texts)

max_len_input = max([len(seq) for seq in input_sequences])
max_len_target = max([len(seq) for seq in target_sequences])

input_sequences = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, maxlen=max_len_input, padding='post')
target_sequences = tf.keras.preprocessing.sequence.pad_sequences(target_sequences, maxlen=max_len_target, padding='post')

# Vocabulary sizes
vocab_size_pali = len(tokenizer_pali.word_index) + 1
vocab_size_english = len(tokenizer_english.word_index) + 1

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(input_sequences, target_sequences[:, 1:], test_size=0.2, random_state=42)
target_sequences_input = target_sequences[:, :-1]


In [3]:
class Transformer(Model):
    def __init__(self, vocab_size_input, vocab_size_target, embed_dim, num_heads, ff_dim, num_layers, dropout_rate=0.1):
        super(Transformer, self).__init__()
        self.embedding_input = Embedding(vocab_size_input, embed_dim)
        self.embedding_target = Embedding(vocab_size_target, embed_dim)

        self.encoder_layers = [
            [MultiHeadAttention(num_heads, embed_dim // num_heads),
             LayerNormalization(epsilon=1e-6),
             Dense(ff_dim, activation='relu'),
             Dense(embed_dim),
             Dropout(dropout_rate)]
            for _ in range(num_layers)
        ]

        self.decoder_layers = [
            [MultiHeadAttention(num_heads, embed_dim // num_heads),
             MultiHeadAttention(num_heads, embed_dim // num_heads),
             LayerNormalization(epsilon=1e-6),
             Dense(ff_dim, activation='relu'),
             Dense(embed_dim),
             Dropout(dropout_rate)]
            for _ in range(num_layers)
        ]

        self.final_layer = Dense(vocab_size_target)

    def call(self, inputs, training=False):
        input_seq, target_seq = inputs
        input_embedding = self.embedding_input(input_seq)
        target_embedding = self.embedding_target(target_seq)

        x = input_embedding
        for mha, ln, ff1, ff2, drop in self.encoder_layers:
            attn_output = mha(x, x)
            x = ln(x + attn_output)
            ffn_output = ff2(ff1(x))
            x = drop(x + ffn_output, training=training)

        y = target_embedding
        for mha1, mha2, ln, ff1, ff2, drop in self.decoder_layers:
            attn_output1 = mha1(y, y)
            attn_output2 = mha2(attn_output1, x)
            y = ln(y + attn_output2)
            ffn_output = ff2(ff1(y))
            y = drop(y + ffn_output, training=training)

        return self.final_layer(y)


In [4]:
# Model parameters
embed_dim = 256  # Embedding size
num_heads = 8    # Number of attention heads
ff_dim = 512     # Feed-forward network size
num_layers = 4   # Number of encoder/decoder layers

transformer = Transformer(vocab_size_pali, vocab_size_english, embed_dim, num_heads, ff_dim, num_layers)

# Compile the model with accuracy metric
transformer.compile(optimizer='adam',
                    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                    metrics=['accuracy'])

# Train the model for 10 epochs with a batch size of 124
transformer.fit([X_train, target_sequences_input[:len(X_train)]],
                y_train,
                batch_size=16,  # or 32
                epochs=10)



Epoch 1/10


I0000 00:00:1728976866.833967      82 service.cc:145] XLA service 0x793ed4003e70 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1728976866.834020      82 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1728976866.834024      82 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
W0000 00:00:1728976868.619541      82 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert









I0000 00:00:1728976943.224366      82 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m6607/6608[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 618ms/step - accuracy: 0.9753 - loss: 0.3292
















[1m6608/6608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4265s[0m 631ms/step - accuracy: 0.9753 - loss: 0.3292
Epoch 2/10
[1m6608/6608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4033s[0m 610ms/step - accuracy: 0.9766 - loss: 0.2739
Epoch 3/10
[1m6608/6608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4015s[0m 608ms/step - accuracy: 0.9767 - loss: 0.2723
Epoch 4/10
[1m6608/6608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4012s[0m 607ms/step - accuracy: 0.9767 - loss: 0.2719
Epoch 5/10
[1m6608/6608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4012s[0m 607ms/step - accuracy: 0.9767 - loss: 0.2719
Epoch 6/10
[1m6608/6608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4011s[0m 607ms/step - accuracy: 0.9767 - loss: 0.2713
Epoch 7/10
[1m6608/6608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4010s[0m 607ms/step - accuracy: 0.9767 - loss: 0.2718
Epoch 8/10
[1m6608/6608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4009s[0m 607ms/step - accuracy: 0.9766 - loss: 0.

<keras.src.callbacks.history.History at 0x793fbefc5f60>

In [5]:
# Evaluate on the validation set
val_loss, val_accuracy = transformer.evaluate([X_val, target_sequences_input[len(X_train):]], y_val)

print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")


W0000 00:00:1729017211.134732      81 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert






[1m825/826[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 396ms/step - accuracy: 0.9768 - loss: 0.2701

W0000 00:00:1729017556.263868      79 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert




[1m826/826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m364s[0m 417ms/step - accuracy: 0.9768 - loss: 0.2701
Validation Loss: 0.27197015285491943
Validation Accuracy: 0.9765858054161072
