🧩 STEP 1: Install & Import Libraries

In [1]:
!pip install -q tensorflow
import tensorflow as tf
import numpy as np
import pandas as pd
import re
from google.colab import files


📁 STEP 2: Upload Dataset

In [2]:
uploaded = files.upload()


Saving sandhi_splitting_dataset.txt to sandhi_splitting_dataset.txt


🧹 STEP 3: Load and Process Dataset

In [3]:
input_texts = []
target_texts = []

with open('sandhi_splitting_dataset.txt', 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split('\t')
        if len(parts) == 3:
            compound, part1, part2 = parts
            input_texts.append(compound)
            target_texts.append(part1 + '+' + part2)


🔡 STEP 4: Vocabulary and Encoding Setup

In [4]:
def get_vocab(texts):
    vocab = set()
    for txt in texts:
        for ch in txt:
            vocab.add(ch)
    return sorted(vocab)

special_tokens = ['<pad>', '<start>', '<end>']
inp_vocab = special_tokens + get_vocab(input_texts)
tgt_vocab = special_tokens + get_vocab(target_texts)

inp_char2idx = {u: i for i, u in enumerate(inp_vocab)}
inp_idx2char = {i: u for i, u in enumerate(inp_vocab)}
tgt_char2idx = {u: i for i, u in enumerate(tgt_vocab)}
tgt_idx2char = {i: u for i, u in enumerate(tgt_vocab)}


📏 STEP 5: Encode Texts & Pad Sequences

In [5]:
max_length = 40
max_target_length = 50

def encode_text(text, char2idx, add_tokens=False):
    seq = [char2idx[c] for c in text if c in char2idx]
    if add_tokens:
        seq = [char2idx['<start>']] + seq + [char2idx['<end>']]
    return seq

input_tensor = tf.keras.preprocessing.sequence.pad_sequences(
    [encode_text(txt, inp_char2idx) for txt in input_texts],
    maxlen=max_length, padding='post'
)

target_tensor = tf.keras.preprocessing.sequence.pad_sequences(
    [encode_text(txt, tgt_char2idx, add_tokens=True) for txt in target_texts],
    maxlen=max_target_length, padding='post'
)


🧪 STEP 6: Prepare Dataset for Training

In [6]:
BATCH_SIZE = 64
BUFFER_SIZE = len(input_tensor)

dataset = tf.data.Dataset.from_tensor_slices((input_tensor, target_tensor))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)


🧠 STEP 7: Define Transformer Architecture

In [7]:
class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, position, d_model):
        super().__init__()
        self.pos_encoding = self._positional_encoding(position, d_model)

    def _positional_encoding(self, position, d_model):
        angle_rads = np.arange(position)[:, np.newaxis] / np.power(
            10000, (2 * (np.arange(d_model)[np.newaxis, :] // 2)) / np.float32(d_model)
        )
        sines = np.sin(angle_rads[:, 0::2])
        cosines = np.cos(angle_rads[:, 1::2])
        pos_encoding = np.concatenate([sines, cosines], axis=-1)
        return tf.cast(pos_encoding[np.newaxis, ...], dtype=tf.float32)

    def call(self, x):
        return x + self.pos_encoding[:, :tf.shape(x)[1], :]


In [8]:
from tensorflow.keras.layers import Embedding, MultiHeadAttention, Dense, LayerNormalization, Input

def build_transformer(vocab_size_inp, vocab_size_tar, d_model=128, num_heads=2, ff_dim=512):
    inp = Input(shape=(None,))
    tar = Input(shape=(None,))

    inp_emb = Embedding(vocab_size_inp, d_model)(inp)
    inp_emb = PositionalEncoding(1000, d_model)(inp_emb)

    tar_emb = Embedding(vocab_size_tar, d_model)(tar)
    tar_emb = PositionalEncoding(1000, d_model)(tar_emb)

    attn_output = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)(tar_emb, inp_emb, inp_emb)
    attn_output = LayerNormalization()(attn_output + tar_emb)

    ff_output = Dense(ff_dim, activation='relu')(attn_output)
    ff_output = Dense(d_model)(ff_output)
    ff_output = LayerNormalization()(ff_output + attn_output)

    outputs = Dense(vocab_size_tar, activation='softmax')(ff_output)

    model = tf.keras.Model(inputs=[inp, tar], outputs=outputs)
    return model


🧑‍🏫 STEP 8: Compile and Train the Model

In [9]:
model = build_transformer(len(inp_char2idx), len(tgt_char2idx))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

def prepare_batch(inp, tar):
    decoder_input = tar[:, :-1]
    decoder_target = tar[:, 1:]
    return (inp, decoder_input), decoder_target

model.fit(dataset.map(prepare_batch), epochs=20)


Epoch 1/10
[1m3720/3720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 6ms/step - loss: 0.2298
Epoch 2/10
[1m3720/3720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 5ms/step - loss: 0.0447
Epoch 3/10
[1m3720/3720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 5ms/step - loss: 0.0375
Epoch 4/10
[1m3720/3720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 5ms/step - loss: 0.0336
Epoch 5/10
[1m3720/3720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 5ms/step - loss: 0.0311
Epoch 6/10
[1m3720/3720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 5ms/step - loss: 0.0297
Epoch 7/10
[1m3720/3720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 5ms/step - loss: 0.0283
Epoch 8/10
[1m3720/3720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 5ms/step - loss: 0.0271
Epoch 9/10
[1m3720/3720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 5ms/step - loss: 0.0263
Epoch 10/10
[1m3720/3720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x78a0d5e56990>

🔍 STEP 9: Define Inference Function

In [10]:
def predict_split(word):
    word = word.strip().replace('\t', '').replace('\n', '')
    encoded_input = encode_text(word, inp_char2idx)

    encoder_input = tf.keras.preprocessing.sequence.pad_sequences(
        [encoded_input], maxlen=max_length, padding='post'
    )
    encoder_input = tf.convert_to_tensor(encoder_input)

    decoder_input = tf.expand_dims([tgt_char2idx['<start>']], 0)
    result = []

    for _ in range(max_target_length):
        predictions = model([encoder_input, decoder_input])
        predicted_id = tf.argmax(predictions[0, -1, :]).numpy()
        predicted_char = tgt_idx2char[predicted_id]
        if predicted_char == '<end>':
            break
        result.append(predicted_char)
        decoder_input = tf.concat([decoder_input, [[predicted_id]]], axis=-1)

    return ''.join(result)


🧪 STEP 10: Evaluate Accuracy on Validation Data

In [11]:
from sklearn.model_selection import train_test_split

train_input, val_input, train_target, val_target = train_test_split(
    input_tensor, target_tensor, test_size=0.1, random_state=42
)

def evaluate_accuracy(n=500):
    correct = 0
    for i in range(n):
        inp_chars = [inp_idx2char[idx] for idx in val_input[i] if idx > 2]
        expected = ''.join([tgt_idx2char[idx] for idx in val_target[i] if idx > 2]).replace('<start>', '').replace('<end>', '')
        predicted = predict_split(''.join(inp_chars))
        if predicted == expected:
            correct += 1
    print(f"✅ Accuracy on {n} samples: {correct / n:.2%}")

evaluate_accuracy()


✅ Accuracy on 500 samples: 63.40%


💾 STEP 11: Save the Model

In [12]:
model.save('/content/malayalam_sandhi_transformer.h5')




🖥️ STEP 12: Gradio UI

In [23]:
!pip install gradio --quiet


In [30]:
gr.Interface(
    fn=predict_split,
    inputs=gr.Textbox(label="🔤 Malayalam Compound Word"),
    outputs=gr.Textbox(label="🔍 Predicted Sandhi Split"),
    title="🌺 Malayalam Sandhi Splitter",
    description="Enter a compound Malayalam word to see its Sandhi split using a Transformer model.",
).launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f7d022aef13b07fe02.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


