In [2]:
import re
import sentencepiece as spm

In [3]:
# Train SentencePiece tokenizer
spm.SentencePieceTrainer.train(
    input='/kaggle/input/20000data/20000data.txt', 
    model_prefix='malayalam_bpe', 
    vocab_size=5000, 
    model_type='bpe',  # Byte Pair Encoding
    character_coverage=1.0
)

# Load the trained tokenizer
sp = spm.SentencePieceProcessor(model_file='malayalam_bpe.model')

# Example tokenization
text = "മലയാളം നമുക്ക് അറിയാം"
tokens = sp.encode(text, out_type=str)
print(tokens)

['▁മലയാളം', '▁ന', 'മു', 'ക്ക്', '▁അറിയ', 'ാം']


In [4]:
import torch
from torch.utils.data import Dataset, DataLoader

class MalayalamDataset(Dataset):
    def __init__(self, file_path, tokenizer, context_length=256):
        with open(file_path, 'r', encoding='utf-8') as f:
            self.text = f.read()
        self.tokenizer = tokenizer
        self.tokens = self.tokenizer.encode(self.text, out_type=int)
        self.context_length = context_length

    def __len__(self):
        return len(self.tokens) - self.context_length

    def __getitem__(self, idx):
        input_ids = self.tokens[idx : idx + self.context_length]
        target_ids = self.tokens[idx + 1 : idx + self.context_length + 1]
        return torch.tensor(input_ids), torch.tensor(target_ids)

# Load tokenizer and dataset
sp = spm.SentencePieceProcessor(model_file='malayalam_bpe.model')
dataset = MalayalamDataset('/kaggle/input/20000data/20000data.txt', sp)

In [5]:

# DataLoader for batching
train_loader = DataLoader(dataset, batch_size=8, shuffle=True)

# Test sample
for x, y in train_loader:
    print(x.shape, y.shape)
    break

import torch.nn as nn

class MiniGPT(nn.Module):
    def __init__(self, vocab_size=5000, n_embd=256, n_layer=1, n_head=4, context_length=256):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, n_embd)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=n_embd, nhead=n_head, dim_feedforward=4 * n_embd),
            num_layers=n_layer
        )
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer(x)
        return self.lm_head(x)

# Initialize model
model = MiniGPT()
print(sum(p.numel() for p in model.parameters()), "Total Parameters")

torch.Size([8, 256]) torch.Size([8, 256])
3354760 Total Parameters




In [6]:
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MiniGPT().to(device)
optimizer = optim.AdamW(model.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss()

# Training Loop
for epoch in range(3):  # Adjust epochs
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        logits = model(x)
        loss = criterion(logits.view(-1, logits.size(-1)), y.view(-1))
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch + 1}, Loss: {loss.item():.4f}")

Epoch 1, Loss: 4.5313
Epoch 2, Loss: 4.4315
Epoch 3, Loss: 4.4553


In [7]:
import torch.nn.functional as F

def generate_text(model, tokenizer, start_text="മലയാളം", max_len=100):
    model.eval()
    input_ids = torch.tensor(tokenizer.encode(start_text, out_type=int)).unsqueeze(0).to(device)
    
    for _ in range(max_len):
        with torch.no_grad():
            logits = model(input_ids)
        next_token = torch.argmax(F.softmax(logits[:, -1, :], dim=-1), dim=-1)
        input_ids = torch.cat([input_ids, next_token.unsqueeze(0)], dim=1)
    
    return tokenizer.decode(input_ids.squeeze(0).tolist())

print(generate_text(model, sp, "മലയാളം", max_len=50))

മലയാളം തമിഴ് നാട്ടിൽ നിന്നും വ്യത്യസ്തമായിരിക്കും ഈ പ്രദേശങ്ങളും ഈ പ്രദേശങ്ങളും ഈ പ്രദേശങ്ങളും ഈ പ്രദേശങ്ങളും ഈ പ്രദേശങ്ങളും ഈ പ്രദേശങ്ങളും ഈ പ്രദേശങ്ങളും ഈ പ്രദേശങ്ങളും ഈ പ്രദേശങ്ങളും ഈ പ്രദേശങ്ങളും ഈ പ്രദേശങ്ങളും ഈ പ്രദേശങ്ങളും ഈ പ്രദേശങ്ങളും ഈ പ്രദേശങ്ങളും ഈ പ്രദേശ
