In [1]:
from pathlib import Path
import random


`Path` (from `pathlib`) → safe, portable file-path handling  
`random` → picking random samples for data inspection

In [2]:
# Project directory layout (all paths are relative to the working directory).
DATA_RAW = Path("data", "raw")              # original untouched data (read-only)
DATA_PROCESSED = Path("data", "processed")  # cleaned + merged data goes here (model reads from here)
TOKENIZER_DIR = Path("tokenizer")           # SentencePiece models will be saved here

# Create the output folders (and any missing parents); no-op if they already exist.
for output_dir in (DATA_PROCESSED, TOKENIZER_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)


In [3]:
def read_lines(path):
    """Read a UTF-8 text file and return its lines, split on newlines only.

    ``str.splitlines()`` is deliberately avoided: besides ``\\n`` it also
    breaks on ``\\v``, ``\\f``, ``\\x1c``-``\\x1e``, ``\\x85``, ``\\u2028`` and
    ``\\u2029``. If one of those characters occurs inside a sentence, that
    sentence is cut in two and every later line of the file shifts out of
    alignment with its parallel file.

    Args:
        path: ``pathlib.Path`` of the file to read.

    Returns:
        list[str]: one entry per line, without line terminators. A single
        trailing newline at end of file does not produce an extra empty entry.
    """
    # read_text() opens the file in text mode, so "\r\n" and "\r" have
    # already been translated to "\n" by the time we split.
    text = path.read_text(encoding="utf-8")
    lines = text.split("\n")
    if lines[-1] == "":
        # Drop the empty string that follows a final newline (or an empty file).
        lines.pop()
    return lines


# BPCC
bpcc_en = read_lines(DATA_RAW / "bpcc/train.en")
bpcc_sa = read_lines(DATA_RAW / "bpcc/train.sa")

# Samayik
sam_train_en = read_lines(DATA_RAW / "samayik/train.en")
sam_train_sa = read_lines(DATA_RAW / "samayik/train.sa")

sam_dev_en = read_lines(DATA_RAW / "samayik/dev.en")
sam_dev_sa = read_lines(DATA_RAW / "samayik/dev.sa")

sam_test_en = read_lines(DATA_RAW / "samayik/test.en")
sam_test_sa = read_lines(DATA_RAW / "samayik/test.sa")

# Load Itihasa (parallel, aligned); note the Sanskrit files use the ".sn" extension
iti_train_en = read_lines(DATA_RAW / "itihasa/train.en")
iti_train_sa = read_lines(DATA_RAW / "itihasa/train.sn")

iti_dev_en = read_lines(DATA_RAW / "itihasa/dev.en")
iti_dev_sa = read_lines(DATA_RAW / "itihasa/dev.sn")

iti_test_en = read_lines(DATA_RAW / "itihasa/test.en")
iti_test_sa = read_lines(DATA_RAW / "itihasa/test.sn")



Each file is read as UTF-8 (required for Devanagari text) and split into lines, giving one `List[str]` per file with one entry per line.

In [4]:
# Show the EN and SA line counts of every corpus split side by side,
# so that any length mismatch between the two languages is easy to spot.
corpus_pairs = [
    ("BPCC:", bpcc_en, bpcc_sa),
    ("Samayik train:", sam_train_en, sam_train_sa),
    ("Samayik dev:", sam_dev_en, sam_dev_sa),
    ("Samayik test:", sam_test_en, sam_test_sa),
    ("Itihasa train:", iti_train_en, iti_train_sa),
    ("Itihasa dev:", iti_dev_en, iti_dev_sa),
    ("Itihasa test:", iti_test_en, iti_test_sa),
]

for label, en_lines, sa_lines in corpus_pairs:
    print(label, len(en_lines), len(sa_lines))



BPCC: 98788 99424
Samayik train: 43493 43493
Samayik dev: 2416 2416
Samayik test: 2417 2417
Itihasa train: 75161 75161
Itihasa dev: 6148 6148
Itihasa test: 11721 11721


In [5]:
# Show the first line of each BPCC file (English, then Sanskrit).
for bpcc_side in (bpcc_en, bpcc_sa):
    print(bpcc_side[0])

There was no Mughal tradition of primogeniture, the systematic passing of rule, upon an emperor's death, to his eldest son.
चक्रवर्तिनः मृत्योः अनन्तरं तस्य शासनस्य व्यवस्थितरूपेण सङ्क्रमणस्य, मुघलपरम्परायाः ज्येष्ठपुत्राधिकारपद्धतिः नासीत्।


The BPCC English and Sanskrit files have different line counts, so they are not aligned line by line. We'll use BPCC only for tokenizer training, not for model training.

In [6]:
# Spot-check one randomly chosen Itihasa training pair (same index on both sides).
sample_idx = random.randrange(len(iti_train_en))
for tag, corpus in (("EN:", iti_train_en), ("SA:", iti_train_sa)):
    print(tag, corpus[sample_idx])


EN: O Rishi, then they obtain high fortune and happiness. If one however cannot acquire knowledge, he takes an inferior birth. The fruits of acts performed in this world are reaped in the next. O Brahmana, this worid has been declared to be one of acts.
SA: तत्रापि स महाभागः सुखभागभिजायते। न चेत् सम्बुध्यते तत्र गच्छत्यधमतां ततः॥ इह यत् क्रियते कर्म तत् परत्रोपभुज्यते। कर्मभूमिरियं ब्रह्मन् फलभूमिरसौ मता॥


In [7]:
# Spot-check one randomly chosen Samayik training pair (same index on both sides).
sample_idx = random.randrange(len(sam_train_en))
for tag, corpus in (("EN:", sam_train_en), ("SA:", sam_train_sa)):
    print(tag, corpus[sample_idx])

EN: """For if thou wert cut out of the olive tree which is wild by nature, and wert graffed contrary to nature into a good olive tree: how much more shall these, which be the natural branches, be graffed into their own olive tree?"""
SA: वन्यजितवृक्षस्य शाखा सन् त्वं यदि ततश्छिन्नो रीतिव्यत्ययेनोत्तमजितवृक्षे रोेेपितोऽभवस्तर्हि तस्य वृक्षस्य स्वीया याः शाखास्ताः किं पुनः स्ववृक्षे संलगितुं न शक्नुवन्ति?


In [8]:
# Translation training corpus: Samayik + Itihasa training splits.
# Each EN list must line up index-for-index with its SA list; if the lengths
# differ, every pair after the first gap would be silently mismatched, so
# fail loudly here instead of training on misaligned data.
for corpus_name, en_lines, sa_lines in (
    ("Samayik train", sam_train_en, sam_train_sa),
    ("Itihasa train", iti_train_en, iti_train_sa),
):
    if len(en_lines) != len(sa_lines):
        raise ValueError(
            f"{corpus_name}: EN/SA line counts differ "
            f"({len(en_lines)} vs {len(sa_lines)})"
        )

train_en = sam_train_en + iti_train_en
train_sa = sam_train_sa + iti_train_sa

print("Final TRAIN size:", len(train_en))


Final TRAIN size: 118654


In [9]:
# Tokenizer corpus: BPCC lines followed by the translation training lines.
# The EN and SA lists are built independently of each other.
tok_en = [*bpcc_en, *train_en]
tok_sa = [*bpcc_sa, *train_sa]

print("Tokenizer corpus:")
for tag, corpus in (("EN sentences:", tok_en), ("SA sentences:", tok_sa)):
    print(tag, len(corpus))


Tokenizer corpus:
EN sentences: 217442
SA sentences: 218078


In [10]:
# Length (in whitespace-separated words) of every non-blank English training line.
# A line with no words at all (empty or whitespace-only) is left out.
en_lengths = [len(words) for words in map(str.split, train_en) if words]

en_count = len(en_lengths)
print("EN sentences:", en_count)
print("EN min:", min(en_lengths))
print("EN max:", max(en_lengths))
print("EN avg:", sum(en_lengths) / en_count)


EN sentences: 118654
EN min: 1
EN max: 1306
EN avg: 24.342188211101185


In [11]:
# Length (in characters) of every non-blank Sanskrit training line.
# NOTE(review): these are character counts, whereas the English statistics
# count whitespace-separated words - confirm that the difference is intended.
sa_lengths = [len(line) for line in train_sa if line and not line.isspace()]

sa_count = len(sa_lengths)
print("SA sentences:", sa_count)
print("SA min:", min(sa_lengths))
print("SA max:", max(sa_lengths))
print("SA avg:", sum(sa_lengths) / sa_count)


SA sentences: 118654
SA min: 1
SA max: 2688
SA avg: 87.45049471572808


In [12]:
# Dump the English tokenizer corpus, one entry per line (no trailing newline).
# The call is left as the final expression so the cell displays the number of
# characters written.
tok_en_path = DATA_PROCESSED / "tok_en.txt"
tok_en_path.write_text("\n".join(tok_en), encoding="utf-8")


25313780

In [13]:
# Dump the Sanskrit tokenizer corpus, one entry per line (no trailing newline).
# The call is left as the final expression so the cell displays the number of
# characters written.
tok_sa_path = DATA_PROCESSED / "tok_sa.txt"
tok_sa_path.write_text("\n".join(tok_sa), encoding="utf-8")


19646646

In [14]:
# English tokenizer: reuse the saved model when present, otherwise train one.
# Only the existence of the .model file is checked; its contents are not.
spm_en_model = TOKENIZER_DIR / "spm_en.model"

if spm_en_model.exists():
    print("English tokenizer already exists. Skipping training.")
else:
    # Imported lazily so sentencepiece is only needed when training actually runs.
    import sentencepiece as spm

    spm.SentencePieceTrainer.train(
        input=str(DATA_PROCESSED / "tok_en.txt"),
        model_prefix=str(TOKENIZER_DIR / "spm_en"),
        vocab_size=16000,
        model_type="unigram",
        character_coverage=1.0,
        # Special-token ids are pinned explicitly.
        pad_id=0,
        bos_id=1,
        eos_id=2,
        unk_id=3,
    )


English tokenizer already exists. Skipping training.


In [15]:
# Sanskrit tokenizer: reuse the saved model when present, otherwise train one.
# Only the existence of the .model file is checked; its contents are not.
spm_sa_model = TOKENIZER_DIR / "spm_sa.model"

if spm_sa_model.exists():
    print("Sanskrit tokenizer already exists. Skipping training.")
else:
    # Imported lazily so sentencepiece is only needed when training actually runs.
    import sentencepiece as spm

    spm.SentencePieceTrainer.train(
        input=str(DATA_PROCESSED / "tok_sa.txt"),
        model_prefix=str(TOKENIZER_DIR / "spm_sa"),
        vocab_size=32000,
        model_type="unigram",
        character_coverage=1.0,
        # Special-token ids are pinned explicitly.
        pad_id=0,
        bos_id=1,
        eos_id=2,
        unk_id=3,
    )


Sanskrit tokenizer already exists. Skipping training.


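**Tokenizer training (run once only).**
Do not retrain unless the data changes. The two cells above skip training whenever the corresponding `.model` file already exists, so those files must be removed first to force a retrain.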

In [16]:
import torch

# Report whether a CUDA device is usable and, if so, the name of device 0.
has_cuda = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if has_cuda else "None"

print("CUDA available:", has_cuda)
print("GPU name:", gpu_name)


CUDA available: True
GPU name: NVIDIA GeForce RTX 2050


In [17]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")