In [1]:

import os, sys
from pathlib import Path
import platform
import torch

# Environment tweaks: limit CUDA allocator block splitting (reduces fragmentation)
# and turn off tokenizers parallelism.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Pick the GPU when one is available, otherwise run on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
if device.type == "cuda":
    print("GPU:", torch.cuda.get_device_name(0))

# Project root: prefer ~/Desktop/HindiToEnglishMT when that folder exists.
home = Path.home()
desktop = home / "Desktop"
base_dir = desktop / "HindiToEnglishMT"
if not base_dir.exists():
    # Fallback is the PARENT of the current working directory, not the cwd itself.
    # NOTE(review): presumably the notebook lives in a sub-folder of the project — confirm.
    base_dir = Path.cwd().resolve().parent
print("Base dir:", base_dir)

# Make <base_dir>/utils importable. The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate entry to sys.path each time.
utils_dir = str(base_dir / "utils")
if utils_dir not in sys.path:
    sys.path.append(utils_dir)
from dataset_utils import ensure_dirs, load_iitb, ds_to_pairs, clean_pairs, save_pairs_to_csv, split_train_val_test

ensure_dirs(base_dir)


Using device: cuda
GPU: NVIDIA GeForce RTX 3050 Laptop GPU
Base dir: C:\Users\ashwi\OneDrive\Desktop\HindiToEnglishMT


  from .autonotebook import tqdm as notebook_tqdm


In [3]:

# Load IITB dataset (requires internet the first time)
from datasets import load_dataset
ds = load_dataset("cfilt/iitb-english-hindi")
print(ds)

# Convert each split with the project helper ds_to_pairs; later cells unpack
# every element as a (hi, en) pair.
train_pairs = ds_to_pairs(ds["train"])
val_pairs = ds_to_pairs(ds["validation"])
test_pairs = ds_to_pairs(ds["test"])

print("Raw counts:", len(train_pairs), len(val_pairs), len(test_pairs))

# Clean every split independently with the project helper clean_pairs.
train_pairs = clean_pairs(train_pairs)
val_pairs = clean_pairs(val_pairs)
test_pairs = clean_pairs(test_pairs)

print("Cleaned counts:", len(train_pairs), len(val_pairs), len(test_pairs))

# Corpus for tokenizer training: train + validation only.
# The test split is deliberately excluded so the SentencePiece vocabulary is not
# fitted on held-out evaluation data; the three splits themselves stay separate.
all_for_spm = train_pairs + val_pairs


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Generating train split: 100%|██████████| 1659083/1659083 [00:02<00:00, 791737.55 examples/s] 
Generating validation split: 100%|██████████| 520/520 [00:00<00:00, 76225.42 examples/s]
Generating test split: 100%|██████████| 2507/2507 [00:00<00:00, 201493.12 examples/s]


DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})
Raw counts: 1659083 520 2507
Cleaned counts: 1603359 520 2506


In [4]:

# Save cleaned CSVs
import pandas as pd
processed_dir = base_dir / "data" / "processed"
processed_dir.mkdir(parents=True, exist_ok=True)

# One CSV per split, written by the project helper save_pairs_to_csv.
save_pairs_to_csv(train_pairs, processed_dir / "train.csv")
save_pairs_to_csv(val_pairs, processed_dir / "val.csv")
save_pairs_to_csv(test_pairs, processed_dir / "test.csv")

# Sanity-check preview. nrows=5 parses only the first five rows instead of
# loading the entire training CSV just to display its head.
pd.read_csv(processed_dir / "train.csv", nrows=5)


Unnamed: 0,hi,en
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,Give your application an accessibility workout
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,Accerciser Accessibility Explorer
2,निचले पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the bottom panel
3,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the top panel
4,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...,A list of plugins that are disabled by default


In [5]:

# Fit a single SentencePiece model shared by Hindi and English
import sentencepiece as spm
from pathlib import Path

vocab_dir = base_dir / "models" / "vocab"
vocab_dir.mkdir(parents=True, exist_ok=True)

# Dump the training corpus as plain text: for every pair, the stripped Hindi
# sentence on one line followed by the stripped English sentence on the next.
spm_input_path = processed_dir / "spm_corpus.txt"
with open(spm_input_path, "w", encoding="utf-8") as corpus_file:
    for hi, en in all_for_spm:
        corpus_file.write(f"{hi.strip()}\n{en.strip()}\n")

vocab_size = 32000
model_prefix = str(vocab_dir / "hi_en_unigram")

# Trainer options gathered in one place; special-token ids are fixed explicitly
# (pad=0, bos=1, eos=2, unk=3) so later cells can rely on them.
spm_train_options = dict(
    input=str(spm_input_path),
    model_prefix=model_prefix,
    vocab_size=vocab_size,
    character_coverage=1.0,
    model_type="unigram",
    input_sentence_size=2000000,
    shuffle_input_sentence=True,
    bos_id=1,
    eos_id=2,
    pad_id=0,
    unk_id=3,
)
spm.SentencePieceTrainer.Train(**spm_train_options)

print("Trained SentencePiece saved to:", f"{model_prefix}.model")


Trained SentencePiece saved to: C:\Users\ashwi\OneDrive\Desktop\HindiToEnglishMT\models\vocab\hi_en_unigram.model


In [6]:

# Encode datasets using the trained SentencePiece model
import sentencepiece as spm
import pandas as pd
import numpy as np

# Load the trained model from vocab_dir and read its special-token ids once,
# so the encoding helpers below can reuse them.
spm_model_file = str(vocab_dir / "hi_en_unigram.model")
sp = spm.SentencePieceProcessor(model_file=spm_model_file)
pad_id, bos_id, eos_id, unk_id = sp.pad_id(), sp.bos_id(), sp.eos_id(), sp.unk_id()
print("Token IDs — PAD:", pad_id, "BOS:", bos_id, "EOS:", eos_id, "UNK:", unk_id)

def encode_pair(hi, en):
    """Encode one sentence pair into token-id lists.

    Both the source (`hi`) and the target (`en`) are tokenized with the
    module-level processor `sp` and wrapped as ``[bos_id, ..., eos_id]``.

    Returns:
        A ``(src, tgt)`` tuple of id lists.
    """
    def _with_markers(text):
        # Same framing for both sides: BOS first, EOS last.
        return [bos_id, *sp.encode(text, out_type=int), eos_id]

    return _with_markers(hi), _with_markers(en)

def encode_split(pairs):
    """Encode every pair of a split with `encode_pair`.

    Args:
        pairs: iterable whose elements unpack as ``(hi, en)``.

    Returns:
        ``(src_ids, tgt_ids)``: two lists in the same order as `pairs`,
        holding the source and target id sequences respectively.
    """
    encoded = [encode_pair(hi, en) for hi, en in pairs]
    src_ids = [src for src, _ in encoded]
    tgt_ids = [tgt for _, tgt in encoded]
    return src_ids, tgt_ids

# Encode the three splits in order: train, validation, test.
(train_src, train_tgt), (val_src, val_tgt), (test_src, test_tgt) = [
    encode_split(split_pairs)
    for split_pairs in (train_pairs, val_pairs, test_pairs)
]

import json
def save_jsonl(path, src_ids, tgt_ids):
    """Write paired id sequences to `path` as JSON Lines.

    Each output line is a JSON object ``{"src": ..., "tgt": ...}`` built from
    the items of `src_ids` and `tgt_ids` at the same position. The sequences
    are paired with ``zip``, so output stops at the shorter of the two.
    The file is opened in write mode (UTF-8), replacing any existing content.
    """
    with open(path, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps({"src": src, "tgt": tgt}, ensure_ascii=False) + "\n"
            for src, tgt in zip(src_ids, tgt_ids)
        )

# Persist each tokenized split next to the cleaned CSVs, one JSONL file per split.
tokenized_outputs = [
    ("train_tokenized.jsonl", train_src, train_tgt),
    ("val_tokenized.jsonl", val_src, val_tgt),
    ("test_tokenized.jsonl", test_src, test_tgt),
]
for out_name, split_src, split_tgt in tokenized_outputs:
    save_jsonl(processed_dir / out_name, split_src, split_tgt)

print("Saved tokenized splits.")


Token IDs — PAD: 0 BOS: 1 EOS: 2 UNK: 3
Saved tokenized splits.
