### 🔑 Step 1: Load your vocabs


In [4]:
import json

# Load the token -> id vocabularies for both scripts from their JSON files.
with open("vocab_Urdu.json", "r", encoding="utf-8") as f:
    vocab_urdu = json.loads(f.read())

with open("vocab_Roman.json", "r", encoding="utf-8") as f:
    vocab_roman = json.loads(f.read())

# Inverse lookups (id -> token), kept around in case ids need to be mapped
# back to their subword strings.
id2urdu = dict(zip(vocab_urdu.values(), vocab_urdu.keys()))
id2roman = dict(zip(vocab_roman.values(), vocab_roman.keys()))


### 🔑 Step 2: Load your dataset

In [5]:
# Read both sides of the parallel corpus. Every line is whitespace-trimmed
# and lines that end up empty are discarded.
with open("CombinedData/src.txt", encoding="utf-8") as f:
    urdu_lines = [text for text in map(str.strip, f) if text]

with open("CombinedData/tgt_normalized.txt", encoding="utf-8") as f:
    roman_lines = [text for text in map(str.strip, f) if text]

# The two sides must contain the same number of sentences.
# NOTE(review): blank lines are dropped independently per file, so equal
# counts do not by themselves prove the pairs are still aligned — confirm.
assert len(urdu_lines) == len(roman_lines), "Mismatch between Urdu and Roman lines!"


In [6]:
def encode_sentence(sentence, token2id, is_urdu=True):
    """
    Convert a sentence into a list of subword ids, wrapped in <sos>/<eos>.

    The sentence is split on whitespace and each word receives a "_"
    boundary marker: in front of the word when `is_urdu` is true, behind
    it otherwise. Each marked word is then segmented left to right, always
    taking the longest prefix of the remaining text that is a key of
    `token2id`. When no prefix matches, a single character is consumed and
    the id of "<unk>" is emitted for it.

    Args:
        sentence: Text to encode.
        token2id: Mapping from subword string to id. Must contain "<sos>"
            and "<eos>", and "<unk>" whenever an unmatched character occurs.
        is_urdu: Selects on which side of each word the "_" marker goes.

    Returns:
        A list of ids: [<sos>, subword ids..., <eos>].
    """
    # Attach the boundary marker on the side selected by `is_urdu`.
    if is_urdu:
        marked_words = ["_" + word for word in sentence.split()]
    else:
        marked_words = [word + "_" for word in sentence.split()]

    ids = []
    for word in marked_words:
        word_len = len(word)
        start = 0
        while start < word_len:
            # Candidates are scanned from the longest slice down to a single
            # character; the first one known to the vocab wins.
            longest = next(
                (
                    word[start:end]
                    for end in range(word_len, start, -1)
                    if word[start:end] in token2id
                ),
                None,
            )
            if longest is not None:
                ids.append(token2id[longest])
                start += len(longest)
            else:
                # Nothing matched: emit <unk> for this one character.
                ids.append(token2id["<unk>"])
                start += 1

    # Frame the sequence with the start / end markers.
    return [token2id["<sos>"], *ids, token2id["<eos>"]]


In [7]:
# ---------- Encode dataset ----------
# Encode each (Urdu, Roman) sentence pair with its own vocabulary, source
# side first, then split the pairs into the two parallel id lists.
encoded_pairs = [
    (
        encode_sentence(urdu, vocab_urdu, is_urdu=True),
        encode_sentence(roman, vocab_roman, is_urdu=False),
    )
    for urdu, roman in zip(urdu_lines, roman_lines)
]
src_ids = [src_seq for src_seq, _ in encoded_pairs]
tgt_ids = [tgt_seq for _, tgt_seq in encoded_pairs]



In [9]:
# Inspect the first three encoded source sequences.
src_ids[:3]

[[4, 18, 159, 116, 383, 234, 316, 114, 159, 24, 364, 383, 84, 296, 2],
 [4, 239, 268, 289, 343, 326, 299, 365, 326, 299, 84, 296, 2],
 [4, 24, 364, 444, 203, 352, 462, 234, 316, 100, 435, 364, 189, 159, 36, 2]]

In [17]:
# Inspect the first two encoded target sequences.
tgt_ids[:2]

[[15, 28, 409, 133, 333, 194, 125, 409, 472, 57, 225, 137, 13],
 [15, 481, 244, 271, 178, 170, 56, 432, 18, 178, 171, 225, 137, 13]]

In [None]:
import json
import torch

# Persist the full encoded corpus with torch.save. The stored values are the
# nested Python lists of token ids built above (not tensors).
encoded_dataset = {"src_ids": src_ids, "tgt_ids": tgt_ids}
torch.save(encoded_dataset, "encoded_dataset.pt")
print("Saved encoded dataset to encoded_dataset.pt")

# Also write a small human-readable JSON preview (first five pairs only)
# that is easy to open and inspect while debugging.
preview = {"src_ids": src_ids[:5], "tgt_ids": tgt_ids[:5]}
with open("encoded_dataset.json", "w", encoding="utf-8") as f:
    json.dump(preview, f, ensure_ascii=False, indent=2)
print("Preview saved to encoded_dataset.json")


In [None]:
import torch.nn as nn

embed_dim = 256
src_vocab_size = len(vocab_urdu)
tgt_vocab_size = len(vocab_roman)

# Source (Urdu) embedding; the <pad> id is registered as padding_idx.
src_embedding = nn.Embedding(src_vocab_size, embed_dim, padding_idx=vocab_urdu["<pad>"])

# Target (Roman) embedding; the <pad> id is registered as padding_idx.
tgt_embedding = nn.Embedding(tgt_vocab_size, embed_dim, padding_idx=vocab_roman["<pad>"])

# Example: embed a batch made of the first 2 source sentences.
# The encoded sentences have different lengths, so torch.tensor() on the raw
# nested list raises a ValueError. Each sequence is turned into its own
# LongTensor and the batch is right-padded with the <pad> id instead.
batch_src = nn.utils.rnn.pad_sequence(
    [torch.tensor(ids, dtype=torch.long) for ids in src_ids[:2]],
    batch_first=True,
    padding_value=vocab_urdu["<pad>"],
)  # shape: (batch_size, longest_seq_len)
embedded_src = src_embedding(batch_src)  # shape: (batch_size, seq_len, embed_dim)

print("Shape of embeddings:", embedded_src.shape)


In [3]:
def subword_tokenize(line, vocab):
    """
    Greedily segment `line` into the longest subwords found in `vocab`.

    The text is consumed left to right without any preprocessing, so
    whitespace characters take part in matching like any other character.
    At each position the longest slice that is a key of `vocab` is taken
    and its id is emitted. If no slice matches, one character is skipped
    and `vocab.get('<unk>')` is emitted for it (which is None when the
    vocab has no '<unk>' entry).

    Args:
        line: Text to tokenize.
        vocab: Mapping from subword string to id.

    Returns:
        A list with one id per matched subword or skipped character.
    """
    ids = []
    total = len(line)
    pos = 0
    while pos < total:
        # Shrink the window from the right until it is a known subword
        # (or until it collapses to nothing).
        end = total
        while end > pos and line[pos:end] not in vocab:
            end -= 1

        if end > pos:
            ids.append(vocab[line[pos:end]])
            pos = end
        else:
            # No subword starts here: fall back to the unknown-token id.
            ids.append(vocab.get('<unk>'))
            pos += 1
    return ids


In [4]:
# Target-side tokenization is currently switched off, so this list stays empty.
tgt_tokenized = []
#tgt_tokenized.append(subword_tokenize(tgt_line, vocab_roman))

# Walk both files in lockstep (stopping at the shorter one) and tokenize the
# source line of every pair. Lines are passed on exactly as read from the file.
with open('dataset/Combined/src_normalized.txt', 'r', encoding='utf-8') as src_file, \
     open('dataset/Combined/tgt_normalized.txt', 'r', encoding='utf-8') as tgt_file:
    src_tokenized = [
        subword_tokenize(src_line, vocab_urdu)
        for src_line, _ in zip(src_file, tgt_file)
    ]

print("SRC:", src_tokenized[0])
#print("TGT:", tgt_tokenized[0])

SRC: [22, 432, 0, 229, 171, 185, 352, 482, 0, 167, 229, 39, 185, 128, 448, 5]


In [None]:
# NOTE(review): this is a bare undefined name, so running the cell raises
# NameError. It looks like a scratch cell or a deliberate "Run All" stopper —
# confirm the intent and remove it (or replace it with an explicit guard).
dlfgldflgd

In [17]:
import sentencepiece as spm

# The corpus used for training is also the text that gets encoded below.
corpus_path = 'dataset/combined/tgt_normalized.txt'
encoded_path = 'dataset/combined/tgt_tokenized.txt'

# Train SentencePiece model (BPE). Writes urdu_bpe.model / urdu_bpe.vocab.
spm.SentencePieceTrainer.train(
    input=corpus_path,
    model_prefix='urdu_bpe',
    vocab_size=512,
    model_type='bpe',
    character_coverage=1.0,
    # Reserved ids: <pad>=0, <unk>=1, </s>=2; bos_id=-1 leaves out the BOS piece.
    pad_id=0,
    unk_id=1,
    eos_id=2,
    bos_id=-1
)

# Load trained model
sp = spm.SentencePieceProcessor(model_file='urdu_bpe.model')

# Tokenize and encode each line in the dataset; every output line holds the
# space-separated piece ids of the corresponding (stripped) input line.
with open(corpus_path, 'r', encoding='utf-8') as fin, \
     open(encoded_path, 'w', encoding='utf-8') as fout:
    for line in fin:
        line = line.strip()
        ids = sp.encode(line, out_type=int)
        fout.write(' '.join(map(str, ids)) + '\n')

# Report the file that was actually written (the old message named
# src_tokenized.txt, which this cell never creates).
print(f"Tokenization complete. Encoded lines saved to {encoded_path}")

Tokenization complete. Encoded lines saved to src_tokenized.txt


In [16]:
# List every piece of the trained model as "<id>: <piece>", in id order.
for piece_id, piece in enumerate(map(sp.id_to_piece, range(sp.get_piece_size()))):
    print(f"{piece_id}: {piece}")

0: <pad>
1: <unk>
2: </s>
3: ▁ک
4: ▁ہ
5: ▁م
6: یں
7: ▁ا
8: ▁ب
9: ▁ت
10: ▁س
11: ▁ن
12: ▁د
13: ▁ج
14: ▁ہے
15: یا
16: ▁پ
17: ▁گ
18: ▁ہو
19: ▁و
20: ▁میں
21: ▁آ
22: ▁ر
23: ▁نہ
24: ▁کو
25: ھی
26: ان
27: ▁سے
28: ▁کی
29: ار
30: وں
31: ▁خ
32: ▁کہ
33: ▁ل
34: ▁کے
35: ▁تو
36: اں
37: ▁چ
38: ▁ش
39: ئی
40: ▁کا
41: یر
42: ئے
43: ▁بھی
44: ▁ہیں
45: ▁ی
46: ھا
47: تا
48: ▁اس
49: ▁ہم
50: ▁کر
51: ▁نہیں
52: ▁کیا
53: ▁ع
54: ▁دل
55: تے
56: ور
57: ▁وہ
58: جھ
59: ▁یہ
60: نا
61: را
62: یک
63: ▁ح
64: ▁ز
65: ال
66: ات
67: ▁جو
68: ▁رہ
69: ▁جا
70: ھر
71: نے
72: ری
73: ▁ق
74: ▁ف
75: ام
76: ▁غ
77: ▁نے
78: ▁پر
79: ند
80: اہ
81: چھ
82: ▁خو
83: ▁تھا
84: ھو
85: ▁اب
86: ست
87: لا
88: ▁ص
89: ▁مجھ
90: ▁پہ
91: ▁ہوں
92: دا
93: ▁ہی
94: ▁دیک
95: ▁یا
96: ▁کوئی
97: رے
98: کھ
99: ▁ط
100: ▁ان
101: ▁تم
102: ▁بہ
103: ▁کس
104: ▁اپ
105: از
106: گر
107: ▁اور
108: تی
109: ▁کچھ
110: ▁جان
111: ▁میر
112: لے
113: ▁اک
114: ▁ہوا
115: ▁بے
116: ▁در
117: نی
118: اب
119: ▁رو
120: ▁دی
121: ▁سو
122: اد
123: نہ
124: ▁ہر
125: شق
126: ▁دیکھ
127: ▁پھر
128

In [None]:
# NOTE(review): this is a bare undefined name, so running the cell raises
# NameError. It looks like a scratch cell or a deliberate "Run All" stopper —
# confirm the intent and remove it (or replace it with an explicit guard).
svsvsd

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Load the pre-encoded split and pull out the four tensors needed for batching.
# Per the printed contents: src / tgt are 2-D padded id tensors, and
# src_lengths / tgt_lengths are 1-D tensors with one length per row.
data = torch.load("Encodings/4k/test.pt")
src, src_lengths, tgt, tgt_lengths = (
    data[key] for key in ("src", "src_lengths", "tgt", "tgt_lengths")
)

# Bundle the tensors row-wise and serve them in shuffled mini-batches of 32.
dataset = TensorDataset(src, src_lengths, tgt, tgt_lengths)
loader = DataLoader(dataset, shuffle=True, batch_size=32)
for batch in loader:
    # Each batch carries the four tensors in the order given to TensorDataset.
    src_batch, src_len_batch, tgt_batch, tgt_len_batch = batch
    # feed to model...


In [2]:
# Dump the whole loaded dict. This prints the tensors and the full vocab
# mappings, so the output is very large.
print(data)

{'src': tensor([[   1, 3295, 5969,  ...,    0,    0,    0],
        [   1,  861, 5222,  ...,    0,    0,    0],
        [   1, 6848, 6229,  ...,    0,    0,    0],
        ...,
        [   1, 5463, 5220,  ...,    0,    0,    0],
        [   1, 4436, 4455,  ...,    0,    0,    0],
        [   1, 5731, 5862,  ...,    0,    0,    0]]), 'src_lengths': tensor([10, 14, 15,  ..., 12, 14, 16]), 'tgt': tensor([[    1,  8249,  6925,  ...,     0,     0,     0],
        [    1,   280,  5713,  ...,     0,     0,     0],
        [    1, 11200,  6599,  ...,     0,     0,     0],
        ...,
        [    1,  6602,  5713,  ...,     0,     0,     0],
        [    1,  8748,  3561,  ...,     0,     0,     0],
        [    1,  6062,  4183,  ...,     0,     0,     0]]), 'tgt_lengths': tensor([14, 21, 17,  ..., 17, 21, 22]), 'src_token2id': {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3, '!': 4, "'": 5, "''": 6, "''آپ": 7, '،': 8, 'ؔ': 9, '؟': 10, 'ء': 11, 'آ': 12, 'آؤ': 13, 'آؤں': 14, 'آئ': 15, 'آئنہ': 16

In [3]:
# Show which entries the loaded file contains.
data.keys()

dict_keys(['src', 'src_lengths', 'tgt', 'tgt_lengths', 'src_token2id', 'tgt_token2id', 'merges_src', 'merges_tgt'])

In [4]:
# Display the per-sentence source lengths.
src_lengths

tensor([10, 14, 15,  ..., 12, 14, 16])

In [5]:
# Display the per-sentence target lengths.
tgt_lengths

tensor([14, 21, 17,  ..., 17, 21, 22])

In [7]:
# Display every value stored in the loaded dict. Like print(data), this
# produces a very large output because it includes the vocab mappings.
data.values()

dict_values([tensor([[   1, 3295, 5969,  ...,    0,    0,    0],
        [   1,  861, 5222,  ...,    0,    0,    0],
        [   1, 6848, 6229,  ...,    0,    0,    0],
        ...,
        [   1, 5463, 5220,  ...,    0,    0,    0],
        [   1, 4436, 4455,  ...,    0,    0,    0],
        [   1, 5731, 5862,  ...,    0,    0,    0]]), tensor([10, 14, 15,  ..., 12, 14, 16]), tensor([[    1,  8249,  6925,  ...,     0,     0,     0],
        [    1,   280,  5713,  ...,     0,     0,     0],
        [    1, 11200,  6599,  ...,     0,     0,     0],
        ...,
        [    1,  6602,  5713,  ...,     0,     0,     0],
        [    1,  8748,  3561,  ...,     0,     0,     0],
        [    1,  6062,  4183,  ...,     0,     0,     0]]), tensor([14, 21, 17,  ..., 17, 21, 22]), {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3, '!': 4, "'": 5, "''": 6, "''آپ": 7, '،': 8, 'ؔ': 9, '؟': 10, 'ء': 11, 'آ': 12, 'آؤ': 13, 'آؤں': 14, 'آئ': 15, 'آئنہ': 16, 'آئنے': 17, 'آئی': 18, 'آئین': 19, 'آئینہ': 20