In [1]:
# preprocess_bpe_from_scratch.py
from collections import Counter
from pathlib import Path


In [2]:
# ---------- Config ----------
ROOT = Path("CombinedData")
Urdu_FILE = ROOT / "src_normalized.txt"
Roman_FILE = ROOT / "tgt_normalized.txt" 

Urdu_VOCAB_SIZE = 512
Roman_VOCAB_SIZE = 512         
SPECIAL_TOKENS = ["<sos>","<pad>", "<eos>", "<unk>"]  



## Step 1: Reading Data from File

We start by reading the source and target text files using the `read_lines` function. This function loads the data, normalizes whitespace, and strips empty lines to prepare the corpus for further processing.

```python
Urdu_lines = read_lines(Urdu_FILE)
Roman_lines = read_lines(Roman_FILE)
```

In [3]:
def read_lines(path: Path):
    if not path.exists():
        return []
    text = path.read_text(encoding="utf-8")
    # normalize whitespace, rstrip
    lines = [' '.join(ln.split()).strip() for ln in text.splitlines() if ln.strip()]
    return lines


In [4]:
Urdu_lines = read_lines(Urdu_FILE,)
Roman_lines = read_lines(Roman_FILE)

In [5]:
# Get unique characters from source and target texts
Urdu_chars = set(''.join(Urdu_lines))
Roman_chars = set(''.join(Roman_lines))
# Remove space character from Urdu_chars and Roman_chars
Urdu_chars.discard(' ')
Roman_chars.discard(' ')
Urdu_chars.add('_')
Roman_chars.add('_')

print("Unique characters in Urdu text:", len(Urdu_chars))
print("Urdu characters:", sorted(list(Urdu_chars)))
print("Unique characters in Roman Urdu text:", len(Roman_chars))
print("Roman Urdu  characters:", sorted(list(Roman_chars)))

Unique characters in Urdu text: 55
Urdu characters: ['!', "'", '_', '،', 'ؔ', '؟', 'ء', 'آ', 'أ', 'ؤ', 'ئ', 'ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ل', 'م', 'ن', 'و', 'ً', 'ٔ', 'ٰ', 'ٹ', 'پ', 'چ', 'ڈ', 'ڑ', 'ژ', 'ک', 'گ', 'ں', 'ھ', 'ہ', 'ۂ', 'ۃ', 'ی', 'ے', 'ۓ']
Unique characters in Roman Urdu text: 32
Roman Urdu  characters: ['!', "'", ',', '-', '?', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [6]:
print("Sample Urdu_lines:")
for line in Urdu_lines[20:25]:
    print(line)

print("\nSample Roman_lines:")
for line in Roman_lines[20:25]:
    print(line)

Sample Urdu_lines:
مانگے تانگے کی قبائیں دیر تک رہتی نہیں
یار لوگوں کے لقب القاب مت دیکھا کرو
تشنگی میں لب بھگو لینا بھی کافی ہے فرازؔ
جام میں صہبا ہے یا زہراب مت دیکھا کرو
اب اور کیا کسی سے مراسم بڑھائیں ہم

Sample Roman_lines:
mange-tange ki qabaen der tak rahti nahin
yaar logon ke laqab-alqab mat dekha karo
tishnagi men lab bhigo lena bhi kaafi hai 'faraz'
jaam men sahba hai ya zahrab mat dekha karo
ab aur kya kisi se marasim badhaen ham


In [25]:
print("Urdu dataset:")
print("Number of lines:", len(Urdu_lines))
print("Max sentence length:", max(len(line) for line in Urdu_lines))
print("Min sentence length:", min(len(line) for line in Urdu_lines))

print("\nRoman dataset:")
print("Number of lines:", len(Roman_lines))
print("Max sentence length:", max(len(line) for line in Roman_lines))
print("Min sentence length:", min(len(line) for line in Roman_lines))

Urdu dataset:
Number of lines: 20856
Max sentence length: 90
Min sentence length: 12

Roman dataset:
Number of lines: 20856
Max sentence length: 102
Min sentence length: 13


## Step 2: Building word freq  from list of lines
like:
#### 'haan': 120

In [7]:
def build_word_freqs(lines):
    """Return Counter of words from list of lines."""
    freqs = Counter()
    for ln in lines:
        for w in ln.split():
            freqs[w ] += 1
    return freqs

In [8]:
Urdu_freqs = build_word_freqs(Urdu_lines)
Roman_freqs = build_word_freqs(Roman_lines)

In [9]:
Urdu_freqs.most_common(10)

[('ہے', 6743),
 ('میں', 4652),
 ('سے', 3646),
 ('کے', 2846),
 ('کی', 2758),
 ('تو', 2657),
 ('کو', 2643),
 ('نہ', 2290),
 ('ہیں', 2263),
 ('بھی', 2215)]

In [10]:
Roman_freqs.most_common(10)

[('hai', 6741),
 ('ki', 4383),
 ('se', 3646),
 ('men', 3632),
 ('ke', 2843),
 ('ko', 2637),
 ('na', 2287),
 ('hain', 2262),
 ('bhi', 2215),
 ('to', 2165)]

In [11]:
print("count of unique words in Urdu:", len(Urdu_freqs))
print("count of unique words in Roman:", len(Roman_freqs))

count of unique words in Urdu: 10303
count of unique words in Roman: 16801


## Step 3: Byte Pair Encoding (BPE) from Word Frequencies

In this step, we implement the Byte Pair Encoding (BPE) algorithm to learn subword units from the word frequency data. This technique is widely used in natural language processing tasks to create a vocabulary of subwords, which helps in handling out-of-vocabulary words and improving the efficiency of tokenization.

### Key Functions:
1. **`get_stats(vocab)`**: Computes the frequency of symbol pairs in the vocabulary.
2. **`merge_vocab(pair, vocab)`**: Merges the most frequent pair of symbols in the vocabulary.
3. **`bpe_from_wordfreq(word_freqs, num_merges)`**: Trains the BPE model by iteratively merging the most frequent symbol pairs.

### Inputs:
- `word_freqs`: A `Counter` object containing word frequencies.
- `vocab_size`: The size of vocabulary size.

### Outputs:
- `vocab`: The vocabulary after applying BPE.


### Notes:
- The `train_bpe_from_wordfreq` function initializes the vocabulary by splitting each word into its constituent characters and appending the `</w>` end-of-word marker.
- The algorithm stops early if no more pairs can be merged.
- Progress is printed at regular intervals to monitor the training process.

This step is crucial for preparing the vocabulary and merges required for encoding words into subword units in subsequent steps.
```

In [12]:
def get_stats(data): 
    """Compute frequency of symbol pairs in the vocabulary."""
    pairs = Counter()
    for word, freq in data.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[(symbols[i], symbols[i + 1])] += freq
    return pairs

def merge_char_split_data(pair, char_split_data):
    """Merge the most frequent pair in the vocabulary."""
    a, b = pair
    new_symbol = a + b
    merged_char_split_data = {}
    for word, freq in char_split_data.items():
        new_word = word.replace(' '.join(pair), new_symbol)
        merged_char_split_data[new_word] = freq
    return merged_char_split_data

def train_bpe(vocab_size, char_split_data):
    """
    Train BPE with a stopping criterion based on vocabulary size.
    Uses '_' as space marker, no </w>.
    """
    vocab = []
    
    while len(vocab) < vocab_size:
        pairs = get_stats(char_split_data)
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        vocab.append(best)
        char_split_data = merge_char_split_data(best, char_split_data)
        print(f"New pair: {best}, Vocab size: {len(vocab)}")
    
    return vocab, char_split_data



In [13]:
# Roman side → underscore at the END of each word
char_split_data_Roman = {
    ' '.join(list(word)) + ' _': freq 
    for word, freq in Roman_freqs.items()
}

# Urdu side → underscore at the BEGINNING of each word
char_split_data_Urdu = {
    '_ ' + ' '.join(list(word)): freq 
    for word, freq in Urdu_freqs.items()
}

In [14]:
print("Training BPE for Roman Urdu (target) ....")
vocab_Roman, corpus_Roman = train_bpe(
    Roman_VOCAB_SIZE - len(SPECIAL_TOKENS) - len(Roman_chars),
    char_split_data_Roman,
)

Training BPE for Roman Urdu (target) ....
New pair: ('h', 'a'), Vocab size: 1
New pair: ('i', '_'), Vocab size: 2
New pair: ('e', '_'), Vocab size: 3
New pair: ('n', '_'), Vocab size: 4
New pair: ('a', '_'), Vocab size: 5
New pair: ('a', 'r'), Vocab size: 6
New pair: ('o', '_'), Vocab size: 7
New pair: ('e', '-'), Vocab size: 8
New pair: ('-', 'e-'), Vocab size: 9
New pair: ('a', 'a'), Vocab size: 10
New pair: ('a', 'h'), Vocab size: 11
New pair: ('i', 'n_'), Vocab size: 12
New pair: ('ha', 'i_'), Vocab size: 13
New pair: ('ar', '_'), Vocab size: 14
New pair: ('m', 'a'), Vocab size: 15
New pair: ('e', 'n_'), Vocab size: 16
New pair: ('h', 'u'), Vocab size: 17
New pair: ('m', '_'), Vocab size: 18
New pair: ('a', 'b'), Vocab size: 19
New pair: ('i', 's'), Vocab size: 20
New pair: ('n', 'a_'), Vocab size: 21
New pair: ('h', 'i_'), Vocab size: 22
New pair: ('h', '_'), Vocab size: 23
New pair: ('k', 'i_'), Vocab size: 24
New pair: ('i', 'r'), Vocab size: 25
New pair: ('l', '_'), Vocab size:

In [15]:
print("Training BPE for Urdu (source) ....")
vocab_Urdu, corpus_Urdu = train_bpe(
    Urdu_VOCAB_SIZE - len(SPECIAL_TOKENS) - len(Urdu_chars),
    char_split_data_Urdu,
)

Training BPE for Urdu (source) ....
New pair: ('_', 'ک'), Vocab size: 1
New pair: ('_', 'ہ'), Vocab size: 2
New pair: ('_', 'م'), Vocab size: 3
New pair: ('ی', 'ں'), Vocab size: 4
New pair: ('_', 'ا'), Vocab size: 5
New pair: ('_', 'ب'), Vocab size: 6
New pair: ('_', 'ت'), Vocab size: 7
New pair: ('_', 'س'), Vocab size: 8
New pair: ('_', 'ن'), Vocab size: 9
New pair: ('_', 'د'), Vocab size: 10
New pair: ('_', 'ج'), Vocab size: 11
New pair: ('_ہ', 'ے'), Vocab size: 12
New pair: ('ی', 'ا'), Vocab size: 13
New pair: ('_', 'پ'), Vocab size: 14
New pair: ('_', 'گ'), Vocab size: 15
New pair: ('_ہ', 'و'), Vocab size: 16
New pair: ('_', 'و'), Vocab size: 17
New pair: ('_م', 'یں'), Vocab size: 18
New pair: ('_', 'آ'), Vocab size: 19
New pair: ('_', 'ر'), Vocab size: 20
New pair: ('_ن', 'ہ'), Vocab size: 21
New pair: ('_ک', 'و'), Vocab size: 22
New pair: ('ھ', 'ی'), Vocab size: 23
New pair: ('ا', 'ن'), Vocab size: 24
New pair: ('_س', 'ے'), Vocab size: 25
New pair: ('_ک', 'ی'), Vocab size: 26
New

In [16]:
vocab_Urdu= [a+b for a,b in vocab_Urdu] 
vocab_Roman= [a+b for a,b in vocab_Roman]

In [17]:
print("Urdu subword numebers:", len(vocab_Urdu))
print("Roman subword numbers:", len(vocab_Roman))

Urdu subword numebers: 453
Roman subword numbers: 476


In [18]:
vocab_Urdu = list(Urdu_chars) + vocab_Urdu + SPECIAL_TOKENS
vocab_Roman = list(Roman_chars) + vocab_Roman + SPECIAL_TOKENS

In [19]:
print("Urdu vocabulary size:", len(vocab_Urdu))
print("Roman vocabulary size:", len(vocab_Roman))

Urdu vocabulary size: 512
Roman vocabulary size: 512


In [20]:
def token2id(tokens):
    token2id = {}
    next_id = 0
    for tok in sorted(tokens):
        if tok in token2id:
            continue
        token2id[tok] = next_id
        next_id += 1
    return token2id

In [21]:
Token_Urdu = token2id(vocab_Urdu)
Token_Roman = token2id(vocab_Roman)

In [22]:
print(Token_Urdu  )

{'!': 0, "'": 1, '<eos>': 2, '<pad>': 3, '<sos>': 4, '<unk>': 5, '_': 6, '_آ': 7, '_آئی': 8, '_آئے': 9, '_آب': 10, '_آتا': 11, '_آتے': 12, '_آج': 13, '_آخ': 14, '_آخر': 15, '_آدم': 16, '_آش': 17, '_آنکھ': 18, '_آنکھوں': 19, '_آپ': 20, '_آگ': 21, '_آہ': 22, '_آیا': 23, '_ا': 24, '_اب': 25, '_ابھی': 26, '_اح': 27, '_اس': 28, '_اسے': 29, '_انت': 30, '_اور': 31, '_اٹ': 32, '_اٹھا': 33, '_اپ': 34, '_اپنا': 35, '_اپنی': 36, '_اپنے': 37, '_اک': 38, '_اگر': 39, '_اہل': 40, '_ایس': 41, '_ایسے': 42, '_ایک': 43, '_اے': 44, '_ب': 45, '_با': 46, '_بات': 47, '_بد': 48, '_بر': 49, '_بزم': 50, '_بس': 51, '_بعد': 52, '_بن': 53, '_بو': 54, '_بچ': 55, '_بڑ': 56, '_بھ': 57, '_بھر': 58, '_بھی': 59, '_بہ': 60, '_بہار': 61, '_بہت': 62, '_بی': 63, '_بیٹ': 64, '_بے': 65, '_ت': 66, '_تجھ': 67, '_تر': 68, '_ترا': 69, '_تری': 70, '_ترے': 71, '_تم': 72, '_تمہ': 73, '_تمہیں': 74, '_تنہا': 75, '_تو': 76, '_تک': 77, '_تھا': 78, '_تھی': 79, '_تھے': 80, '_ج': 81, '_جا': 82, '_جائیں': 83, '_جائے': 84, '_جب': 85, '_جس': 

In [23]:
print(Token_Roman  )

{'!': 0, "'": 1, "'_": 2, "'asa": 3, "'asad": 4, "'asad'_": 5, "'ghali": 6, "'ghalib'_": 7, ',': 8, '-': 9, '-e-': 10, '-e-a': 11, '-o-': 12, '<eos>': 13, '<pad>': 14, '<sos>': 15, '<unk>': 16, '?': 17, '_': 18, 'a': 19, 'a_': 20, 'aa': 21, 'aad': 22, 'aadmi_': 23, 'aae_': 24, 'aage_': 25, 'aaj_': 26, 'aalam_': 27, 'aankh_': 28, 'aap_': 29, 'aata_': 30, 'aate_': 31, 'aati_': 32, 'aaya_': 33, 'ab': 34, 'ab_': 35, 'ach': 36, 'achch': 37, 'ada_': 38, 'af': 39, 'agar_': 40, 'ah': 41, 'ahl-e-': 42, 'ai_': 43, 'aisa_': 44, 'aise_': 45, 'aisi_': 46, 'ak': 47, 'an': 48, 'anda': 49, 'ankhen_': 50, 'ankhon_': 51, 'ap': 52, 'apna_': 53, 'apne_': 54, 'apni_': 55, 'ar': 56, 'ar_': 57, 'as': 58, 'ash': 59, 'asman_': 60, 'aur_': 61, 'az': 62, 'b': 63, "b'_": 64, 'b-e-': 65, 'b_': 66, 'ba': 67, 'ba-': 68, 'baad_': 69, 'baaqi_': 70, 'baat_': 71, 'bad': 72, 'bahut_': 73, 'baithe_': 74, 'bat_': 75, 'baten_': 76, 'baz': 77, 'be': 78, 'be-': 79, 'bha': 80, 'bhi_': 81, 'bhu': 82, 'bi': 83, 'bo': 84, 'bu': 8

In [24]:
import json

with open("vocab_Urdu.json", "w", encoding="utf-8") as f:
    json.dump(Token_Urdu, f, ensure_ascii=False, indent=2)

with open("vocab_Roman.json", "w", encoding="utf-8") as f:
    json.dump(Token_Roman, f, ensure_ascii=False, indent=2)