In [1]:
import re
import json
from collections import Counter, defaultdict

In [2]:
def load_dataset(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        text=f.read()
    
    text=re.sub(r'<EOS>|<EOP>|<EOT>', '', text)
    text=re.sub(r'\s+', ' ', text).strip()
    return text

dataset=load_dataset('../scrapping/dataset_clean.txt')
print(f"Dataset loaded: {len(dataset)} characters\n")

Dataset loaded: 2289812 characters



In [3]:
def get_pair_frequencies(word_freqs):
    pairs=defaultdict(int)
    for word, freq in word_freqs.items():
        chars=word.split()
        for i in range(len(chars) - 1):
            pairs[(chars[i], chars[i + 1])] += freq
    return pairs

In [4]:
def merge_pair(pair, word_freqs):
    new_word_freqs={}
    old_pair=' '.join(pair)
    new_token=''.join(pair)
    
    for word, freq in word_freqs.items():
        new_word=word.replace(old_pair, new_token)
        new_word_freqs[new_word]=freq
    
    return new_word_freqs

In [5]:
def train_bpe(text, vocab_size=250):
    # Get all unique characters
    all_chars=set(text.replace(' ', ''))
    print(f"Unique characters: {len(all_chars)}")
    
    # Split text into words and add spaces between characters
    word_freqs=Counter()
    for word in text.split():
        word_with_spaces=' '.join(word)
        word_freqs[word_with_spaces] += 1
    
    # Calculate how many merges we need
    num_merges=vocab_size - len(all_chars) - 2  # -2 for space and <UNK>
    merges=[]
    
    # Perform merges
    for i in range(num_merges):
        pairs=get_pair_frequencies(word_freqs)
        if not pairs:
            break
        
        # Find most frequent pair and merge it
        best_pair=max(pairs, key=pairs.get)
        word_freqs=merge_pair(best_pair, word_freqs)
        merges.append(best_pair)
        
        if (i + 1) % 10 == 0:
            print(f"Merge {i + 1}: {best_pair[0]} + {best_pair[1]} -> {''.join(best_pair)}")
    
    # Build vocabulary
    vocab=set(all_chars)  # All characters
    vocab.add(' ')  # Space
    for pair in merges:
        vocab.add(''.join(pair))  # Merged tokens
    
    # Create token->ID mapping
    vocab_dict={token: idx for idx, token in enumerate(sorted(vocab))}
    vocab_dict['<UNK>']=len(vocab_dict)
    
    print(f"\nVocabulary size: {len(vocab_dict)}")
    return vocab_dict, merges

In [6]:
def encode(text, vocab, merges):
    token_ids=[]
    
    for word_idx, word in enumerate(text.split()):
        # Start with individual characters
        tokens=list(word)
        
        # Apply each merge in order
        for pair in merges:
            new_tokens=[]
            i=0
            while i < len(tokens):
                if i < len(tokens) - 1 and tokens[i] == pair[0] and tokens[i + 1] == pair[1]:
                    new_tokens.append(''.join(pair))
                    i += 2
                else:
                    new_tokens.append(tokens[i])
                    i += 1
            tokens=new_tokens
        
        # Convert tokens to IDs
        for token in tokens:
            token_ids.append(vocab.get(token, vocab['<UNK>']))
        
        # Add space between words
        if word_idx < len(text.split()) - 1:
            token_ids.append(vocab[' '])
    
    return token_ids

In [7]:
def decode(token_ids, vocab):
    # Reverse mapping: ID->token
    id_to_token={idx: token for token, idx in vocab.items()}
    
    # Convert IDs to tokens and join
    tokens=[id_to_token.get(idx, '<UNK>') for idx in token_ids]
    return ''.join(tokens)

In [8]:
def save_tokenizer(vocab, merges, filepath):
    data={
        'vocab': vocab,
        'merges': [(a, b) for a, b in merges]
    }
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"Saved to {filepath}")

In [9]:
def load_tokenizer(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        data=json.load(f)
    
    vocab=data['vocab']
    merges=[tuple(pair) for pair in data['merges']]
    print(f"Loaded from {filepath}")
    return vocab, merges

In [10]:
# TRAIN
vocab, merges=train_bpe(dataset, vocab_size=250)
save_tokenizer(vocab, merges, 'urdu_bpe_tokenizer.json')

# TEST

# Test 1
test_text="یہ ایک ٹیسٹ ہے"
print(f"\nTest 1:")
print(f"Original: {test_text}")

encoded=encode(test_text, vocab, merges)
print(f"Encoded: {encoded}")

decoded=decode(encoded, vocab)
print(f"Decoded: {decoded}")
print(f"Match: {test_text == decoded} ✓\n")

# Test 2
sample=dataset[:100]
print(f"Test 2:")
print(f"Original: {sample}")

encoded=encode(sample, vocab, merges)
print(f"Encoded ({len(encoded)} tokens): {encoded[:20]}...")

decoded=decode(encoded, vocab)
print(f"Decoded: {decoded}")
print(f"Match: {sample == decoded} ✓\n")

# Show vocabulary
print("Vocabulary samples (first 20):")
for token, idx in list(vocab.items())[:20]:
    print(f"  {idx}: '{token}'")

Unique characters: 60
Merge 10: ی + ا -> یا
Merge 20: ا + ر -> ار
Merge 30: ک + ھ -> کھ
Merge 40: ا + یک -> ایک
Merge 50: ر + ی -> ری
Merge 60: ج + ھ -> جھ
Merge 70: خ + و -> خو
Merge 80: س + و -> سو
Merge 90: د + ا -> دا
Merge 100: ر + ہا -> رہا
Merge 110: و + ن -> ون
Merge 120: چ + ی -> چی
Merge 130: و + ا -> وا
Merge 140: ڑ + ے -> ڑے
Merge 150: پ + و -> پو
Merge 160: طر + ف -> طرف
Merge 170: ا + ت -> ات
Merge 180: بی + ٹھ -> بیٹھ

Vocabulary size: 250
Saved to urdu_bpe_tokenizer.json

Test 1:
Original: یہ ایک ٹیسٹ ہے
Encoded: [242, 0, 34, 0, 179, 101, 176, 0, 236]
Decoded: یہ ایک ٹیسٹ ہے
Match: True ✓

Test 2:
Original: غرور کی سزا احسن ساتویں جماعت کا طالب علم تھا۔ وہ اپنے ماں باپ کے ساتھ دریا کنارے چھوٹے سے گھر میں ر
Encoded (66 tokens): [125, 90, 163, 0, 214, 0, 101, 99, 0, 19, 106, 0, 102, 56, 241, 0, 68, 140, 123, 53]...
Decoded: غرور کی سزا احسن ساتویں جماعت کا طالب علم تھا۔ وہ اپنے ماں باپ کے ساتھ دریا کنارے چھوٹے سے گھر میں ر
Match: True ✓

Vocabulary samples (first 20):
  0