In [3]:
import json

# =================================================
# Special Tokens (Never Merged)
# =================================================

# Reserved marker characters. train_bpe skips every pair that contains one of
# them, so each stays a single-character token in the learned vocabulary.
# NOTE(review): EOS/EOP presumably mean end-of-sentence / end-of-paragraph —
# confirm against the corpus preparation step.
EOS, EOP, EOT = (
    "\u241E",  # ␞
    "\u241D",  # ␝
    "\u0003",  # End of Text
)

# Order matters only for readability; membership is what the trainer uses.
SPECIAL_TOKENS = [EOS, EOP, EOT]


# =================================================
# Train BPE
# =================================================

def _count_pairs(sequences, special_ids):
    """Count adjacent id pairs, ignoring any pair that touches a special id."""
    pair_counts = {}
    for seq in sequences:
        for pair in zip(seq, seq[1:]):
            if pair[0] in special_ids or pair[1] in special_ids:
                continue
            pair_counts[pair] = pair_counts.get(pair, 0) + 1
    return pair_counts


def _merge_pair(seq, pair, new_id):
    """Return a copy of ``seq`` with each left-to-right occurrence of ``pair`` replaced by ``new_id``."""
    merged = []
    i = 0
    last = len(seq) - 1
    while i < len(seq):
        if i < last and (seq[i], seq[i + 1]) == pair:
            merged.append(new_id)
            i += 2
        else:
            merged.append(seq[i])
            i += 1
    return merged


def train_bpe(texts, max_vocab_size=250, special_tokens=None):
    """Learn BPE merges over the characters of ``texts``.

    The initial vocabulary is every distinct character in ``texts`` plus the
    special tokens, numbered in sorted order. Each round counts adjacent id
    pairs across all texts, merges the most frequent pair into a new id, and
    rewrites the sequences, until the vocabulary reaches ``max_vocab_size``
    or no mergeable pair is left.

    Args:
        texts: Iterable of strings to train on (it is iterated twice, so it
            must be re-iterable, e.g. a list).
        max_vocab_size: Upper bound on the number of ids. If the initial
            character vocabulary is already this large, no merges are made.
        special_tokens: Tokens that must never take part in a merge.
            Defaults to the module-level ``SPECIAL_TOKENS``. Texts are split
            per character, so only single-character tokens can actually be
            recognised inside a text.

    Returns:
        A tuple ``(merges, char2id, id2char)``:
        ``merges`` is the ordered list of ``(new_id, (left_id, right_id))``;
        ``char2id`` maps token strings to ids; ``id2char`` maps ids to token
        strings (merged tokens map to their full expanded string).
    """
    if special_tokens is None:
        special_tokens = SPECIAL_TOKENS

    # ---------------------------------------------
    # Initial Character Vocabulary
    # ---------------------------------------------

    chars = set("".join(texts))
    chars.update(special_tokens)

    char2id = {c: i for i, c in enumerate(sorted(chars))}
    id2char = {i: c for c, i in char2id.items()}

    sequences = [[char2id[c] for c in text] for text in texts]

    merges = []  # ordered list of (new_id, (a, b))
    next_id = len(char2id)

    special_ids = {char2id[t] for t in special_tokens}

    print("Initial vocab size:", next_id)

    # ---------------------------------------------
    # BPE Loop
    # ---------------------------------------------

    while next_id < max_vocab_size:
        pair_counts = _count_pairs(sequences, special_ids)
        if not pair_counts:
            # Nothing left to merge (empty corpus or only special-token pairs).
            break

        # On ties, max() keeps the pair that was encountered first.
        best_pair = max(pair_counts, key=pair_counts.get)

        new_id = next_id
        next_id += 1

        left, right = best_pair
        token_str = id2char[left] + id2char[right]

        char2id[token_str] = new_id
        id2char[new_id] = token_str

        # Merge order is what the encoder later replays, so keep it as a list.
        merges.append((new_id, best_pair))

        sequences = [_merge_pair(seq, best_pair, new_id) for seq in sequences]

    print("Final vocab size:", next_id)

    return merges, char2id, id2char


# =================================================
# Save / Load
# =================================================

def save_tokenizer(merges, char2id, id2char):
    """Write the tokenizer to three JSON files in the current directory.

    ``merges.json`` holds the ordered merge list as ``[new_id, [a, b]]``
    entries, ``vocab.json`` holds ``char2id`` as-is, and ``id2char.json``
    holds ``id2char`` with its integer keys converted to strings (JSON object
    keys must be strings). Existing files are overwritten.
    """

    def write_json(filename, payload, **dump_options):
        # All three files share the same encoding and indentation.
        with open(filename, "w", encoding="utf-8") as handle:
            json.dump(payload, handle, indent=2, **dump_options)

    # Merge order is significant, so it is stored as a list rather than a map.
    merge_rows = [(token_id, list(parts)) for token_id, parts in merges]
    write_json("merges.json", merge_rows)

    # ensure_ascii=False keeps the token text human-readable in the file.
    write_json("vocab.json", char2id, ensure_ascii=False)

    id_table = {str(token_id): text for token_id, text in id2char.items()}
    write_json("id2char.json", id_table, ensure_ascii=False)

    print("Tokenizer saved.")


def load_tokenizer():
    """Read the tokenizer back from the three JSON files in the current directory.

    Returns:
        ``(merges, char2id, id2char)`` where ``merges`` is a list of
        ``(new_id, (a, b))`` tuples in file order, ``char2id`` is the mapping
        stored in ``vocab.json``, and ``id2char`` has its keys converted back
        to ``int`` (JSON stores them as strings).
    """
    with open("merges.json", "r", encoding="utf-8") as merges_file:
        merges = []
        for entry in json.load(merges_file):
            token_id, parts = entry
            # JSON arrays come back as lists; restore the tuple form.
            merges.append((int(token_id), tuple(parts)))

    with open("vocab.json", "r", encoding="utf-8") as vocab_file:
        char2id = json.load(vocab_file)

    with open("id2char.json", "r", encoding="utf-8") as table_file:
        id2char = {}
        for key, text in json.load(table_file).items():
            id2char[int(key)] = text

    print("Tokenizer loaded.")

    return merges, char2id, id2char


# =================================================
# Tokenizer Class
# =================================================

class BPETokenizer:
    """Encode text to BPE token ids and decode ids back to text.

    Args:
        merges: Ordered list of ``(new_id, (a, b))`` entries; ``encode``
            replays them in this order.
        char2id: Mapping from token string to id; ``encode`` uses it to look
            up single characters.
        id2char: Mapping from id to token string; ``decode`` uses it for ids
            that are not the result of a merge.
    """

    def __init__(self, merges, char2id, id2char):
        self.merges = merges
        self.char2id = char2id
        self.id2char = id2char
        # new_id -> (a, b), used by decode to undo merges.
        self.merge_dict = {new_id: pair for new_id, pair in merges}

    # ---------------------------------------------
    # Encode (Oldest → Newest)
    # ---------------------------------------------

    def encode(self, text):
        """Return the list of token ids for ``text``.

        Characters missing from the vocabulary are replaced by the id of the
        space character, or by id 0 when the vocabulary has no space; that
        substitution is lossy and is not reversed by ``decode``.
        """
        unknown_id = self.char2id.get(" ", 0)
        tokens = [self.char2id.get(c, unknown_id) for c in text]

        # Replay the merges in training order; each pass scans left to right.
        for new_id, (a, b) in self.merges:
            new_tokens = []
            i = 0
            last = len(tokens) - 1

            while i < len(tokens):
                if i < last and tokens[i] == a and tokens[i + 1] == b:
                    new_tokens.append(new_id)
                    i += 2
                else:
                    new_tokens.append(tokens[i])
                    i += 1

            tokens = new_tokens

        return tokens

    # ---------------------------------------------
    # Decode (Newest → Oldest via explicit stack)
    # ---------------------------------------------

    def decode(self, tokens):
        """Return the text for ``tokens`` by expanding merged ids to base ids.

        Expansion uses an explicit stack rather than recursion, so merge
        chains deeper than the interpreter's recursion limit still decode.

        Raises:
            KeyError: If an id is neither a merge result nor in ``id2char``.
        """
        pieces = []

        for token in tokens:
            pending = [token]
            while pending:
                current = pending.pop()
                pair = self.merge_dict.get(current)
                if pair is None:
                    pieces.append(self.id2char[current])
                else:
                    left, right = pair
                    # Push right first so the left half is expanded first.
                    pending.append(right)
                    pending.append(left)

        return "".join(pieces)


# =================================================
# Main
# =================================================

if __name__ == "__main__":

    # Each corpus entry is expected to carry its story text under "content".
    with open("urdu_stories_cleaned.json", "r", encoding="utf-8") as f:
        data = json.load(f)
    texts = [item["content"] for item in data]

    # Train, then save and reload so the demo below runs on the persisted files.
    merges, char2id, id2char = train_bpe(texts, max_vocab_size=250)
    save_tokenizer(merges, char2id, id2char)
    merges, char2id, id2char = load_tokenizer()
    tokenizer = BPETokenizer(merges, char2id, id2char)

    # Round-trip a short sentence that ends with a special token.
    example = "ایک دفعہ کا ذکر ہے۔" + EOS
    encoded = tokenizer.encode(example)
    decoded = tokenizer.decode(encoded)

    for label, value in (
        ("\nOriginal:", example),
        ("Tokens:", encoded),
        ("Decoded:", decoded),
    ):
        print(label, value)

Initial vocab size: 69
Final vocab size: 250
Tokenizer saved.
Tokenizer loaded.

Original: ایک دفعہ کا ذکر ہے۔␞
Tokens: [121, 26, 38, 36, 61, 124, 27, 57, 72, 215, 64, 67]
Decoded: ایک دفعہ کا ذکر ہے۔␞


In [4]:
# Load tokenizer
merges, char2id, id2char = load_tokenizer()
tokenizer = BPETokenizer(merges, char2id, id2char)

# Load stories
with open("urdu_stories_cleaned.json", "r", encoding="utf-8") as f:
    stories = json.load(f)

print(f"Tokenizing {len(stories)} stories...")

# Tokenize all stories, keeping each story's title and url next to its tokens
tokenized_corpus = []
for i, story in enumerate(stories):
    token_ids = tokenizer.encode(story["content"])
    tokenized_corpus.append({
        "title": story["title"],
        "url": story["url"],
        "tokens": token_ids,
        "num_tokens": len(token_ids)
    })

    # Progress report every 50 stories
    if (i + 1) % 50 == 0:
        print(f"  Tokenized {i + 1}/{len(stories)} stories...")

# Save tokenized corpus
with open("tokenized_corpus.json", "w", encoding="utf-8") as f:
    json.dump(tokenized_corpus, f, indent=2)

print("\nSaved tokenized corpus to 'tokenized_corpus.json'")
print(f"Total stories: {len(tokenized_corpus)}")

# An empty input file yields an empty corpus; there is no first story to show then.
if tokenized_corpus:
    sample = tokenized_corpus[0]
    print("\nSample tokenized story:")
    print(f"  Title: {sample['title']}")
    print(f"  Tokens (first 20): {sample['tokens'][:20]}")
    print(f"  Total tokens: {sample['num_tokens']}")

Tokenizer loaded.
Tokenizing 500 stories...
  Tokenized 50/500 stories...
  Tokenized 100/500 stories...
  Tokenized 150/500 stories...
  Tokenized 200/500 stories...
  Tokenized 250/500 stories...
  Tokenized 300/500 stories...
  Tokenized 350/500 stories...
  Tokenized 400/500 stories...
  Tokenized 450/500 stories...
  Tokenized 500/500 stories...

Saved tokenized corpus to 'tokenized_corpus.json'
Total stories: 500

Sample tokenized story:
  Title: Allah Ka Dost - Article No. 2923
  Tokens (first 20): [84, 121, 82, 195, 107, 90, 75, 91, 129, 28, 73, 41, 33, 41, 43, 103, 82, 40, 137, 116]
  Total tokens: 2085
