In [2]:
import json


# =================================================
# Special Tokens (Never Merged)
# =================================================

EOS = "\u241E"   # ␞
EOP = "\u241D"   # ␝
EOT = "\u0003"   # End of Text

SPECIAL_TOKENS = [EOS, EOP, EOT]


# =================================================
# Train BPE
# =================================================

def train_bpe(texts, max_vocab_size=250):

    # ---------------------------------------------
    # Build Initial Character Vocab
    # ---------------------------------------------

    chars = set("".join(texts))

    for t in SPECIAL_TOKENS:
        chars.add(t)

    chars = sorted(chars)

    char2id = {c: i for i, c in enumerate(chars)}
    id2char = {i: c for c, i in char2id.items()}


    # Convert texts to ID sequences
    sequences = [
        [char2id[c] for c in text]
        for text in texts
    ]


    merges = {}        # new_id -> (a,b)
    next_id = len(char2id)

    special_ids = {char2id[t] for t in SPECIAL_TOKENS}


    print("Initial vocab size:", next_id)


    # ---------------------------------------------
    # BPE Loop
    # ---------------------------------------------

    while next_id < max_vocab_size:

        pair_counts = {}


        # Count pairs
        for seq in sequences:

            for pair in zip(seq, seq[1:]):

                # Skip special tokens
                if pair[0] in special_ids or pair[1] in special_ids:
                    continue

                pair_counts[pair] = pair_counts.get(pair, 0) + 1


        if not pair_counts:
            break


        # Most frequent pair
        best_pair = max(pair_counts, key=pair_counts.get)

        new_id = next_id
        next_id += 1

        merges[new_id] = best_pair


        # Replace in sequences
        new_sequences = []


        for seq in sequences:

            new_seq = []
            i = 0

            while i < len(seq):

                if i + 1 < len(seq) and (seq[i], seq[i+1]) == best_pair:

                    new_seq.append(new_id)
                    i += 2

                else:

                    new_seq.append(seq[i])
                    i += 1


            new_sequences.append(new_seq)


        sequences = new_sequences


    print("Final vocab size:", next_id)

    return merges, char2id, id2char


# =================================================
# Save / Load
# =================================================

def save_tokenizer(merges, char2id, id2char):

    # Save merges
    with open("merges.json", "w", encoding="utf-8") as f:

        json.dump(
            {str(k): v for k, v in merges.items()},
            f,
            indent=2
        )


    # Save vocab
    with open("vocab.json", "w", encoding="utf-8") as f:

        json.dump(
            char2id,
            f,
            ensure_ascii=False,
            indent=2
        )


    # Save reverse vocab
    with open("id2char.json", "w", encoding="utf-8") as f:

        json.dump(
            {str(k): v for k, v in id2char.items()},
            f,
            ensure_ascii=False,
            indent=2
        )


    print("Tokenizer saved.")


def load_tokenizer():

    # Load merges
    with open("merges.json", "r", encoding="utf-8") as f:

        merges_raw = json.load(f)

        merges = {
            int(k): tuple(v)
            for k, v in merges_raw.items()
        }


    # Load vocab
    with open("vocab.json", "r", encoding="utf-8") as f:

        char2id = json.load(f)


    # Load reverse vocab
    with open("id2char.json", "r", encoding="utf-8") as f:

        raw = json.load(f)

        id2char = {
            int(k): v
            for k, v in raw.items()
        }


    print("Tokenizer loaded.")

    return merges, char2id, id2char


# =================================================
# Tokenizer Class
# =================================================

class BPETokenizer:


    def __init__(self, merges, char2id, id2char):

        self.merges = merges
        self.char2id = char2id
        self.id2char = id2char


    # ---------------------------------------------
    # Encode
    # ---------------------------------------------

    def encode(self, text):

        tokens = []


        # Char → ID
        for c in text:

            if c in self.char2id:
                tokens.append(self.char2id[c])
            else:
                tokens.append(self.char2id[" "])


        changed = True


        # Apply merges
        while changed:

            changed = False
            new_tokens = []

            i = 0


            while i < len(tokens):

                merged = False


                for new_id, (a, b) in self.merges.items():

                    if i + 1 < len(tokens) and tokens[i] == a and tokens[i+1] == b:

                        new_tokens.append(new_id)

                        i += 2
                        merged = True
                        changed = True
                        break


                if not merged:

                    new_tokens.append(tokens[i])
                    i += 1


            tokens = new_tokens


        return tokens


    # ---------------------------------------------
    # Decode
    # ---------------------------------------------

    def decode(self, tokens):


        def expand(t):

            if t in self.merges:

                a, b = self.merges[t]
                return expand(a) + expand(b)

            return self.id2char[t]


        text = ""

        for t in tokens:
            text += expand(t)


        return text


# =================================================
# Main
# =================================================

if __name__ == "__main__":


    # ---------------------------------------------
    # Train (Run ONCE)
    # ---------------------------------------------

    with open("urdu_stories_cleaned.json", "r", encoding="utf-8") as f:

        data = json.load(f)

    texts = [item["content"] for item in data]


    merges, char2id, id2char = train_bpe(texts, 250)

    save_tokenizer(merges, char2id, id2char)



    # ---------------------------------------------
    # Load (Run EVERY TIME)
    # ---------------------------------------------

    merges, char2id, id2char = load_tokenizer()

    tokenizer = BPETokenizer(merges, char2id, id2char)



    # ---------------------------------------------
    # Test
    # ---------------------------------------------

    example = "ایک دفعہ کا ذکر ہے۔" + EOS

    encoded = tokenizer.encode(example)
    decoded = tokenizer.decode(encoded)


    print("\nOriginal:", example)
    print("Tokens:", encoded)
    print("Decoded:", decoded)


Initial vocab size: 65
Final vocab size: 250
Tokenizer saved.
Tokenizer loaded.

Original: ایک دفعہ کا ذکر ہے۔␞
Tokens: [118, 23, 35, 33, 72, 54, 69, 24, 54, 68, 58, 60, 61, 64]
Decoded: ایک دفعہ کا ذکر ہے۔␞


In [3]:
# Load tokenizer
merges, char2id, id2char = load_tokenizer()
tokenizer = BPETokenizer(merges, char2id, id2char)

# Load stories
with open("urdu_stories_cleaned.json", "r", encoding="utf-8") as f:
    stories = json.load(f)

print(f"Tokenizing {len(stories)} stories...")

# Tokenize all stories
tokenized_corpus = []
for i, story in enumerate(stories):
    token_ids = tokenizer.encode(story["content"])
    tokenized_corpus.append({
        "title": story["title"],
        "url": story["url"],
        "tokens": token_ids,
        "num_tokens": len(token_ids)
    })
    
    if (i + 1) % 50 == 0:
        print(f"  Tokenized {i + 1}/{len(stories)} stories...")

# Save tokenized corpus
with open("tokenized_corpus.json", "w", encoding="utf-8") as f:
    json.dump(tokenized_corpus, f, indent=2)

print(f"\nSaved tokenized corpus to 'tokenized_corpus.json'")
print(f"Total stories: {len(tokenized_corpus)}")
print(f"\nSample tokenized story:")
print(f"  Title: {tokenized_corpus[0]['title']}")
print(f"  Tokens (first 20): {tokenized_corpus[0]['tokens'][:20]}")
print(f"  Total tokens: {tokenized_corpus[0]['num_tokens']}")

Tokenizer loaded.
Tokenizing 200 stories...
  Tokenized 50/200 stories...
  Tokenized 100/200 stories...
  Tokenized 150/200 stories...
  Tokenized 200/200 stories...

Saved tokenized corpus to 'tokenized_corpus.json'
Total stories: 200

Sample tokenized story:
  Title: Billi Sher Ki Khala Hai - Article No. 2920
  Tokens (first 20): [28, 59, 68, 20, 210, 101, 125, 71, 12, 149, 90, 54, 68, 228, 78, 69, 54, 72, 212, 17]
  Total tokens: 1314
