In [None]:
import re
import collections
import time


LOCAL_DATA_FILE_PATH = "/Users/adityakumar/Desktop/college_labs/NLP/assignment 9/gu.txt" 

# We'll use 10,000 samples and 1,000 merges/vocab size for a quick test.
NUM_SAMPLES_FOR_TRAINING = 10000
NUM_MERGES_BPE = 1000
TARGET_VOCAB_SIZE_WP = 1000
# -----------------------------------


# =============================================================================
# 1. BYTE PAIR ENCODING (BPE)
# =============================================================================

def get_word_counts_bpe(corpus):
    """
    Pre-tokenizes a raw text corpus into a dictionary of word counts.
    Adds a space between characters and an end-of-word token.
    """
    word_counts = collections.Counter()
    for text in corpus:
        # This regex handles words (including Indic scripts) and punctuation
        words = re.findall(r'\w+|[^\w\s]', text)
        for word in words:
            # Add the end-of-word token </w>
            word_counts[' '.join(list(word)) + ' </w>'] += 1
    return word_counts

def get_pairs_bpe(word_counts):
    """
    Finds the frequency of all adjacent token pairs in the vocabulary.
    """
    pairs = collections.Counter()
    for word, count in word_counts.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[(symbols[i], symbols[i+1])] += count
    return pairs

def merge_pair_bpe(best_pair, word_counts):
    """
    Merges the most frequent pair in all words of the vocabulary.
    """
    new_word_counts = collections.Counter()
    new_token = ''.join(best_pair)
    
    # We need a regex to safely replace the pair
    # (?<!\S) and (?!\S) are negative look-behind/ahead for whitespace
    # This ensures we only merge whole tokens.
    pattern = r'(?<!\S)' + re.escape(best_pair[0]) + r'\s+' + re.escape(best_pair[1]) + r'(?!\S)'
    
    for word, count in word_counts.items():
        # Replace all occurrences of the pair with the new merged token
        new_word = re.sub(pattern, new_token, word)
        new_word_counts[new_word] += count
        
    return new_word_counts

def train_bpe(corpus, num_merges):
    """
    Trains a BPE tokenizer from a corpus.
    """
    print("BPE: Getting word counts...")
    # 1. Get word frequencies
    word_counts = get_word_counts_bpe(corpus)
    
    # 2. Get initial vocabulary (all unique characters)
    vocab = set()
    for word in word_counts:
        vocab.update(word.split())
    print(f"BPE: Initial vocab size: {len(vocab)}")
        
    merge_rules = []
    
    # 3. Iterate for num_merges
    for i in range(num_merges):
        # 3a. Get all adjacent pairs
        pairs = get_pairs_bpe(word_counts)
        
        if not pairs:
            print("BPE: No more pairs to merge.")
            break
            
        # 3b. Find the most frequent pair
        best_pair = max(pairs, key=pairs.get)
        
        # 3c. Merge the pair and add to vocab
        new_token = ''.join(best_pair)
        vocab.add(new_token)
        merge_rules.append(best_pair)
        
        # 3d. Update all word representations
        word_counts = merge_pair_bpe(best_pair, word_counts)
        
        if (i + 1) % 100 == 0 or i == 0:
            print(f"BPE: Merge step {i+1}/{num_merges} - Best pair: {best_pair} -> {new_token}")

    print("BPE: Training complete.")
    return vocab, merge_rules

def tokenize_bpe(text, merge_rules):
    """
    Tokenizes new text using the learned BPE merge rules.
    """
    # 1. Pre-tokenize into words (using the same regex as training)
    words = re.findall(r'\w+|[^\w\s]', text)
    
    tokenized_output = []
    for word in words:
        # 2. Represent word as chars + </w>
        tokens = list(word) + ['</w>']
        
        # 3. Apply all merge rules in order
        for pair in merge_rules:
            new_tokens = []
            i = 0
            while i < len(tokens):
                # Check if the current and next token form the pair
                if i < len(tokens) - 1 and (tokens[i], tokens[i+1]) == pair:
                    new_tokens.append(''.join(pair))
                    i += 2
                else:
                    new_tokens.append(tokens[i])
                    i += 1
            tokens = new_tokens
            
        tokenized_output.extend(tokens)
        
    return tokenized_output

# =============================================================================
# 2. WORDPIECE
# =============================================================================

def get_word_counts_wp(corpus):
    """
    Pre-tokenizes corpus into word counts.
    WordPiece splits words into characters.
    """
    word_counts = collections.Counter()
    for text in corpus:
        words = re.findall(r'\w+|[^\w\s]', text)
        for word in words:
            # We just split into characters. No '</w>'
            word_counts[' '.join(list(word))] += 1
    return word_counts

def get_stats_wp(word_counts):
    """
    Gets counts for both individual tokens and adjacent pairs.
    """
    pairs = collections.Counter()
    token_counts = collections.Counter()
    
    for word, count in word_counts.items():
        symbols = word.split()
        for i in range(len(symbols)):
            token_counts[symbols[i]] += count
            if i < len(symbols) - 1:
                pairs[(symbols[i], symbols[i+1])] += count
    return pairs, token_counts

def merge_pair_wp(best_pair, word_counts):
    """
    Merges the best pair in all words (same as BPE's merge).
    """
    new_word_counts = collections.Counter()
    new_token = ''.join(best_pair)
    
    pattern = r'(?<!\S)' + re.escape(best_pair[0]) + r'\s+' + re.escape(best_pair[1]) + r'(?!\S)'
    
    for word, count in word_counts.items():
        new_word = re.sub(pattern, new_token, word)
        new_word_counts[new_word] += count
        
    return new_word_counts

def train_wordpiece(corpus, vocab_size):
    """
    Trains a WordPiece tokenizer from a corpus.
    """
    print("WordPiece: Getting word counts...")
    # 1. Get word frequencies
    word_counts = get_word_counts_wp(corpus)
    
    # 2. Get initial vocabulary (all unique characters)
    vocab = set()
    for word in word_counts:
        vocab.update(word.split())
    
    # Add special tokens
    special_tokens = ['[UNK]', '[CLS]', '[SEP]', '[PAD]', '[MASK]']
    for token in special_tokens:
        if token not in vocab:
            vocab.add(token)
            
    print(f"WordPiece: Initial vocab size: {len(vocab)}")
    
    # We iterate until we reach the target vocab size
    num_merges = vocab_size - len(vocab)
    if num_merges <= 0:
        print(f"WordPiece: Target vocab size ({vocab_size}) is smaller than initial vocab size ({len(vocab)}). No merges needed.")
        return vocab
    
    print(f"WordPiece: Will perform {num_merges} merges.")
    
    for i in range(num_merges):
        # 3a. Get pair and individual token counts
        pairs, token_counts = get_stats_wp(word_counts)
        
        if not pairs:
            print("WordPiece: No more pairs to merge.")
            break

        # 3b. Find the best pair based on likelihood score
        best_pair = ('', '')
        max_score = -1.0
        
        for pair, count in pairs.items():
            token_a, token_b = pair
            # Ensure we don't divide by zero
            if token_counts[token_a] > 0 and token_counts[token_b] > 0:
                # Score = freq(pair) / (freq(A) * freq(B))
                score = count / (token_counts[token_a] * token_counts[token_b])
            else:
                score = 0.0
            
            if score > max_score:
                max_score = score
                best_pair = pair
        
        if max_score == -1.0:
            print("WordPiece: No valid pairs found (max_score = -1).")
            break

        # 3c. Merge the pair and add to vocab
        new_token = ''.join(best_pair)
        vocab.add(new_token)
        
        # 3d. Update all word representations
        word_counts = merge_pair_wp(best_pair, word_counts)
        
        if (i + 1) % 100 == 0 or i == 0:
            print(f"WordPiece: Merge {i+1}/{num_merges} - Vocab size: {len(vocab)} - Best pair: {best_pair} -> {new_token}")
            
    print("WordPiece: Training complete.")
    return vocab

def tokenize_wordpiece(text, vocab, unk_token="[UNK]"):
    """
    Tokenizes new text using the learned WordPiece vocabulary.
    Uses a greedy, longest-match-first approach.
    """
    
    # 1. Pre-tokenize into words
    words = re.findall(r'\w+|[^\w\s]', text)
    
    output_tokens = []
    
    for word in words:
        # If the whole word is in our vocab, great!
        if word in vocab:
            output_tokens.append(word)
            continue
            
        word_tokens = []
        start = 0
        while start < len(word):
            # Find the longest subword in vocab that matches from `start`
            end = len(word)
            best_subword = None
            
            # Greedily check from longest possible (end) to shortest (start+1)
            while end > start:
                sub = word[start:end]
                # Note: WordPiece vocab doesn't store '##' prefixes
                if sub in vocab:
                    best_subword = sub
                    break # Found the longest valid subword
                end -= 1
            
            # Case 1: No subword found (not even a single character)
            if best_subword is None:
                # This means the character at word[start] is not in our vocab
                # This is a true "unknown"
                word_tokens = [unk_token] # The whole word is un-tokenizable
                break # Stop processing this word
            
            # Case 2: We found a subword
            if start == 0:
                # It's the first piece of the word
                word_tokens.append(best_subword)
            else:
                # It's a subsequent piece, add the '##' prefix
                word_tokens.append("##" + best_subword)
            
            # Move our `start` pointer to the end of the subword we just found
            start += len(best_subword)
        
        output_tokens.extend(word_tokens)
        
    return output_tokens

# =============================================================================
# 3. MAIN EXECUTION
# =============================================================================

if __name__ == "__main__":
    
    # --- 1. Load Data ---
    print(f"Loading first {NUM_SAMPLES_FOR_TRAINING} lines from {LOCAL_DATA_FILE_PATH}...")
    corpus = []
    try:
        with open(LOCAL_DATA_FILE_PATH, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i >= NUM_SAMPLES_FOR_TRAINING:
                    break
                # .strip() removes leading/trailing whitespace and newlines
                corpus.append(line.strip())
                
    except FileNotFoundError:
        print(f"Error: The file '{LOCAL_DATA_FILE_PATH}' was not found.")
        print("Please update the LOCAL_DATA_FILE_PATH variable at the top of the script.")
        exit()
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")
        exit()

    print(f"Loaded {len(corpus)} lines.")
    if not corpus:
        print("Corpus is empty! Check your file and file path. Exiting.")
        exit()

    # --- 2. Train BPE ---
    print("\n" + "="*30)
    print("--- Starting BPE Training ---")
    print(f"Target merges: {NUM_MERGES_BPE}")
    print("="*30)
    start_time = time.time()
    bpe_vocab, bpe_rules = train_bpe(corpus, NUM_MERGES_BPE)
    end_time = time.time()
    print(f"BPE training took {end_time - start_time:.2f} seconds.")

    # --- 3. Train WordPiece ---
    print("\n" + "="*30)
    print("--- Starting WordPiece Training ---")
    print(f"Target vocab size: {TARGET_VOCAB_SIZE_WP}")
    print("="*30)
    start_time = time.time()
    wp_vocab = train_wordpiece(corpus, TARGET_VOCAB_SIZE_WP)
    end_time = time.time()
    print(f"WordPiece training took {end_time - start_time:.2f} seconds.")


    # --- 4. Test Tokenization ---
    test_sentence = "ркЧрлБркЬрк░рк╛ркдркирлБркВ рк╕рлМркерлА ркорлЛркЯрлБркВ рк╢рк╣рлЗрк░ ркЕркоркжрк╛рк╡рк╛ркж ркЫрлЗ." # "Ahmedabad is the largest city in Gujarat."
    print(f"\nTest Sentence: {test_sentence}")

    # Test BPE
    print("\n--- BPE Tokenization Test ---")
    bpe_tokens = tokenize_bpe(test_sentence, bpe_rules)
    print(f"BPE Tokens: {bpe_tokens}")

    # Test WordPiece
    print("\n--- WordPiece Tokenization Test ---")
    wp_tokens = tokenize_wordpiece(test_sentence, wp_vocab)
    print(f"WordPiece Tokens: {wp_tokens}")

    # --- 5. Save Vocabularies (Optional but Recommended) ---
    print("\nSaving vocab files...")
    
    # Save BPE vocab and rules
    with open("bpe_vocab.txt", "w", encoding="utf-8") as f:
        for token in sorted(list(bpe_vocab)):
            f.write(token + "\n")
            
    with open("bpe_merges.txt", "w", encoding="utf-8") as f:
        for pair in bpe_rules:
            f.write(f"{pair[0]} {pair[1]}\n")

    # Save WordPiece vocab
    with open("wordpiece_vocab.txt", "w", encoding="utf-8") as f:
        # WordPiece vocabs are often sorted by length, then alphabetically
        # But for this from-scratch version, alphabetical is fine.
        for token in sorted(list(wp_vocab)):
            f.write(token + "\n")
            
    print("Saved 'bpe_vocab.txt', 'bpe_merges.txt', and 'wordpiece_vocab.txt'")

Loading first 10000 lines from /Users/adityakumar/Desktop/college_labs/NLP/assignment 9/gu.txt...
Loaded 10000 lines.

--- Starting BPE Training ---
Target merges: 1000
BPE: Getting word counts...
BPE: Initial vocab size: 195
BPE: Merge step 1/1000 - Best pair: ('рк╛', '</w>') -> рк╛</w>
BPE: Merge step 100/1000 - Best pair: ('ркЕ', '</w>') -> ркЕ</w>
BPE: Merge step 200/1000 - Best pair: ('ркШ', 'рк░</w>') -> ркШрк░</w>
BPE: Merge step 300/1000 - Best pair: ('ркП', 'рк╕') -> ркПрк╕
BPE: Merge step 400/1000 - Best pair: ('ркЙ', 'рк▓</w>') -> ркЙрк▓</w>
BPE: Merge step 500/1000 - Best pair: ('ркЯ', 'рк▓рко</w>') -> ркЯрк▓рко</w>
BPE: Merge step 600/1000 - Best pair: ('рк╖', 'рке</w>') -> рк╖рке</w>
BPE: Merge step 700/1000 - Best pair: ('ркП', 'ркХрко</w>') -> ркПркХрко</w>
BPE: Merge step 800/1000 - Best pair: ('1', '8') -> 18
BPE: Merge step 900/1000 - Best pair: ('8', '0</w>') -> 80</w>
BPE: Merge step 1000/1000 - Best pair: ('i', 'n</w>') -> in</w>
BPE: Training complete.
BPE traini

<h1>on 1lakh sentences

In [4]:
import re
import collections
import time

# --- тЪЩя╕П Settings ---
# ЁЯСЗ **Double-check this path is correct**
LOCAL_DATA_FILE_PATH = "/Users/adityakumar/Desktop/college_labs/NLP/assignment 9/gu.txt" 

# --- Settings for 1 Lakh (1 Hour) Run ---
# This will run on 100,000 lines and perform 32,000 merges.
# Expect it to take ~1 hour.
NUM_SAMPLES_FOR_TRAINING = 100000  # 1 Lakh lines
NUM_MERGES_BPE = 32000
TARGET_VOCAB_SIZE_WP = 32000
# -----------------------------------


# =============================================================================
# 1. BYTE PAIR ENCODING (BPE)
# =============================================================================

def get_word_counts_bpe(corpus):
    """
    Pre-tokenizes a raw text corpus into a dictionary of word counts.
    Adds a space between characters and an end-of-word token.
    """
    word_counts = collections.Counter()
    for text in corpus:
        # This regex handles words (including Indic scripts) and punctuation
        words = re.findall(r'\w+|[^\w\s]', text)
        for word in words:
            # Add the end-of-word token </w>
            word_counts[' '.join(list(word)) + ' </w>'] += 1
    return word_counts

def get_pairs_bpe(word_counts):
    """
    Finds the frequency of all adjacent token pairs in the vocabulary.
    """
    pairs = collections.Counter()
    for word, count in word_counts.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[(symbols[i], symbols[i+1])] += count
    return pairs

def merge_pair_bpe(best_pair, word_counts):
    """
    Merges the most frequent pair in all words of the vocabulary.
    """
    new_word_counts = collections.Counter()
    new_token = ''.join(best_pair)
    
    # We need a regex to safely replace the pair
    # (?<!\S) and (?!\S) are negative look-behind/ahead for whitespace
    # This ensures we only merge whole tokens.
    pattern = r'(?<!\S)' + re.escape(best_pair[0]) + r'\s+' + re.escape(best_pair[1]) + r'(?!\S)'
    
    for word, count in word_counts.items():
        # Replace all occurrences of the pair with the new merged token
        new_word = re.sub(pattern, new_token, word)
        new_word_counts[new_word] += count
        
    return new_word_counts

def train_bpe(corpus, num_merges):
    """
    Trains a BPE tokenizer from a corpus.
    """
    print("BPE: Getting word counts...")
    # 1. Get word frequencies
    word_counts = get_word_counts_bpe(corpus)
    
    # 2. Get initial vocabulary (all unique characters)
    vocab = set()
    for word in word_counts:
        vocab.update(word.split())
    print(f"BPE: Initial vocab size: {len(vocab)}")
        
    merge_rules = []
    
    # 3. Iterate for num_merges
    for i in range(num_merges):
        # 3a. Get all adjacent pairs
        pairs = get_pairs_bpe(word_counts)
        
        if not pairs:
            print(f"BPE: No more pairs to merge. Stopped at step {i+1}.")
            break
            
        # 3b. Find the most frequent pair
        best_pair = max(pairs, key=pairs.get)
        
        # 3c. Merge the pair and add to vocab
        new_token = ''.join(best_pair)
        vocab.add(new_token)
        merge_rules.append(best_pair)
        
        # 3d. Update all word representations
        word_counts = merge_pair_bpe(best_pair, word_counts)
        
        if (i + 1) % 500 == 0 or i == 0:
            print(f"BPE: Merge step {i+1}/{num_merges} - Best pair: {best_pair} -> {new_token}")

    print("BPE: Training complete.")
    return vocab, merge_rules

def tokenize_bpe(text, merge_rules):
    """
    Tokenizes new text using the learned BPE merge rules.
    """
    # 1. Pre-tokenize into words (using the same regex as training)
    words = re.findall(r'\w+|[^\w\s]', text)
    
    tokenized_output = []
    for word in words:
        # 2. Represent word as chars + </w>
        tokens = list(word) + ['</w>']
        
        # 3. Apply all merge rules in order
        for pair in merge_rules:
            new_tokens = []
            i = 0
            while i < len(tokens):
                # Check if the current and next token form the pair
                if i < len(tokens) - 1 and (tokens[i], tokens[i+1]) == pair:
                    new_tokens.append(''.join(pair))
                    i += 2
                else:
                    new_tokens.append(tokens[i])
                    i += 1
            tokens = new_tokens
            
        tokenized_output.extend(tokens)
        
    return tokenized_output

# =============================================================================
# 2. WORDPIECE
# =============================================================================

def get_word_counts_wp(corpus):
    """
    Pre-tokenizes corpus into word counts.
    WordPiece splits words into characters.
    """
    word_counts = collections.Counter()
    for text in corpus:
        words = re.findall(r'\w+|[^\w\s]', text)
        for word in words:
            # We just split into characters. No '</w>'
            word_counts[' '.join(list(word))] += 1
    return word_counts

def get_stats_wp(word_counts):
    """
    Gets counts for both individual tokens and adjacent pairs.
    """
    pairs = collections.Counter()
    token_counts = collections.Counter()
    
    for word, count in word_counts.items():
        symbols = word.split()
        for i in range(len(symbols)):
            token_counts[symbols[i]] += count
            if i < len(symbols) - 1:
                pairs[(symbols[i], symbols[i+1])] += count
    return pairs, token_counts

def merge_pair_wp(best_pair, word_counts):
    """
    Merges the best pair in all words (same as BPE's merge).
    """
    new_word_counts = collections.Counter()
    new_token = ''.join(best_pair)
    
    pattern = r'(?<!\S)' + re.escape(best_pair[0]) + r'\s+' + re.escape(best_pair[1]) + r'(?!\S)'
    
    for word, count in word_counts.items():
        new_word = re.sub(pattern, new_token, word)
        new_word_counts[new_word] += count
        
    return new_word_counts

def train_wordpiece(corpus, vocab_size):
    """
    Trains a WordPiece tokenizer from a corpus.
    """
    print("WordPiece: Getting word counts...")
    # 1. Get word frequencies
    word_counts = get_word_counts_wp(corpus)
    
    # 2. Get initial vocabulary (all unique characters)
    vocab = set()
    for word in word_counts:
        vocab.update(word.split())
    
    # Add special tokens
    special_tokens = ['[UNK]', '[CLS]', '[SEP]', '[PAD]', '[MASK]']
    for token in special_tokens:
        if token not in vocab:
            vocab.add(token)
            
    print(f"WordPiece: Initial vocab size: {len(vocab)}")
    
    # We iterate until we reach the target vocab size
    num_merges = vocab_size - len(vocab)
    if num_merges <= 0:
        print(f"WordPiece: Target vocab size ({vocab_size}) is smaller than initial vocab size ({len(vocab)}). No merges needed.")
        return vocab
    
    print(f"WordPiece: Will perform {num_merges} merges.")
    
    for i in range(num_merges):
        # 3a. Get pair and individual token counts
        pairs, token_counts = get_stats_wp(word_counts)
        
        if not pairs:
            print(f"WordPiece: No more pairs to merge. Stopped at merge {i+1}.")
            break

        # 3b. Find the best pair based on likelihood score
        best_pair = ('', '')
        max_score = -1.0
        
        for pair, count in pairs.items():
            token_a, token_b = pair
            # Ensure we don't divide by zero
            if token_counts[token_a] > 0 and token_counts[token_b] > 0:
                # Score = freq(pair) / (freq(A) * freq(B))
                score = count / (token_counts[token_a] * token_counts[token_b])
            else:
                score = 0.0
            
            if score > max_score:
                max_score = score
                best_pair = pair
        
        # *** THIS IS THE CORRECTED LINE ***
        if max_score == -1.0: 
            print(f"WordPiece: No valid pairs found (max_score = -1). Stopped at merge {i+1}.")
            break

        # 3c. Merge the pair and add to vocab
        new_token = ''.join(best_pair)
        vocab.add(new_token)
        
        # 3d. Update all word representations
        word_counts = merge_pair_wp(best_pair, word_counts)
        
        if (i + 1) % 500 == 0 or i == 0:
            print(f"WordPiece: Merge {i+1}/{num_merges} - Vocab size: {len(vocab)} - Best pair: {best_pair} -> {new_token}")
            
    print("WordPiece: Training complete.")
    return vocab

def tokenize_wordpiece(text, vocab, unk_token="[UNK]"):
    """
    Tokenizes new text using the learned WordPiece vocabulary.
    Uses a greedy, longest-match-first approach.
    """
    
    # 1. Pre-tokenize into words
    words = re.findall(r'\w+|[^\w\s]', text)
    
    output_tokens = []
    
    for word in words:
        # If the whole word is in our vocab, great!
        if word in vocab:
            output_tokens.append(word)
            continue
            
        word_tokens = []
        start = 0
        while start < len(word):
            # Find the longest subword in vocab that matches from `start`
            end = len(word)
            best_subword = None
            
            # Greedily check from longest possible (end) to shortest (start+1)
            while end > start:
                sub = word[start:end]
                # Note: WordPiece vocab doesn't store '##' prefixes
                if sub in vocab:
                    best_subword = sub
                    break # Found the longest valid subword
                end -= 1
            
            # Case 1: No subword found (not even a single character)
            if best_subword is None:
                # This means the character at word[start] is not in our vocab
                # This is a true "unknown"
                word_tokens = [unk_token] # The whole word is un-tokenizable
                break # Stop processing this word
            
            # Case 2: We found a subword
            if start == 0:
                # It's the first piece of the word
                word_tokens.append(best_subword)
            else:
                # It's a subsequent piece, add the '##' prefix
                word_tokens.append("##" + best_subword)
            
            # Move our `start` pointer to the end of the subword we just found
            start += len(best_subword)
        
        output_tokens.extend(word_tokens)
        
    return output_tokens

# =============================================================================
# 3. MAIN EXECUTION
# =============================================================================

if __name__ == "__main__":
    
    # --- 1. Load Data ---
    print(f"Loading first {NUM_SAMPLES_FOR_TRAINING} lines from {LOCAL_DATA_FILE_PATH}...")
    corpus = []
    try:
        with open(LOCAL_DATA_FILE_PATH, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i >= NUM_SAMPLES_FOR_TRAINING:
                    break
                # .strip() removes leading/trailing whitespace and newlines
                corpus.append(line.strip())
                
    except FileNotFoundError:
        print(f"Error: The file '{LOCAL_DATA_FILE_PATH}' was not found.")
        print("Please update the LOCAL_DATA_FILE_PATH variable at the top of the script.")
        exit()
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")
        exit()

    print(f"Loaded {len(corpus)} lines.")
    if not corpus:
        print("Corpus is empty! Check your file and file path. Exiting.")
        exit()

    # --- 2. Train BPE ---
    print("\n" + "="*30)
    print("--- Starting BPE Training ---")
    print(f"Target merges: {NUM_MERGES_BPE}")
    print("="*30)
    start_time = time.time()
    bpe_vocab, bpe_rules = train_bpe(corpus, NUM_MERGES_BPE)
    end_time = time.time()
    print(f"BPE training took {end_time - start_time:.2f} seconds.")

    # --- 3. Train WordPiece ---
    print("\n" + "="*30)
    print("--- Starting WordPiece Training ---")
    print(f"Target vocab size: {TARGET_VOCAB_SIZE_WP}")
    print("="*30)
    start_time = time.time()
    wp_vocab = train_wordpiece(corpus, TARGET_VOCAB_SIZE_WP)
    end_time = time.time()
    print(f"WordPiece training took {end_time - start_time:.2f} seconds.")


    # --- 4. Test Tokenization ---
    test_sentence = "ркЧрлБркЬрк░рк╛ркдркирлБркВ рк╕рлМркерлА ркорлЛркЯрлБркВ рк╢рк╣рлЗрк░ ркЕркоркжрк╛рк╡рк╛ркж ркЫрлЗ." # "Ahmedabad is the largest city in Gujarat."
    print(f"\nTest Sentence: {test_sentence}")

    # Test BPE
    print("\n--- BPE Tokenization Test ---")
    bpe_tokens = tokenize_bpe(test_sentence, bpe_rules)
    print(f"BPE Tokens: {bpe_tokens}")

    # Test WordPiece
    print("\n--- WordPiece Tokenization Test ---")
    wp_tokens = tokenize_wordpiece(test_sentence, wp_vocab)
    print(f"WordPiece Tokens: {wp_tokens}")

    # --- 5. Save Vocabularies (Optional but Recommended) ---
    print("\nSaving vocab files for 32k run...")
    
    # Save BPE vocab and rules
    with open("bpe_vocab_32k.txt", "w", encoding="utf-8") as f:
        for token in sorted(list(bpe_vocab)):
            f.write(token + "\n")
            
    with open("bpe_merges_32k.txt", "w", encoding="utf-8") as f:
        for pair in bpe_rules:
            f.write(f"{pair[0]} {pair[1]}\n")

    # Save WordPiece vocab
    with open("wordpiece_vocab_32k.txt", "w", encoding="utf-8") as f:
        for token in sorted(list(wp_vocab)):
            f.write(token + "\n")
            
    print("Saved 'bpe_vocab_32k.txt', 'bpe_merges_32k.txt', and 'wordpiece_vocab_32k.txt'")

Loading first 100000 lines from /Users/adityakumar/Desktop/college_labs/NLP/assignment 9/gu.txt...
Loaded 100000 lines.

--- Starting BPE Training ---
Target merges: 32000
BPE: Getting word counts...
BPE: Initial vocab size: 339
BPE: Merge step 1/32000 - Best pair: ('рк╛', '</w>') -> рк╛</w>
BPE: Merge step 500/32000 - Best pair: ('рк╕рко', 'ркпрко</w>') -> рк╕ркоркпрко</w>
BPE: Merge step 1000/32000 - Best pair: ('ркн', 'ркЯ</w>') -> ркнркЯ</w>
BPE: Merge step 1500/32000 - Best pair: ('ркЖ', 'ркЬрк░</w>') -> ркЖркЬрк░</w>
BPE: Merge step 2000/32000 - Best pair: ('рк╡', 'ркиркЧрк░рки</w>') -> рк╡ркиркЧрк░рки</w>
BPE: Merge step 2500/32000 - Best pair: ('ркЯ', 'рк╕рки</w>') -> ркЯрк╕рки</w>
BPE: Merge step 3000/32000 - Best pair: ('ркп', 'ркХрк╡</w>') -> ркпркХрк╡</w>
BPE: Merge step 3500/32000 - Best pair: ('en', 'd') -> end
BPE: Merge step 4000/32000 - Best pair: ('ркЕркЧ', 'рк╡ркбркд</w>') -> ркЕркЧрк╡ркбркд</w>
BPE: Merge step 4500/32000 - Best pair: ('3', '60</w>') -> 360</w>
BPE: 