### Jupyter Notebook Settings

In [None]:
# Widen the notebook container so cells use the full browser width.
# `display` and `HTML` come from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
# NOTE: this rebinds the name `display` from the function imported above to
# the `IPython.display` module. Kept as-is in case later cells rely on
# `display.<name>` access — confirm before removing.
import IPython.display as display

### Libraries

In [None]:
import os
import re

### Script to generate C++ code for the BPE vocabulary, with the </w> suffix removed and special characters escaped

In [None]:
# Path to the cleaned BPE vocabulary file that is read as input
# (processed line by line, one token per line).
vocab_file_path = "bpe_vocab_cleaned.txt"

# Path of the C++ source file that the generated array is written to.
output_file_path = "bpe_vocabulary.cpp"

# Function to escape special characters in a string for C++ string literals
def escape_special_characters(token):
    """Return *token* escaped for use inside a C++ string literal.

    Backslashes, double quotes, single quotes, newlines, carriage returns
    and tabs are replaced by their backslash escape sequences. Afterwards
    every ``<space>`` placeholder is turned into a literal space character.
    """
    # Each character is mapped exactly once, so backslashes introduced by
    # one escape are never escaped a second time.
    escape_table = str.maketrans({
        '\\': '\\\\',
        '"': '\\"',
        "'": "\\'",
        '\n': '\\n',
        '\r': '\\r',
        '\t': '\\t',
    })
    escaped = token.translate(escape_table)
    return escaped.replace('<space>', ' ')

# Function to generate the C++ code
def generate_cpp_vocabulary(vocab_file_path, output_file_path):
    """Generate a C++ source file declaring the BPE vocabulary.

    Each line of the vocabulary file becomes one string literal in a
    ``const char* const bpeVocabulary[]`` array. Surrounding whitespace is
    stripped, a trailing ``</w>`` marker is removed, and the token is passed
    through ``escape_special_characters``. A ``numTokens`` constant holding
    the number of emitted entries follows the array.

    Both files are opened as UTF-8 so the result does not depend on the
    platform's default encoding.

    Args:
        vocab_file_path: Path of the vocabulary file to read.
        output_file_path: Path of the C++ file to write (overwritten).

    Raises:
        OSError: If either file cannot be opened.
    """
    end_of_word = "</w>"
    num_tokens = 0

    with open(vocab_file_path, "r", encoding="utf-8") as vocab_file, \
            open(output_file_path, "w", encoding="utf-8") as output_file:
        # Start the C++ array declaration
        output_file.write('const char* const bpeVocabulary[] = {\n')

        # Read the vocabulary file and format each line
        for line in vocab_file:
            token = line.strip()
            # Remove the </w> suffix if present
            if token.endswith(end_of_word):
                token = token[:-len(end_of_word)]
            # Escape special characters
            token = escape_special_characters(token)
            output_file.write(f'    "{token}",\n')
            # Count entries here instead of re-opening the input file, which
            # read it twice and left the second handle unclosed.
            num_tokens += 1

        # Close the array declaration
        output_file.write('};\n')

        # Add the size of the vocabulary
        output_file.write(f'\nconst int numTokens = {num_tokens};\n')

# Generate the C++ vocabulary source from the configured input file
generate_cpp_vocabulary(
    vocab_file_path=vocab_file_path,
    output_file_path=output_file_path,
)

# Report where the generated file was written
print(f"C++ vocabulary file generated: {output_file_path}")