# Training a new tokenizer from an old one

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
# Install required libraries for training tokenizers and working with datasets
!uv pip install datasets evaluate transformers[sentencepiece]
!apt install git-lfs

You will need to set up Git: adapt your email and name in the following cell.

In [None]:
# Configure Git credentials for pushing to Hugging Face Hub (required for model sharing)
!git config --global user.email "you@example.com"
!git config --global user.name "Your Name"

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

In [None]:
# Login to Hugging Face Hub to enable pushing models/tokenizers
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# Load the CodeSearchNet dataset - contains Python code functions for training
# This dataset will be used to create a specialized tokenizer for code
from datasets import load_dataset

# This can take a few minutes to load, so grab a coffee or tea while you wait!
raw_datasets = load_dataset("code_search_net", "python")

In [None]:
# Examine the structure of our training dataset
# Shows features available and the number of Python function examples
raw_datasets["train"]

In [None]:
# Look at an example function from the dataset
# The 'whole_func_string' contains complete Python functions with docstrings
print(raw_datasets["train"][123456]["whole_func_string"])

In [None]:
# WARNING: Don't load entire dataset into memory at once!
# This would create a list containing ALL function strings, consuming too much RAM
# Don't uncomment the following line unless your dataset is small!
# training_corpus = [raw_datasets["train"][i: i + 1000]["whole_func_string"] for i in range(0, len(raw_datasets["train"]), 1000)]

In [None]:
# Create a generator to efficiently stream data in batches of 1000 functions
# This avoids loading the entire dataset into memory at once
training_corpus = (
    raw_datasets["train"][i : i + 1000]["whole_func_string"]
    for i in range(0, len(raw_datasets["train"]), 1000)
)

In [None]:
# Generators are single-use: once iterated to the end they produce nothing more.
gen = (number for number in range(10))
# The two list() calls are evaluated left to right, so the first one drains the
# generator and the second one finds it empty. Each result goes on its own line:
#   [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
#   []
print(list(gen), list(gen), sep="\n")

In [None]:
# Better approach: Create a function that returns a fresh generator each time
# This allows us to iterate through the data multiple times during training
def get_training_corpus(batch_size=1000):
    """Return a fresh generator over the training split's function strings.

    Each item produced is the ``"whole_func_string"`` column of one slice of
    ``raw_datasets["train"]`` covering up to ``batch_size`` consecutive rows.
    Calling the function again gives a new, unconsumed generator.

    Args:
        batch_size: Number of rows per slice. Defaults to 1000, the value
            that used to be hard-coded.

    Returns:
        A generator expression; no slice is taken until it is iterated.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently produce no batches.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    return (
        raw_datasets["train"][i : i + batch_size]["whole_func_string"]
        for i in range(0, len(raw_datasets["train"]), batch_size)
    )


training_corpus = get_training_corpus()

In [None]:
# Alternative implementation: More explicit generator function
# Yields batches of function strings for tokenizer training
def get_training_corpus(batch_size=1000):
    """Yield the training split's function strings one batch at a time.

    Each yielded item is the ``"whole_func_string"`` column of one slice of
    ``raw_datasets["train"]`` covering up to ``batch_size`` consecutive rows.
    Every call creates a new generator, so the corpus can be walked again
    simply by calling the function again.

    Args:
        batch_size: Number of rows per slice. Defaults to 1000, the value
            that used to be hard-coded.

    Yields:
        The ``"whole_func_string"`` value of each successive slice.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Because this is a
            generator function, the error surfaces on the first iteration,
            not at call time.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently produce no batches.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    dataset = raw_datasets["train"]
    for start_idx in range(0, len(dataset), batch_size):
        samples = dataset[start_idx : start_idx + batch_size]
        yield samples["whole_func_string"]

In [None]:
# Load the base GPT-2 tokenizer that we'll adapt for Python code
# GPT-2 was trained on general text, not code, so it's not optimal for programming languages
from transformers import AutoTokenizer

old_tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [None]:
# Test how the original GPT-2 tokenizer handles Python code
# Notice how it splits "numbers" into multiple tokens and handles indentation poorly
example = '''def add_numbers(a, b):
    """Add the two numbers `a` and `b`."""
    return a + b'''

tokens = old_tokenizer.tokenize(example)
tokens

In [None]:
# Train a new tokenizer specialized for Python code.
# A fresh generator is requested here instead of reusing the `training_corpus` object:
# a generator can only be consumed once, so re-running this cell with the old object
# would train on an already-exhausted (empty) corpus.
# 52000 is the new vocabulary size - larger than GPT-2's 50257 to include more code-specific tokens
tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(), vocab_size=52000)

In [None]:
# Test the new tokenizer on the same Python code example
# Notice improved tokenization: "numbers" is now a single token, better indentation handling
tokens = tokenizer.tokenize(example)
tokens

In [None]:
# Compare token efficiency: new tokenizer uses fewer tokens for the same code
# Fewer tokens = more efficient processing and better context understanding
print(len(tokens))  # New tokenizer: 27 tokens
print(len(old_tokenizer.tokenize(example)))  # Old tokenizer: 36 tokens

In [None]:
# Test on a more complex Python class example
# Notice how the new tokenizer better handles Python-specific patterns like class definitions,
# method names (__init__, __call__), and common libraries (torch)
# The snippet is only handed to the tokenizer as text; this cell never executes it.
# NOTE(review): inside the snippet, `__call__` reads `self.weights` while `__init__` assigns
# `self.weight`. That is harmless for tokenization, but the names must agree if the snippet
# is ever reused as runnable code.
example = """class LinearLayer():
    def __init__(self, input_size, output_size):
        self.weight = torch.randn(input_size, output_size)
        self.bias = torch.zeros(output_size)

    def __call__(self, x):
        return x @ self.weights + self.bias
    """
tokenizer.tokenize(example)

In [None]:
# Save the new tokenizer locally for future use
# This creates a folder with all necessary tokenizer files
tokenizer.save_pretrained("code-search-net-tokenizer")

In [None]:
# Login again if needed before pushing to Hub
# Some authentication tokens may have expired during the training process
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# Upload the tokenizer to Hugging Face Hub for sharing and reuse
# This makes it available for others to download and use in their projects
tokenizer.push_to_hub("code-search-net-tokenizer")

In [None]:
# Load a tokenizer from the Hugging Face Hub by its repository id.
# NOTE: the id below points at the "huggingface-course" namespace, which is presumably not
# the repository this notebook pushed (the earlier push_to_hub call names no namespace).
# To check your own upload, replace "huggingface-course" with your actual namespace.
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")