In [7]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


## Using Pretrained Tokenizers (GPT-2 and BERT)


### Purpose
Demonstrates how real-world tokenizers (GPT-2 and BERT) handle tokenization and normalization, providing a baseline for comparison with the custom BPE implementation.

### Functions

#### `demonstrate_pretrained_tokenizers()`

#### Tokenization (GPT-2)
Uses BPE to split text into tokens, showing subword units for rare words (e.g., videoconference → vide, ocon, ference).

#### Encoding
Converts tokens to vocabulary IDs, a step used in NLP models.

#### Normalization (BERT)
Applies case folding (lowercase), aligning with Section 2.6.

#### Pre-Tokenization
Shows how text is split into words before BPE or WordPiece, as described in Section 2.5.2.

### Relation to Chapter

#### Section 2.5.2 (BPE)
GPT-2's tokenizer uses BPE, breaking words into subwords to handle unknown words, as shown with videoconference.

#### Section 2.6 (Normalization)
BERT's case folding (Hello → hello) demonstrates normalization for generalization, as discussed in the chapter.

#### Section 2.7 (Sentence Segmentation)
Implicitly handled by treating punctuation (., ,) as separate tokens, aligning with the chapter's discussion of punctuation as boundary markers.

In [36]:
# Import required libraries
from transformers import AutoTokenizer
from collections import defaultdict



In [37]:
# Small sample corpus intended for the BPE training demo.
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
    "Thisit",  # run-together "This it"; reused as the misspelling in the edit-distance section
]

In [42]:
# Load the pretrained GPT-2 tokenizer (uses BPE).
# NOTE(review): from_pretrained("gpt2") presumably fetches the tokenizer files on
# first use and caches them — confirm before running offline.
gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [43]:
# Define a test prompt shared by the GPT-2 and BERT cells.
# Note the two consecutive spaces before "office": GPT-2 reports the extra one as a
# separate whitespace-only token, while BERT's pre-tokenizer simply skips it.
prompt = "Hello, this is John Doe. I am doing a videoconference from my  office"

In [44]:
# Split the prompt into GPT-2 subword tokens (token strings, not IDs yet)
gpt2_tokens = gpt2_tokenizer.tokenize(prompt)
print(f"GPT-2 Tokenization: {gpt2_tokens}")

GPT-2 Tokenization: ['Hello', ',', 'Ä this', 'Ä is', 'Ä John', 'Ä Doe', '.', 'Ä I', 'Ä am', 'Ä doing', 'Ä a', 'Ä vide', 'ocon', 'ference', 'Ä from', 'Ä my', 'Ä ', 'Ä office']


Note: 'Ġ' (rendered as 'Ä ' in the output above because of a character-encoding glitch) is GPT-2's marker for a preceding space; 'videoconference' is split into the subwords 'vide', 'ocon', and 'ference'.

In [45]:
# Encode the prompt straight to vocabulary IDs (one ID per token shown above).
# return_tensors="pt" requests a PyTorch tensor, so torch must be importable here.
gpt2_encoded = gpt2_tokenizer.encode(prompt, return_tensors="pt")
print("GPT-2 Encoded IDs:", gpt2_encoded)

GPT-2 Encoded IDs: tensor([[15496,    11,   428,   318,  1757, 31780,    13,   314,   716,  1804,
           257, 18784, 36221,  4288,   422,   616,   220,  2607]])


In [46]:
# Load the pretrained BERT tokenizer (uses WordPiece, a subword scheme similar to BPE).
# The "uncased" checkpoint is the one whose normalizer lowercases its input, as the
# normalization cell below demonstrates.
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [47]:
# Run only BERT's normalization stage on the raw prompt (case folding, no tokenization)
normalized_prompt = (
    bert_tokenizer.backend_tokenizer
    .normalizer
    .normalize_str(prompt)
)
print(f"BERT Normalized Prompt: {normalized_prompt}")

BERT Normalized Prompt: hello, this is john doe. i am doing a videoconference from my  office


Note: Converts to lowercase, demonstrating case folding (Section 2.6)

In [48]:
# List the attributes exposed by each tokenizer's normalizer object.
# dir() already returns a list, so it is printed directly.
# NOTE(review): the GPT-2 listing contains only dunder names (including __bool__),
# which suggests gpt2_tokenizer.backend_tokenizer.normalizer is None, i.e. GPT-2
# applies no normalization step — confirm.
print("GPT-2 Normalizer Methods:", dir(gpt2_tokenizer.backend_tokenizer.normalizer))
print("BERT Normalizer Methods:", dir(bert_tokenizer.backend_tokenizer.normalizer))

GPT-2 Normalizer Methods: ['__bool__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__']
BERT Normalizer Methods: ['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', 'clean_text', 'custom', 'handle_chinese_chars', 'lowercase', 'normalize', 'normalize_str', 'strip_accents']


Note: BERT includes 'lowercase', 'strip_accents', etc., showing advanced normalization (Section 2.6)
    

In [49]:
# Run only the pre-tokenization stage of each tokenizer on the raw prompt.
# Each result is a list of (piece, (start, end)) tuples with character offsets.
# BERT drops the whitespace; GPT-2 keeps it attached to the piece that follows.
bert_pre_tokens = (
    bert_tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(prompt)
)
gpt2_pre_tokens = (
    gpt2_tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(prompt)
)
print(f"BERT Pre-Tokenization: {bert_pre_tokens}")
print(f"GPT-2 Pre-Tokenization: {gpt2_pre_tokens}")

BERT Pre-Tokenization: [('Hello', (0, 5)), (',', (5, 6)), ('this', (7, 11)), ('is', (12, 14)), ('John', (15, 19)), ('Doe', (20, 23)), ('.', (23, 24)), ('I', (25, 26)), ('am', (27, 29)), ('doing', (30, 35)), ('a', (36, 37)), ('videoconference', (38, 53)), ('from', (54, 58)), ('my', (59, 61)), ('office', (63, 69))]
GPT-2 Pre-Tokenization: [('Hello', (0, 5)), (',', (5, 6)), ('Ä this', (6, 11)), ('Ä is', (11, 14)), ('Ä John', (14, 19)), ('Ä Doe', (19, 23)), ('.', (23, 24)), ('Ä I', (24, 26)), ('Ä am', (26, 29)), ('Ä doing', (29, 35)), ('Ä a', (35, 37)), ('Ä videoconference', (37, 53)), ('Ä from', (53, 58)), ('Ä my', (58, 61)), ('Ä ', (61, 62)), ('Ä office', (62, 69))]


In [50]:
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string

In [51]:
text = """
Contact me at crazy_coder99@gmail.com or visit my profile at http://example.com!
Iâ€™ve been running, RUNS!! He ran quicklyâ€”faster than anyone! #speedster #PythonRocks ðŸš€ðŸš€ðŸš€
"""

# STEP 1: REGULAR EXPRESSION - Extract emails and hashtags
# E-mail pattern:
#   local part - word characters plus '.', '+' and '-'
#   domain     - one or more dot-separated labels, so multi-label hosts such as
#                mail.example.co.uk are captured whole (a lazy "one dot only"
#                pattern would stop at mail.example).
EMAIL_PATTERN = re.compile(r'\b[\w.+-]+@[\w-]+(?:\.[\w-]+)+\b')
# Hashtag pattern: '#' followed by one or more word characters.
HASHTAG_PATTERN = re.compile(r'#\w+')

emails = EMAIL_PATTERN.findall(text)
hashtags = HASHTAG_PATTERN.findall(text)

print("Extracted Emails:", emails)
print("Extracted Hashtags:", hashtags)

Extracted Emails: ['crazy_coder99@gmail.com']
Extracted Hashtags: ['#speedster', '#PythonRocks']


In [52]:
# STEP 2: TOKENIZATION - Sentence and Word Tokenization
# Both tokenizers rely on the Punkt models. Fetch them here so this cell also runs
# on a fresh environment (no-op when the package is already installed).
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead — confirm
# against the installed version.
nltk.download('punkt', quiet=True)
sentences = sent_tokenize(text)
words = word_tokenize(text)

`word_tokenize` implements rule-based word tokenization, splitting on whitespace and punctuation, unlike BPE's subword approach.



In [53]:
# Inspect the sentence split: "RUNS!!" closes a sentence, and the hashtag/emoji
# tail ends up as a sentence of its own.
sentences

['\nContact me at crazy_coder99@gmail.com or visit my profile at http://example.com!',
 'Iâ€™ve been running, RUNS!!',
 'He ran quicklyâ€”faster than anyone!',
 '#speedster #PythonRocks ðŸš€ðŸš€ðŸš€']

In [54]:
# Inspect the word tokens: the e-mail address and the URL are broken apart around
# '@' and ':', and each '#' is split from its hashtag text.
words

['Contact',
 'me',
 'at',
 'crazy_coder99',
 '@',
 'gmail.com',
 'or',
 'visit',
 'my',
 'profile',
 'at',
 'http',
 ':',
 '//example.com',
 '!',
 'I',
 'â€™',
 've',
 'been',
 'running',
 ',',
 'RUNS',
 '!',
 '!',
 'He',
 'ran',
 'quicklyâ€”faster',
 'than',
 'anyone',
 '!',
 '#',
 'speedster',
 '#',
 'PythonRocks',
 'ðŸš€ðŸš€ðŸš€']

In [None]:
# Applies case folding (lowercase) and filters out punctuation.
# Output: Includes emails, URLs, and emojis, but removes commas, periods, etc.
#
# A token is dropped only when every one of its characters is ASCII punctuation.
# (`word not in string.punctuation` is a substring test against one long string,
# so multi-character tokens such as '...' or '--' would slip through.)
# Non-ASCII punctuation such as a typographic apostrophe is still kept.
ascii_punctuation = set(string.punctuation)

normalized_words = [
    word.lower()
    for word in words
    if not all(char in ascii_punctuation for char in word)
]
normalized_words

['contact',
 'me',
 'at',
 'crazy_coder99',
 'gmail.com',
 'or',
 'visit',
 'my',
 'profile',
 'at',
 'http',
 '//example.com',
 'i',
 'â€™',
 've',
 'been',
 'running',
 'runs',
 'he',
 'ran',
 'quicklyâ€”faster',
 'than',
 'anyone',
 'speedster',
 'pythonrocks',
 'ðŸš€ðŸš€ðŸš€']

In [56]:
# Stemming and Lemmatization
# WordNetLemmatizer reads the WordNet corpus; fetch it so the cell runs on a fresh
# environment (no-op when the corpus is already installed).
nltk.download('wordnet', quiet=True)

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Porter stemming strips suffixes by rule, so results need not be dictionary words
# (e.g. 'profile' -> 'profil').
stems = [stemmer.stem(word) for word in normalized_words]
# pos='v' treats every token as a verb, which is what maps 'ran' -> 'run' and
# 'been' -> 'be'; tokens WordNet does not know as verbs are returned unchanged.
lemmas = [lemmatizer.lemmatize(word, pos='v') for word in normalized_words]

print("\nLowercased Words:", normalized_words)
print("\nStemmed Words:", stems)
print("\nLemmatized Words:", lemmas)



Lowercased Words: ['contact', 'me', 'at', 'crazy_coder99', 'gmail.com', 'or', 'visit', 'my', 'profile', 'at', 'http', '//example.com', 'i', 'â€™', 've', 'been', 'running', 'runs', 'he', 'ran', 'quicklyâ€”faster', 'than', 'anyone', 'speedster', 'pythonrocks', 'ðŸš€ðŸš€ðŸš€']

Stemmed Words: ['contact', 'me', 'at', 'crazy_coder99', 'gmail.com', 'or', 'visit', 'my', 'profil', 'at', 'http', '//example.com', 'i', 'â€™', 've', 'been', 'run', 'run', 'he', 'ran', 'quicklyâ€”fast', 'than', 'anyon', 'speedster', 'pythonrock', 'ðŸš€ðŸš€ðŸš€']

Lemmatized Words: ['contact', 'me', 'at', 'crazy_coder99', 'gmail.com', 'or', 'visit', 'my', 'profile', 'at', 'http', '//example.com', 'i', 'â€™', 've', 'be', 'run', 'run', 'he', 'run', 'quicklyâ€”faster', 'than', 'anyone', 'speedster', 'pythonrocks', 'ðŸš€ðŸš€ðŸš€']


## Edit Distance

### Spelling Correction and Alignment

#### Step 1: Spelling Correction

#### Function: `correct_spelling`
Uses edit_distance to find the candidate word with the minimum edit distance to a misspelled word (e.g., Thisit → This it).

#### Relation to BPE Code
Corrects Thisit from the BPE corpus, which could be a typo.

#### Relation to Chapter
Implements Section 2.8's spelling correction example (e.g., graffe → giraffe).

#### Step 2: Alignment

#### Function: `get_edit_alignment`
Uses edit_distance_align to compute the edit distance and alignment, visualizing operations (insertions, deletions, substitutions) as in Fig. 2.14.

#### Relation to BPE Code
Shows how Thisit transforms to This it (insert space).

#### Relation to Chapter
Matches the chapter's alignment for intention → execution.

In [59]:
from nltk.metrics.distance import edit_distance, edit_distance_align

In [60]:
# Fetch the Punkt tokenizer models (reported as already up-to-date when present locally).
nltk.download('punkt')  # needed by sent_tokenize / word_tokenize

[nltk_data] Downloading package punkt to /home/musty/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [73]:
# Sanity check: "Thisit" -> "This it" needs a single insertion (the space), so the
# distance is 1 even though substitutions are priced at 2.
edit_distance("Thisit", "This it", substitution_cost=2)

1

In [61]:
# --- Step 1: Minimum Edit Distance for Spelling Correction ---
def correct_spelling(word, candidates, substitution_cost=1):
    """
    Pick the candidate closest to ``word`` by minimum edit distance.

    Relates to Section 2.8 (Minimum Edit Distance) of the book.

    Args:
        word (str): The possibly misspelled input.
        candidates (list): Known-good words to score against ``word``.
        substitution_cost (int): Cost charged per substitution, forwarded to
            ``edit_distance`` (1 for standard Levenshtein, 2 for the alternative).

    Returns:
        tuple: ``(best_candidate, distance)``. On ties, the candidate that
        appears first in ``candidates`` wins.

    Raises:
        ValueError: If ``candidates`` is empty (raised by ``min``).
    """
    def distance_of(scored_pair):
        # Rank (candidate, distance) pairs by the distance component only.
        return scored_pair[1]

    scored = (
        (option, edit_distance(word, option, substitution_cost=substitution_cost))
        for option in candidates
    )
    return min(scored, key=distance_of)

In [69]:
# Try the corpus typo against a few plausible corrections
candidates = ["This it", "This", "That", "Thus"]
misspelled = "Thisit"
corrected_word, distance = correct_spelling(
    misspelled,
    candidates,
    substitution_cost=2,
)
print(
    f"Spelling Correction for '{misspelled}':",
    f"Corrected Word: {corrected_word}, Edit Distance: {distance}",
    sep="\n",
)

Spelling Correction for 'Thisit':
Corrected Word: This it, Edit Distance: 1


In [74]:
# --- Step 2: Alignment for Visualization ---
def get_edit_alignment(source, target, substitution_cost=2):
    """
    Return the edit distance between two strings together with their alignment.

    Relates to Section 2.8 (Alignment for Minimum Edit Distance).

    Args:
        source (str): String to transform.
        target (str): String to transform ``source`` into.
        substitution_cost (int): Cost charged per substitution; forwarded to both
            ``edit_distance`` and ``edit_distance_align``.

    Returns:
        tuple: ``(distance, alignment)`` where ``alignment`` is whatever
        ``edit_distance_align`` returns (a list of ``(source_index, target_index)``
        pairs in the notebook's example).
    """
    cost_options = {"substitution_cost": substitution_cost}
    return (
        edit_distance(source, target, **cost_options),
        edit_distance_align(source, target, **cost_options),
    )

In [75]:
# Example: align 'Thisit' with 'This it' and show both the cost and the index path
distance, alignment = get_edit_alignment(
    "Thisit",
    "This it",
    substitution_cost=2,
)
print(
    "\nAlignment for 'Thisit' -> 'This it':",
    f"Edit Distance: {distance}",
    f"Alignment: {alignment}",
    sep="\n",
)


Alignment for 'Thisit' -> 'This it':
Edit Distance: 1
Alignment: [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (4, 5), (5, 6), (6, 7)]


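Note: Alignment lists (source index, target index) pairs. The step from (4, 4) to (4, 5) advances only the target index, which is the inserted space; the final pair (6, 7) matches the last 't' of both strings.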