#Exploring tokenizers
Copyright 2023-2024, Denis Rothman

**September 02, 2024 update**
-Removed token from GitHub retrieval function
-Improved Roberta-base tokenizer download




#Installing libraries

In [2]:
#Hugging Face Transformers
!pip install transformers



In [3]:
#Printing tabular data in Python
!pip install tabulate



In [4]:
#Natural Language Toolkit
!pip install nltk



# Word and sentence tokenizers

In [5]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer, \
    TreebankWordTokenizer, WhitespaceTokenizer, PunktSentenceTokenizer, \
    WordPunctTokenizer, MWETokenizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Sentence Tokenization

In [6]:
# Sentence Tokenization
text = "This is a sentence. This is another one."
sentences = sent_tokenize(text)
print("Sentence Tokenization:")
print(sentences)
print()

Sentence Tokenization:
['This is a sentence.', 'This is another one.']



## Word Tokenization

In [7]:
# Word Tokenization
sentence = "This sentence contains several words."
words = word_tokenize(sentence)
print("Word Tokenization:")
print(words)
print()

Word Tokenization:
['This', 'sentence', 'contains', 'several', 'words', '.']



## Regular Expression Tokenization

In [8]:
# Regular Expression Tokenization
tokenizer = RegexpTokenizer(r'\w+')
text = "Let's see how to tokenize a sentence."
tokens = tokenizer.tokenize(text)
print("Regular Expression Tokenization:")
print(tokens)
print()

Regular Expression Tokenization:
['Let', 's', 'see', 'how', 'to', 'tokenize', 'a', 'sentence']



## Treebank Tokenization

In [9]:
# Treebank Tokenization
tokenizer = TreebankWordTokenizer()
text = "There aren't that many tokenizers."
tokens = tokenizer.tokenize(text)
print("Treebank Tokenization:")
print(tokens)
print()

Treebank Tokenization:
['There', 'are', "n't", 'that', 'many', 'tokenizers', '.']



## White Space Tokenization

In [10]:
# White Space Tokenizationtokenizer
from nltk.tokenize import WhitespaceTokenizer
tokenizer=WhitespaceTokenizer()
text = "Tokenize this sequence of words using white space. There aren't many words."
tokens = tokenizer.tokenize(text)
print("White Space Tokenization:")
print(tokens)
print()

White Space Tokenization:
['Tokenize', 'this', 'sequence', 'of', 'words', 'using', 'white', 'space.', 'There', "aren't", 'many', 'words.']



## Punkt Sentence Tokenization

In [11]:
# Punkt Sentence Tokenization
tokenizer = PunktSentenceTokenizer()
text = "A tokenizer can be trained. Many tokenizers aren't trained."
sentences = tokenizer.tokenize(text)
print("Punkt Sentence Tokenization:")
print(sentences)
print()

Punkt Sentence Tokenization:
['A tokenizer can be trained.', "Many tokenizers aren't trained."]



## Word Punctuation Tokenization

In [12]:
# Word Punctuation Tokenization
tokenizer = WordPunctTokenizer()
text = "They won a prize! They were overjoyed."
tokens = tokenizer.tokenize(text)
print("Word Punctuation Tokenization:")
print(tokens)
print()

Word Punctuation Tokenization:
['They', 'won', 'a', 'prize', '!', 'They', 'were', 'overjoyed', '.']



## Multi-Word Expression Tokenization

In [13]:
# Multi-Word Expression Tokenization
tokenizer = MWETokenizer()
tokenizer.add_mwe(("can", "not"))
text = "I cannot go to the movies today"
tokens = tokenizer.tokenize(text.split())
print("Multi-Word Expression Tokenization:")
print(tokens)
print()

Multi-Word Expression Tokenization:
['I', 'cannot', 'go', 'to', 'the', 'movies', 'today']



# Subword tokenizers

## Detecting the type of tokenizer

In [14]:
import requests
url=  'https://raw.githubusercontent.com/Denis2054/Transformers-for-NLP-and-Computer-Vision-3rd-Edition/main/Chapter10/merges.txt'
output_filename = 'merges.txt'

response = requests.get(url)

if response.status_code == 200:
    with open(output_filename, 'wb') as file:
        file.write(response.content)
        print('File downloaded successfully.')
else:
    print('Error downloading file.')

File downloaded successfully.


In [15]:
import requests
url=  'https://raw.githubusercontent.com/Denis2054/Transformers-for-NLP-and-Computer-Vision-3rd-Edition/main/Chapter10/vocab.json'
output_filename = 'vocab.json'

response = requests.get(url)

if response.status_code == 200:
    with open(output_filename, 'wb') as file:
        file.write(response.content)
        print('File downloaded successfully.')
else:
    print('Error downloading file.')

File downloaded successfully.


Determining if the tokenizer is WordPiece or BPE

In [16]:
from transformers import RobertaTokenizer
# Download and cache the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Get the vocabulary
vocab = tokenizer.get_vocab()

# Check if WordPiece or another type of tokenization was used
is_wordpiece = any(token.startswith('##') for token in vocab)

# Print the tokenizer type
if is_wordpiece:
    print("Tokenizer type: WordPiece")
else:
    print("Tokenizer type: BPE")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Tokenizer type: WordPiece


In [17]:
from transformers import BertTokenizer

# Load the BERT tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Get the vocabulary
vocab = tokenizer.get_vocab()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [18]:
# Check if WordPiece or another type of tokenization was used
is_wordpiece = any(token.startswith('##') for token in vocab)

# Print the tokenizer type
if is_wordpiece:
    print("Tokenizer type: WordPiece")
else:
    print("Tokenizer type: BPE")

Tokenizer type: WordPiece


In [19]:
# Print the vocabulary
for token, id in vocab.items():
    print(f'{token}: {id}')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
##bham: 25522
##ference: 25523
##omics: 25524
m2: 25525
##bm: 25526
trois: 25527
##tropical: 25528
##в: 25529
commemorates: 25530
##meric: 25531
marge: 25532
##raction: 25533
1643: 25534
670: 25535
cosmetic: 25536
ravaged: 25537
##ige: 25538
catastrophe: 25539
eng: 25540
##shida: 25541
albrecht: 25542
arterial: 25543
bellamy: 25544
decor: 25545
harmon: 25546
##rde: 25547
bulbs: 25548
synchronized: 25549
vito: 25550
easiest: 25551
shetland: 25552
shielding: 25553
wnba: 25554
##glers: 25555
##ssar: 25556
##riam: 25557
brianna: 25558
cumbria: 25559
##aceous: 25560
##rard: 25561
cores: 25562
thayer: 25563
##nsk: 25564
brood: 25565
hilltop: 25566
luminous: 25567
carts: 25568
keynote: 25569
larkin: 25570
logos: 25571
##cta: 25572
##ا: 25573
##mund: 25574
##quay: 25575
lilith: 25576
tinted: 25577
277: 25578
wrestle: 25579
mobilization: 25580
##uses: 25581
sequential: 25582
siam: 25583
bloomfield: 25584
takahashi: 25585
274: 2558

## Displaying token-ID mappings

In [20]:
from tabulate import tabulate
import ipywidgets as widgets
from IPython.display import display
from transformers import BertTokenizer

# Load the BERT tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Get the vocabulary
vocab = tokenizer.get_vocab()

# Convert the vocabulary to a list of tuples
vocab_list = list(vocab.items())

# Print words one under the other
#for word, _ in vocab_list:
#    print(word)

In [21]:
# Sort the vocabulary by token
sorted_vocab = sorted(vocab_list, key=lambda x: x[0])

# Create a text input widget for filtering
filter_widget = widgets.Text(placeholder='Filter vocabulary')

# Function to filter and display the vocabulary
def filter_vocabulary(filter_text):
    filtered_vocab = [word for word in sorted_vocab if word[0].startswith(filter_text)]
    table = tabulate(filtered_vocab, headers=['Token', 'ID'])
    display(widgets.HTML(table))

# Call the filter function when the widget value changes
filter_widget.observe(lambda event: filter_vocabulary(event.new), names='value')

# Display the filter widget
display(filter_widget)

Text(value='', placeholder='Filter vocabulary')

## Analyzing and controlling the quality of token-ID mappings

In [22]:
# Load the BERT tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Function to tokenize a word and provide information
def tokenize_word(word):
    # Tokenize the word
    tokens = tokenizer.tokenize(word)
    # Check if the word was found directly or is the result of a subword process
    if len(tokens) == 1 and tokens[0] == word:
        process = "Direct"
    else:
        process = "Subword"
    # Display the word and process information
    print("Word:", word)
    print("Tokenized Tokens:", tokens)
    print("Tokenization Process:", process)

# Create a widget for entering the word
word_input = widgets.Text(description='Enter a Word:')
display(word_input)

# Create an event handler for the widget
def on_button_click(b):
    word = word_input.value
    tokenize_word(word)

# Create a button widget for triggering the tokenization process
button = widgets.Button(description="Tokenize")
button.on_click(on_button_click)
display(button)

Text(value='', description='Enter a Word:')

Button(description='Tokenize', style=ButtonStyle())