#Exploring tokenizers
Copyright 2023, Denis Rothman



#Installing libraries

In [1]:
#Hugging Face Transformers
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 transformers-4.29.2


In [2]:
#Printing tabular data in Python
!pip install tabulate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
#Natural Language Toolkit
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Word and sentence tokenizers

In [4]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer, \
    TreebankWordTokenizer, WhitespaceTokenizer, PunktSentenceTokenizer, \
    WordPunctTokenizer, MWETokenizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Sentence Tokenization

In [5]:
# Sentence Tokenization
text = "This is a sentence. This is another one."
sentences = sent_tokenize(text)
print("Sentence Tokenization:")
print(sentences)
print()

Sentence Tokenization:
['This is a sentence.', 'This is another one.']



## Word Tokenization

In [6]:
# Word Tokenization
sentence = "This sentence contains several words."
words = word_tokenize(sentence)
print("Word Tokenization:")
print(words)
print()

Word Tokenization:
['This', 'sentence', 'contains', 'several', 'words', '.']



## Regular Expression Tokenization

In [7]:
# Regular Expression Tokenization
tokenizer = RegexpTokenizer(r'\w+')
text = "Let's see how to tokenize a sentence."
tokens = tokenizer.tokenize(text)
print("Regular Expression Tokenization:")
print(tokens)
print()

Regular Expression Tokenization:
['Let', 's', 'see', 'how', 'to', 'tokenize', 'a', 'sentence']



## Treebank Tokenization

In [8]:
# Treebank Tokenization
tokenizer = TreebankWordTokenizer()
text = "There aren't that many tokenizers."
tokens = tokenizer.tokenize(text)
print("Treebank Tokenization:")
print(tokens)
print()

Treebank Tokenization:
['There', 'are', "n't", 'that', 'many', 'tokenizers', '.']



## White Space Tokenization

In [9]:
# White Space Tokenizationtokenizer
from nltk.tokenize import WhitespaceTokenizer
tokenizer=WhitespaceTokenizer()
text = "Tokenize this sequence of words using white space. There aren't many words."
tokens = tokenizer.tokenize(text)
print("White Space Tokenization:")
print(tokens)
print()

White Space Tokenization:
['Tokenize', 'this', 'sequence', 'of', 'words', 'using', 'white', 'space.', 'There', "aren't", 'many', 'words.']



## Punkt Sentence Tokenization

In [10]:
# Punkt Sentence Tokenization
tokenizer = PunktSentenceTokenizer()
text = "A tokenizer can be trained. Many tokenizers aren't trained."
sentences = tokenizer.tokenize(text)
print("Punkt Sentence Tokenization:")
print(sentences)
print()

Punkt Sentence Tokenization:
['A tokenizer can be trained.', "Many tokenizers aren't trained."]



## Word Punctuation Tokenization

In [11]:
# Word Punctuation Tokenization
tokenizer = WordPunctTokenizer()
text = "They won a prize! They were overjoyed."
tokens = tokenizer.tokenize(text)
print("Word Punctuation Tokenization:")
print(tokens)
print()

Word Punctuation Tokenization:
['They', 'won', 'a', 'prize', '!', 'They', 'were', 'overjoyed', '.']



## Multi-Word Expression Tokenization

In [12]:
# Multi-Word Expression Tokenization
tokenizer = MWETokenizer()
tokenizer.add_mwe(("can", "not"))
text = "I cannot go to the movies today"
tokens = tokenizer.tokenize(text.split())
print("Multi-Word Expression Tokenization:")
print(tokens)
print()

Multi-Word Expression Tokenization:
['I', 'cannot', 'go', 'to', 'the', 'movies', 'today']



# Subword tokenizers

## Detecting the type of tokenizer

beginning of WIP code until book title finalized

In [13]:
from google.colab import drive
drive.mount('/content/drive')
with open("drive/MyDrive/files/github.txt", "r") as f:
      PERSONAL_ACCESS_TOKEN = f.readline().strip()

Mounted at /content/drive


In [14]:
import requests
url=  'https://raw.githubusercontent.com/Denis2054/Transformers_3rd_Edition/main/Chapter08/merges.txt'
output_filename = 'merges.txt'
token =  PERSONAL_ACCESS_TOKEN

headers = {
    'Authorization': f'Token {token}'
}

response = requests.get(url, headers=headers)

if response.status_code == 200:
    with open(output_filename, 'wb') as file:
        file.write(response.content)
        print('File downloaded successfully.')
else:
    print('Error downloading file.')

File downloaded successfully.


In [15]:
import requests
url=  'https://raw.githubusercontent.com/Denis2054/Transformers_3rd_Edition/main/Chapter08/vocab.json'
output_filename = 'vocab.json'
token =  PERSONAL_ACCESS_TOKEN

headers = {
    'Authorization': f'Token {token}'
}

response = requests.get(url, headers=headers)

if response.status_code == 200:
    with open(output_filename, 'wb') as file:
        file.write(response.content)
        print('File downloaded successfully.')
else:
    print('Error downloading file.')

File downloaded successfully.


end of WIP code until book title finalized

In [16]:
#1.Load merges.txt using the Colab file manager
#2.Downloading the file from GitHub
#!curl -L https://raw.githubusercontent.com/Denis2054/***book title***/main/Chapter09/merges.txt --output "merges.txt"

In [17]:
#1.Load vocab.json using the Colab file manager
#2.Downloading the file from GitHub
#!curl -L https://raw.githubusercontent.com/Denis2054/Transformers-for-NLP-2nd-Edition/main/Chapter09/vocab.txt --output "vocab.json"

Determining if the tokenizer is WordPiece or BPE

In [24]:
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained("/content", max_length=512)

# Get the vocabulary
vocab = tokenizer.get_vocab()

# Check if WordPiece or another type of tokenization was used
is_wordpiece = any(token.startswith('##') for token in vocab)

# Print the tokenizer type
if is_wordpiece:
    print("Tokenizer type: WordPiece")
else:
    print("Tokenizer type: BPE")

Tokenizer type: BPE


In [25]:
from transformers import BertTokenizer

# Load the BERT tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Get the vocabulary
vocab = tokenizer.get_vocab()

In [27]:
# Check if WordPiece or another type of tokenization was used
is_wordpiece = any(token.startswith('##') for token in vocab)

# Print the tokenizer type
if is_wordpiece:
    print("Tokenizer type: WordPiece")
else:
    print("Tokenizer type: BPE")

Tokenizer type: WordPiece


In [None]:
# Print the vocabulary
for token, id in vocab.items():
    print(f'{token}: {id}')

## Displaying token-ID mappings

In [28]:
from tabulate import tabulate
import ipywidgets as widgets
from IPython.display import display
from transformers import BertTokenizer

# Load the BERT tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Get the vocabulary
vocab = tokenizer.get_vocab()

# Convert the vocabulary to a list of tuples
vocab_list = list(vocab.items())

# Print words one under the other
#for word, _ in vocab_list:
#    print(word)

In [32]:
# Sort the vocabulary by token
sorted_vocab = sorted(vocab_list, key=lambda x: x[0])

# Create a text input widget for filtering
filter_widget = widgets.Text(placeholder='Filter vocabulary')

# Function to filter and display the vocabulary
def filter_vocabulary(filter_text):
    filtered_vocab = [word for word in sorted_vocab if word[0].startswith(filter_text)]
    table = tabulate(filtered_vocab, headers=['Token', 'ID'])
    display(widgets.HTML(table))

# Call the filter function when the widget value changes
filter_widget.observe(lambda event: filter_vocabulary(event.new), names='value')

# Display the filter widget
display(filter_widget)

Text(value='', placeholder='Filter vocabulary')

HTML(value='Token            ID\n------------  -----\n#              1001\n##!           29612\n##"           …

## Analyzing and controlling the quality of token-ID mappings

In [33]:
# Load the BERT tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Function to tokenize a word and provide information
def tokenize_word(word):
    # Tokenize the word
    tokens = tokenizer.tokenize(word)
    # Check if the word was found directly or is the result of a subword process
    if len(tokens) == 1 and tokens[0] == word:
        process = "Direct"
    else:
        process = "Subword"
    # Display the word and process information
    print("Word:", word)
    print("Tokenized Tokens:", tokens)
    print("Tokenization Process:", process)

# Create a widget for entering the word
word_input = widgets.Text(description='Enter a Word:')
display(word_input)

# Create an event handler for the widget
def on_button_click(b):
    word = word_input.value
    tokenize_word(word)

# Create a button widget for triggering the tokenization process
button = widgets.Button(description="Tokenize")
button.on_click(on_button_click)
display(button)

Text(value='', description='Enter a Word:')

Button(description='Tokenize', style=ButtonStyle())

Word: word
Tokenized Tokens: ['word']
Tokenization Process: Direct
Word: word
Tokenized Tokens: ['word']
Tokenization Process: Direct
Word: word
Tokenized Tokens: ['word']
Tokenization Process: Direct
Word: word
Tokenized Tokens: ['word']
Tokenization Process: Direct
Word: word
Tokenized Tokens: ['word']
Tokenization Process: Direct
