In [None]:
import os
import pandas as pd

# set warnings: Warning is the base class of every warning category, so this
# filter silences ALL warnings for the whole kernel session
# NOTE(review): presumably meant to keep cell outputs clean - consider narrowing
# the category if deprecation notices from the libraries should stay visible
import warnings
warnings.simplefilter(action='ignore', category=Warning)

# import modules and classes
from TokenExplorer.commons.utils.downloads import DownloadManager
from TokenExplorer.commons.utils.plotter import DataPlotter
from TokenExplorer.commons.utils.analyzer.explorer import ExploreTokenizers
from TokenExplorer.commons.constants import BENCHMARK_FIGURES_PATH 
from TokenExplorer.commons.logger import logger

## 1. Load tokenizers and text dataset

Download a series of tokenizers from Hugging Face and save them in /tokenizers. Then, download text corpora for tokenizer benchmarking and save them in /data

In [None]:
# create the download manager and use it to get the tokenizers and the text datasets;
# both results are kept as notebook-level variables (tokenizers is passed to the
# explorer in the vocabulary analysis section)
manager = DownloadManager()
tokenizers = manager.tokenizer_download()
datasets = manager.dataset_download()

## 2. Tokenizers vocabulary analysis

Check the size of each tokenizer's vocabulary using two methods: 1) extraction of the embedded vocabulary and 2) decoding of the indexes of the embedded vocabulary. The idea is to compare the two resulting sets of words and spot any discrepancies.

In [None]:
# build the explorer on top of the downloaded tokenizers, then run the vocabulary
# report and the vocabulary size plot
# NOTE(review): BENCHMARK_FIGURES_PATH is presumably the folder where the figure
# is saved - confirm against ExploreTokenizers.plot_vocabulary_size
explorer = ExploreTokenizers(tokenizers)
explorer.vocabulary_report()
explorer.plot_vocabulary_size(BENCHMARK_FIGURES_PATH)

Analyze the distribution of tokens by character length using histograms and boxplots, comparing the distribution of word lengths taken from the vocabulary with the one obtained by decoding the indexes.

In [None]:
# token length distribution, first as histogram and then as boxplot; both calls
# receive the same figures path
explorer.histogram_tokens_length(BENCHMARK_FIGURES_PATH)
explorer.boxplot_tokens_length(BENCHMARK_FIGURES_PATH)

Compare the number of subwords versus normal words, both for the words taken from the vocabulary and for those obtained by decoding the indexes.

In [None]:
# subwords versus words comparison, called with the same figures path as the other plots
explorer.subwords_vs_words(BENCHMARK_FIGURES_PATH)

## 3. Tokenizers benchmark on text datasets

Tokenizers are benchmarked on the wikitext-103-v1 dataset. The benchmark consists of comparing the text before and after tokenization and calculating word and token counts, average lengths, by-item lengths, and the ratio between tokens and words.

In [None]:
# the plotter exposes the benchmark results as attributes: keep them in
# notebook-level dataframes for the plotting cells below
# NOTE(review): df_NSL is checked against None before being filtered later on,
# so NSL_data may be missing - confirm in DataPlotter
plotter = DataPlotter()
df_benchmarks = plotter.benchmark_data
df_NSL = plotter.NSL_data

Plot a series of metrics to evaluate the performance of the tokenizers on the given text. 

1) Token to word ratio, shown to evaluate the number of generated tokens versus the number of words in the text (by document)
2) Average character length of tokens versus average length of words (average by document)
3) Bytes per token, calculated by dividing the number of UTF-8 bytes by the number of tokens produced by the tokenizer on a given text

In [None]:
# 1) boxplots of the token to word ratio by document, one box per tokenizer
plotter.benchmarks_boxplot(df_benchmarks, BENCHMARK_FIGURES_PATH, x_vals='Tokenizer',
                           y_vals='Tokens/words ratio', y_label='Token to word ratio (by document)',
                           hue=None, title='Tokens to words ratio by tokenizer')

# 2) average length of words versus tokens: melt the two AVG columns into long
# format ('Item type' holds the original column name, 'AVG length' its value),
# so that both series can be drawn side by side for each tokenizer using the
# item type as hue
# NOTE(review): assumes benchmarks_boxplot forwards hue to the seaborn call - confirm
df_melt = pd.melt(df_benchmarks, id_vars='Tokenizer', value_vars=['AVG words length', 'AVG tokens length'],
                  var_name='Item type', value_name='AVG length')
plotter.benchmarks_boxplot(df_melt, BENCHMARK_FIGURES_PATH, x_vals='Tokenizer',
                           y_vals='AVG length', y_label='Average length in characters (by document)',
                           hue='Item type', title='Average token vs word length by tokenizer')

# 3) boxplots of the UTF-8 bytes per token, one box per tokenizer
plotter.benchmarks_boxplot(df_benchmarks, BENCHMARK_FIGURES_PATH, x_vals='Tokenizer',
                           y_vals='Bytes per token', y_label='',
                           hue=None, title='Bytes (utf-8) per token')

Visualize the Normalized Sequence Length (NSL), comparing the compression of our custom tokenizer with that of each Hugging Face tokenizer.

In [None]:
# NSL data may be missing (None): filter and plot only when it is available,
# instead of handing None to the plotting call
if df_NSL is not None:
    # drop the rows labelled 'custom tokenizer', so only the other tokenizers are plotted
    df_NSL = df_NSL[df_NSL['Tokenizer'] != 'custom tokenizer']
    plotter.benchmarks_boxplot(df_NSL, BENCHMARK_FIGURES_PATH, x_vals='Tokenizer',
                               y_vals='NSL', y_label='', hue=None, title='Normalized Sequence Length (NSL)')
else:
    logger.warning('NSL data is not available: skipping the NSL boxplot')