In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it")

In [2]:
import re

def perfect_tokenization(text):
    # Enhanced split pattern to include \n and \t explicitly and treat them as separate tokens
    # This pattern also includes ':' now
    tokens_with_delimiters = re.split('(\s|[.,;:?\n\t])', text)

    # Filter out empty strings from the list to avoid including them as tokens
    tokens_with_delimiters = [token for token in tokens_with_delimiters if token]

    # Initialize an empty list to store the processed tokens
    processed_tokens = []
    # Use a flag to mark when the previous character was a space to handle it in the next iteration
    was_space = False

    for token in tokens_with_delimiters:
        # Check if the token is a space or a special whitespace character
        if token in [' ', '\n', '\t']:
            if token == ' ':
                was_space = True
                continue
            else:
                # Directly add \n and \t as they should be treated as separate tokens
                processed_tokens.append(token)
        else:
            if was_space:
                # Prepend '▁' to indicate a preceding space for non-whitespace tokens
                token = '▁' + token
                was_space = False  # Reset the flag after using it

            processed_tokens.append(token)

    return processed_tokens

def actual_tokenization(text, tokenizer):
    return tokenizer.tokenize(text)

In [3]:
text_to_test = "This is an example. The text demonstrates subword tokenization."

In [5]:
perfect = perfect_tokenization(text_to_test)
print("perfect:", perfect)
actual = actual_tokenization(text_to_test, tokenizer)
print("actual: ", actual)

perfect: ['This', '▁is', '▁an', '▁example', '.', '▁The', '▁text', '▁demonstrates', '▁subword', '▁tokenization', '.']
actual:  ['This', '▁is', '▁an', '▁example', '.', '▁The', '▁text', '▁demonstrates', '▁sub', 'word', '▁token', 'ization', '.']


In [20]:
def calculate_metric(perfect_tokens, actual_tokens):
    merged_tokens = []  # Stores the result of merging tokens from actual_tokens
    i = 0  # Index for iterating over actual_tokens
    subwords = 0  # Count of subword merges
    words_before_subwording = []  # List to store words that were subworded

    while i < len(actual_tokens):
        current_token = actual_tokens[i]  # Current token being processed
        token_parts = [current_token]  # Parts that will be merged to match a perfect token

        # Attempt to merge tokens from actual_tokens to match the current perfect token
        while i + 1 < len(actual_tokens) and len(merged_tokens) < len(perfect_tokens) and \
                (current_token != perfect_tokens[len(merged_tokens)] and '▁' + current_token != perfect_tokens[len(merged_tokens)]):
            i += 1
            current_token += actual_tokens[i]  # Merge the next token
            token_parts.append(actual_tokens[i])  # Keep track of the parts being merged
            subwords += 1  # Increment the subword count for each merge

        # If more than one part was merged, the original word was subworded
        if len(token_parts) > 1:
            # Add the original form of the subworded token to the list
            # For simplicity, we join the parts with a '+' to indicate they were merged
            words_before_subwording.append('+'.join(token_parts))

        merged_tokens.append(current_token)
        i += 1
        
    # Calculate the percentage
    percentage = (len(perfect_tokens) - subwords) / len(perfect_tokens)

    return percentage, words_before_subwording

In [21]:
calculate_metric(perfect, actual)[0]

0.8181818181818182

In [16]:
text_to_test = """Bladder: What is the name of the organ that serves as a storage space for urine until it is expelled from the body?
Kidney: Which organ is primarily responsible for filtering blood, removing waste, and producing urine in the human body?
Ureter: What is the name of the tubes that transport urine from the kidneys to the bladder?
Urethra: What is the canal called through which urine is discharged from the bladder and exits the body?"""
p = perfect_tokenization(text_to_test)
a = actual_tokenization(text_to_test, tokenizer)
calculate_metric(p, a)

(0.9550561797752809, ['B+ladder', 'U+reter', 'U+re+thra'])

In [17]:
complicated_text="""The quick, brown fox jumps over the lazy dog; however, unexpected events—such as Schrödinger's cat paradox in Quantum Mechanics, the Krebs cycle in Biochemistry, and the use of monoclonal antibodies in Immunotherapy—demonstrate the complexity of science. In physics, we explore dark matter and antimatter; in biology, the intricacies of DNA replication and transcription; and in medicine, groundbreaking treatments like CRISPR-Cas9 gene editing. Furthermore, environmental studies on the "Anthropocene" epoch highlight human impact. Strange symbols like @, #, $, %, &, *, (, ), [, ], {, }, <, >, +, -, =, |, \, /, ^, ~, `, and even emoji 😊, challenge tokenization. Quotation marks "around words," apostrophes in contractions (it's, they're), and hyphenated-terms test the tokenizer's limits. Lastly, the incorporation of numbers, such as 3.14 for π, and scientific notations like 6.022e23 for Avogadro's number, along with chemical formulas like H2O, CO2, and C6H12O6, complete this multifaceted evaluation.
"""

In [18]:
p = perfect_tokenization(complicated_text)
a = actual_tokenization(complicated_text, tokenizer)
calculate_metric(p, a)

(0.28502415458937197,
 ['▁events+—+such',
  "▁Schrödinger+'+s",
  '▁Immun+otherapy+—+demon+strate',
  '▁anti+matter',
  '▁CRISPR+-+Cas+9',
  '▁"+Anthropo+cene+"',
  '▁$,+▁%,+▁&,+▁*,+▁(,+▁),+▁[,+▁],+▁{,+▁},+▁<+,+▁>+,+▁+,+▁-,+▁=+,+▁|+,+▁\\,+▁/+,+▁^+,+▁~+,+▁`,+▁and+▁even+▁emoji+▁😊+,+▁challenge+▁token+ization+.+▁Qu+otation+▁marks+▁"+around+▁words+,"+▁apost+rophes+▁in+▁contractions+▁(+it+\'+s+,+▁they+\'+re+),+▁and+▁hyphen+ated+-+terms+▁test+▁the+▁tokenizer+\'+s+▁limits+.+▁Lastly+,+▁the+▁incorporation+▁of+▁numbers+,+▁such+▁as+▁+3+.+1+4+▁for+▁π+,+▁and+▁scientific+▁notations+▁like+▁+6+.+0+2+2+e+2+3+▁for+▁Av+og+adro+\'+s+▁number+,+▁along+▁with+▁chemical+▁formulas+▁like+▁H+2+O+,+▁CO+2+,+▁and+▁C+6+H+1+2+O+6+,+▁complete+▁this+▁multifaceted+▁evaluation+.+\n'])

In [19]:
fast_quick = "fast faster fastest quick quicker quickest"
p = perfect_tokenization(fast_quick)
a = actual_tokenization(fast_quick, tokenizer)
calculate_metric(p, a)

(1.0, [])