# Unigram tokenization

In [1]:
# Small in-memory corpus of four sentences; the cells below pre-tokenize it
# and count word frequencies from it.
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

In [2]:
from transformers import AutoTokenizer

# Load the pretrained "xlnet-base-cased" tokenizer; the cells below use its
# backend pre-tokenizer (`pre_tokenize_str`) to split text into pieces.
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Show the built-in help text for the backend pre-tokenizer's `pre_tokenize_str` method.
help(tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str)

Help on built-in function pre_tokenize_str:

pre_tokenize_str(self, sequence) method of tokenizers.pre_tokenizers.Sequence instance
    Pre tokenize the given string

    This method provides a way to visualize the effect of a
    :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
    alignment, nor does it provide all the capabilities of the
    :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
    :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`

    Args:
        sequence (:obj:`str`):
            A string to pre-tokeize

    Returns:
        :obj:`List[Tuple[str, Offsets]]`:
            A list of tuple with the pre-tokenized parts and their offsets



In [4]:
# Pre-tokenize a sample sentence; per the help text above, the result is a
# list of (piece, offsets) tuples.
sent = "The LLM is trained to predict the next word."
tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(sent)

[('▁The', (0, 3)),
 ('▁LLM', (4, 7)),
 ('▁is', (8, 10)),
 ('▁trained', (11, 18)),
 ('▁to', (19, 21)),
 ('▁predict', (22, 29)),
 ('▁the', (30, 33)),
 ('▁next', (34, 38)),
 ('▁word.', (39, 44))]

## 统计词频

In [5]:
# For every sentence, recover each pre-tokenized word from its character
# offsets and print the words of that sentence on one line, each followed by a tab.
for text in corpus:
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    pieces = [text[start:end] for _, (start, end) in words_with_offsets]
    print("".join(piece + "\t" for piece in pieces))


This	is	the	Hugging	Face	Course.	
This	chapter	is	about	tokenization.	
This	section	shows	several	tokenizer	algorithms.	
Hopefully,	you	will	be	able	to	understand	how	they	are	trained	and	generate	tokens.	


In [6]:
from collections import defaultdict

# Count how often each pre-tokenized word occurs across the whole corpus.
pre_tokenize = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str
word_freqs = defaultdict(int)
for sentence in corpus:
    for word, _ in pre_tokenize(sentence):
        word_freqs[word] += 1


In [7]:
# Display the word counts collected from the corpus.
word_freqs

defaultdict(int,
            {'▁This': 3,
             '▁is': 2,
             '▁the': 1,
             '▁Hugging': 1,
             '▁Face': 1,
             '▁Course.': 1,
             '▁chapter': 1,
             '▁about': 1,
             '▁tokenization.': 1,
             '▁section': 1,
             '▁shows': 1,
             '▁several': 1,
             '▁tokenizer': 1,
             '▁algorithms.': 1,
             '▁Hopefully,': 1,
             '▁you': 1,
             '▁will': 1,
             '▁be': 1,
             '▁able': 1,
             '▁to': 1,
             '▁understand': 1,
             '▁how': 1,
             '▁they': 1,
             '▁are': 1,
             '▁trained': 1,
             '▁and': 1,
             '▁generate': 1,
             '▁tokens.': 1})

## 初始化词汇表

然后，我们需要将我们的词汇表初始化为比最终所需词汇表大小更大的值。我们必须包含所有基本字符（否则我们将无法对每个单词进行分词），但对于较大的子字符串，我们只会保留最常见的那些，因此我们按频率对它们进行排序：

In [8]:
# Preview only: list every contiguous substring of the first word, then stop.
for word, freq in word_freqs.items():
    print(f"word: {word}")
    # range() is half-open, [start, stop), hence the "+ 1" on the upper bound
    # so that substrings ending at the last character are included.
    subwords = [
        word[i:j]
        for i in range(len(word))
        for j in range(i + 1, len(word) + 1)
    ]
    print(f"subwords: {subwords}")
    break

word: ▁This
subwords: ['▁', '▁T', '▁Th', '▁Thi', '▁This', 'T', 'Th', 'Thi', 'This', 'h', 'hi', 'his', 'i', 'is', 's']


In [9]:
# Tally single characters and every substring of length >= 2, each weighted
# by the frequency of the word it occurs in.
char_freqs = defaultdict(int)
subwords_freqs = defaultdict(int)
for word, freq in word_freqs.items():
    word_len = len(word)
    for i, char in enumerate(word):
        char_freqs[char] += freq
        for j in range(i + 2, word_len + 1):
            subwords_freqs[word[i:j]] += freq

# Most frequent substrings first; the sort is stable, so ties keep their
# first-seen order.
sorted_subwords = sorted(subwords_freqs.items(), key=lambda item: -item[1])
sorted_subwords[:10]

[('▁t', 7),
 ('is', 5),
 ('er', 5),
 ('▁a', 5),
 ('▁to', 4),
 ('to', 4),
 ('en', 4),
 ('▁T', 3),
 ('▁Th', 3),
 ('▁Thi', 3)]

我们将所有字符与频率最高的子词组合起来，以得到一个包含 300 个词元的初始词汇表：

In [10]:
# Initial vocabulary: every single character first, then the most frequent
# multi-character substrings, sized so the combined list has up to 300 entries.
num_subwords = 300 - len(char_freqs)
token_freqs = [*char_freqs.items(), *sorted_subwords[:num_subwords]]
token_freqs[:10]

[('▁', 31),
 ('T', 3),
 ('h', 9),
 ('i', 13),
 ('s', 13),
 ('t', 14),
 ('e', 21),
 ('H', 2),
 ('u', 6),
 ('g', 5)]

In [11]:
# Turn the list of (token, freq) pairs into a dict keyed by token.
# dict() accepts both a sequence of pairs and an existing mapping, so this cell
# can be re-run safely after `token_freqs` has already become a dict.
token_freqs = dict(token_freqs)
token_freqs

{'▁': 31,
 'T': 3,
 'h': 9,
 'i': 13,
 's': 13,
 't': 14,
 'e': 21,
 'H': 2,
 'u': 6,
 'g': 5,
 'n': 11,
 'F': 1,
 'a': 12,
 'c': 3,
 'C': 1,
 'o': 13,
 'r': 9,
 '.': 4,
 'p': 2,
 'b': 3,
 'k': 3,
 'z': 2,
 'w': 3,
 'v': 1,
 'l': 7,
 'm': 1,
 'f': 1,
 'y': 3,
 ',': 1,
 'd': 4,
 '▁t': 7,
 'is': 5,
 'er': 5,
 '▁a': 5,
 '▁to': 4,
 'to': 4,
 'en': 4,
 '▁T': 3,
 '▁Th': 3,
 '▁Thi': 3,
 '▁This': 3,
 'Th': 3,
 'Thi': 3,
 'This': 3,
 'hi': 3,
 'his': 3,
 'th': 3,
 'ou': 3,
 'se': 3,
 '▁tok': 3,
 '▁toke': 3,
 '▁token': 3,
 'tok': 3,
 'toke': 3,
 'token': 3,
 'ok': 3,
 'oke': 3,
 'oken': 3,
 'ke': 3,
 'ken': 3,
 '▁s': 3,
 'ra': 3,
 'nd': 3,
 '▁i': 2,
 '▁is': 2,
 '▁th': 2,
 '▁the': 2,
 'the': 2,
 'he': 2,
 '▁H': 2,
 'in': 2,
 'rs': 2,
 'te': 2,
 '▁ab': 2,
 'ab': 2,
 '▁tokeni': 2,
 '▁tokeniz': 2,
 'tokeni': 2,
 'tokeniz': 2,
 'okeni': 2,
 'okeniz': 2,
 'keni': 2,
 'keniz': 2,
 'eni': 2,
 'eniz': 2,
 'ni': 2,
 'niz': 2,
 'iz': 2,
 'at': 2,
 'ti': 2,
 'tio': 2,
 'tion': 2,
 'io': 2,
 'ion': 2,
 'on':

## 初始化模型

接下来，我们计算所有频率的总和，将频率转换为概率。然而，对于我们的分词模型，我们将会在它里面存储概率的负对数（即 -log p），因为累加对数相比累乘小数在数值上更加稳定，同时，也会简化模型损失的计算。

In [12]:
from math import log

# Sum of all token frequencies; used as the denominator when converting
# frequencies to probabilities.
total_sum = sum(token_freqs.values())
total_sum

594

In [13]:
# Score each token by its negative log relative frequency, -log(freq / total_sum);
# rarer tokens therefore get larger scores.
model = {token: -log(freq / total_sum) for token, freq in token_freqs.items()}
model

{'▁': 2.952892114877499,
 'T': 5.288267030694535,
 'h': 4.189654742026425,
 'i': 3.821929961901108,
 's': 3.821929961901108,
 't': 3.7478219897473863,
 'e': 3.342356881639222,
 'H': 5.6937321388027,
 'u': 4.59511985013459,
 'g': 4.777441406928545,
 'n': 3.9889840465642745,
 'F': 6.386879319362645,
 'a': 3.9019726695746444,
 'c': 5.288267030694535,
 'C': 6.386879319362645,
 'o': 3.821929961901108,
 'r': 4.189654742026425,
 '.': 5.000584958242754,
 'p': 5.6937321388027,
 'b': 5.288267030694535,
 'k': 5.288267030694535,
 'z': 5.6937321388027,
 'w': 5.288267030694535,
 'v': 6.386879319362645,
 'l': 4.440969170307332,
 'm': 6.386879319362645,
 'f': 6.386879319362645,
 'y': 5.288267030694535,
 ',': 6.386879319362645,
 'd': 5.000584958242754,
 '▁t': 4.440969170307332,
 'is': 4.777441406928545,
 'er': 4.777441406928545,
 '▁a': 4.777441406928545,
 '▁to': 5.000584958242754,
 'to': 5.000584958242754,
 'en': 5.000584958242754,
 '▁T': 5.288267030694535,
 '▁Th': 5.288267030694535,
 '▁Thi': 5.2882670

In [14]:
# Sanity check: every score is -log(p) with p <= 1, so it must be >= 0.
# The assertion message names the offending token (print() would return None
# and leave the AssertionError without a message).
for token, loss in model.items():
    assert loss >= 0, f"negative score for token {token!r}: {loss}"

## 维特比算法

In [15]:
# def encode_word(word, model):
#     best_segmentations = [{"start": 0, "score": 1}] + [
#         {"start": None, "score": None} for _ in range(len(word))
#     ]  # 带有起点的词网（有向图），列表的每个元素代表词网的一个节点，start代表该节点的前驱节点，score代表起点到该节点的最短距离
#     for start_idx in range(len(word)):
#         # This should be properly filled by the previous steps of the loop
#         best_score_at_start = best_segmentations[start_idx]["score"]
#         for end_idx in range(start_idx + 1, len(word) + 1):
#             token = word[start_idx:end_idx]
#             if token in model and best_score_at_start is not None:
#                 score = model[token] + best_score_at_start
#                 # If we have found a better segmentation ending at end_idx, we update
#                 if (
#                     best_segmentations[end_idx]["score"] is None
#                     or best_segmentations[end_idx]["score"] > score
#                 ):
#                     best_segmentations[end_idx] = {"start": start_idx, "score": score}

#     segmentation = best_segmentations[-1]
#     if segmentation["score"] is None:
#         # We did not find a tokenization of the word -> unknown
#         return ["<unk>"], None

#     score = segmentation["score"]
#     start = segmentation["start"]
#     end = len(word)
#     tokens = []
#     while start != 0:
#         tokens.insert(0, word[start:end])
#         next_start = best_segmentations[start]["start"]
#         end = start
#         start = next_start
#     tokens.insert(0, word[start:end])
#     return tokens, score

In [16]:
# encode_word("Hopefully", model)

In [17]:
from collections import deque
from typing import Dict

def encode_word(word: str, model: Dict[str, float]):
    """Find the lowest-cost segmentation of `word` into tokens known to `model`.

    This is a shortest-path search over split positions: position 0 is the
    start, position len(word) is the goal, and a token word[i:j] present in
    `model` is an edge i -> j whose cost is model[token].

    Args:
        word: The string to segment.
        model: Maps each known token to its cost.

    Returns:
        A `(tokens, score)` tuple. `score` is the accumulated cost of the best
        path, including the constant starting offset of 1. If `word` cannot be
        built from tokens in `model`, returns `(["<unk>"], inf)`. An empty
        `word` returns `([], 1)`.
    """
    n = len(word)
    unreachable = float("inf")
    # dp[k]: lowest cost found so far for word[:k].
    # pres[k]: start index of the last token on that best path (None if unset).
    dp = [unreachable] * (n + 1)
    pres = [None] * (n + 1)
    dp[0] = 1  # constant offset; every complete path includes it exactly once
    for i in range(n):
        if dp[i] == unreachable:
            # word[:i] has no segmentation, so nothing can be extended from here.
            continue
        for j in range(i + 1, n + 1):
            candidate = dp[i] + get_dist(word, model, i, j)
            if candidate < dp[j]:
                dp[j] = candidate
                pres[j] = i

    if n > 0 and pres[n] is None:
        # The end of the word was never reached: report it as unknown instead of
        # silently returning an empty token list.
        return ["<unk>"], dp[n]

    # Walk the predecessor links back from the end to rebuild the token sequence.
    path = deque()
    start, end = pres[n], n
    while start is not None:
        path.appendleft(word[start:end])
        start, end = pres[start], start
    return list(path), dp[n]


def get_dist(word: str, model: Dict[str, float], start: int, end: int):
    """Return the cost of token word[start:end], or inf if `model` lacks it."""
    return model.get(word[start:end], float("inf"))

In [18]:
# Segment a single word with the current model; the result is (tokens, score).
encode_word("Hopefully", model)

(['H', 'o', 'p', 'e', 'f', 'u', 'll', 'y'], 41.5157494601402)

## 计算整个语料库的损失

In [19]:
from typing import Dict

def compute_losses(model: Dict[str, float]):
    """Return the loss of the whole corpus under `model`.

    Each word in the module-level `word_freqs` is encoded with `encode_word`,
    and its score is weighted by how often the word occurs.
    """
    return sum(
        freq * encode_word(word=word, model=model)[1]
        for word, freq in word_freqs.items()
    )

OK，使用刚初始化、尚未训练的模型来计算整个语料库的损失：

In [20]:
# Corpus loss under the initial model, before any tokens are pruned.
# NOTE: the variable name is spelled `lossed_initial`; the assertion after
# training refers to this exact spelling.
lossed_initial = compute_losses(model=model)
lossed_initial

413.10377642940875

## 训练模型

In [21]:
import copy

def compute_loss(model: Dict[str, float]):
    """Score each multi-character token by how much the corpus loss rises without it.

    Single-character tokens are skipped so that they are never candidates for
    removal.

    Args:
        model: Maps each token to its score.

    Returns:
        A dict mapping every token longer than one character to
        `compute_losses(model without that token) - compute_losses(model)`.
        Low values mark tokens that can be removed at little cost.
    """
    scores = {}
    losses_before = compute_losses(model)  # loss with the full vocabulary
    for token in model:
        if len(token) == 1:  # always keep character-level tokens
            continue
        # A filtered shallow copy is enough: `model` is a flat dict (str -> float
        # per its annotation), so no deep copy is needed.
        model_without_token = {t: s for t, s in model.items() if t != token}
        scores[token] = compute_losses(model_without_token) - losses_before
    return scores


In [22]:
# Score every multi-character token, then inspect two of them; each value is
# the change in corpus loss when that token is removed from the model.
scores = compute_loss(model)
print(scores["ll"])
print(scores["his"])

6.376412403623874
0.0


正式训练：

In [23]:
# Prune the vocabulary in rounds: each round drops the tokens whose removal
# increases the corpus loss the least (about 10% of the current vocabulary),
# then rebuilds the model from the remaining frequencies. Stops once the
# vocabulary has at most 100 tokens.
percent_to_remove = 0.1
while len(model) > 100:
    scores = compute_loss(model)
    sorted_scores = sorted(scores.items(), key=lambda x: x[1])
    # Never ask for more tokens than are actually removable (single characters
    # are not scored), and stop instead of failing when nothing can be removed.
    num_to_remove = min(int(percent_to_remove * len(model)), len(sorted_scores))
    if num_to_remove == 0:
        break
    for removable_token, _score in sorted_scores[:num_to_remove]:
        token_freqs.pop(removable_token)
    total_sum = sum(freq for _, freq in token_freqs.items())
    model = {token: -log(freq / total_sum) for token, freq in token_freqs.items()}

In [24]:
# Recompute the corpus loss with the pruned vocabulary and check that it is
# lower than the loss measured before training.
losses_trained = compute_losses(model)
assert losses_trained < lossed_initial
losses_trained

364.2621620280587

In [25]:
# Sanity check on the trained model: every score is -log(p) with p <= 1, so it
# must be >= 0. The assertion message names the offending token (print() would
# return None and leave the AssertionError without a message).
for token, loss in model.items():
    assert loss >= 0, f"negative score for token {token!r}: {loss}"

## 分词

In [26]:
# Demo: sum() with an empty-list start value concatenates a list of lists.
lst = [["a", "b", "c"]]
sum(lst, [])

['a', 'b', 'c']

In [27]:
def tokenize(text, model):
    """Split `text` into tokens using `model`.

    The text is first pre-tokenized with the module-level `tokenizer`; each
    resulting piece is then segmented by `encode_word`.

    Args:
        text: The string to tokenize.
        model: Token -> score mapping passed through to `encode_word`.

    Returns:
        A flat list with the tokens of every piece, in order.
    """
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    tokens = []
    for word, _ in words_with_offsets:
        word_tokens, _score = encode_word(word, model)
        # extend() flattens as we go; sum(list_of_lists, []) would rebuild the
        # result list on every addition.
        tokens.extend(word_tokens)
    return tokens

In [28]:
# Tokenize a sample sentence with the trained (pruned) model.
tokenize("This is the Hugging Face course.", model)

['▁This',
 '▁is',
 '▁the',
 '▁Hugging',
 '▁Face',
 '▁',
 'c',
 'ou',
 'r',
 's',
 'e',
 '.']