# Subword

In [1]:
from transformers import AutoTokenizer
from collections import defaultdict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# English sample sentence fed to each tokenizer's pre-tokenizer in the cells below.
# Written as two adjacent literals (joined by the parser) to keep lines short;
# the resulting string is unchanged. Note that "its" appears twice.
example_en = (
    "Anthropic is a public benefit corporation "
    "dedicated to securing its benefits and mitigating its risks."
)

## GPT派系 (BPE)：
(1): 利用 `Ġ` 作為分割字的方法，通常代表空格
<br>
(2): 會盡量形成最完整的單字

In [3]:
# Load the pretrained "gpt2" tokenizer; the next cell uses its backend
# pre-tokenizer to split the example sentence into words.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [4]:
# Run only the pre-tokenization step of the currently loaded tokenizer.
# Each returned item is a (word, offsets) pair; the offsets are not needed here.
words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(example_en)
new_words = [pair[0] for pair in words_with_offsets]

# Tally how many times each pre-tokenized word occurs, in first-seen order.
# In the recorded output for this cell, words that followed a space carry a
# leading `Ġ`.
word_freqs = defaultdict(int)
for word in new_words:
    word_freqs[word] = word_freqs[word] + 1

print(word_freqs)

defaultdict(<class 'int'>, {'Anthropic': 1, 'Ġis': 1, 'Ġa': 1, 'Ġpublic': 1, 'Ġbenefit': 1, 'Ġcorporation': 1, 'Ġdedicated': 1, 'Ġto': 1, 'Ġsecuring': 1, 'Ġits': 2, 'Ġbenefits': 1, 'Ġand': 1, 'Ġmitigating': 1, 'Ġrisks': 1, '.': 1})


## BERT 派系 (WordPiece)：
(1): 完整的詞是由「前綴（+ 後綴）」組成的；後綴會以 `##` 開頭，如果沒有 `##` 就代表是詞的開頭
<br>
(2): 著重於觀察字的拆解

In [5]:
# Load the pretrained "bert-base-cased" tokenizer.
# NOTE: this rebinds `tokenizer`, so the GPT-2 tokenizer loaded earlier is no
# longer reachable under that name; re-run the GPT-2 loading cell before
# re-running its word-count cell.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [6]:
# Run only the pre-tokenization step of the currently loaded tokenizer.
# Each returned item is a (word, offsets) pair; the offsets are not needed here.
words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(example_en)
new_words = [pair[0] for pair in words_with_offsets]

# Tally how many times each pre-tokenized word occurs, in first-seen order.
# In the recorded output for this cell, the words carry no space marker.
word_freqs = defaultdict(int)
for word in new_words:
    word_freqs[word] = word_freqs[word] + 1

print(word_freqs)

defaultdict(<class 'int'>, {'Anthropic': 1, 'is': 1, 'a': 1, 'public': 1, 'benefit': 1, 'corporation': 1, 'dedicated': 1, 'to': 1, 'securing': 1, 'its': 2, 'benefits': 1, 'and': 1, 'mitigating': 1, 'risks': 1, '.': 1})
