## 空白斷詞

In [71]:
# Simulated text corpus
english_sentence = [
    'I love natural language processing',
    'Hello Python',
    'I like Apple',
    'I am a human',
    'You are a robot',
]

# Collect every distinct token produced by splitting each sentence on single spaces.
unique_tokens = set()
for sentence in english_sentence:
    tokens = sentence.split(' ')  # whitespace tokenization
    unique_tokens.update(tokens)

# The set already removed duplicates; sorted() yields the ordered vocabulary list.
vocab = sorted(unique_tokens)
print(vocab)

['Apple', 'Hello', 'I', 'Python', 'You', 'a', 'am', 'are', 'human', 'language', 'like', 'love', 'natural', 'processing', 'robot']


In [72]:
class Tokenizer:
    """Whitespace tokenizer that converts words into integer ids.

    '<UNK>' is placed in front of the supplied vocabulary, and words that are
    not found in the lookup table are encoded with the id of '<UNK>'.
    """

    def __init__(self, vocab):
        """Build the word -> id lookup table from a list of vocabulary words."""
        # Prepend '<UNK>' so out-of-vocabulary words have an id to fall back on.
        full_vocab = ['<UNK>'] + vocab
        self.tokens_to_ids = {}
        for position, entry in enumerate(full_vocab):
            self.tokens_to_ids[entry] = position

    def __call__(self, sentence):
        """Split `sentence` on whitespace and return the list of word ids."""
        pieces = sentence.split()
        fallback_id = self.tokens_to_ids['<UNK>']
        encoded = []
        for piece in pieces:
            encoded.append(self.tokens_to_ids.get(piece, fallback_id))
        return encoded
        
    
tokenizer = Tokenizer(vocab) # Instantiate the tokenizer with the vocabulary built above
input_ids = tokenizer('processing & process') # Encode a sentence; words missing from the vocabulary get the <UNK> id
print(input_ids)

[14, 0, 0]


## 建立BPE tokenizer

In [73]:
from collections import Counter

class BPE:
    """Minimal byte-pair-encoding (BPE) tokenizer learned from a word-frequency table.

    Each word starts as a tuple of single characters. Every merge step joins
    one adjacent symbol pair into a new token and registers that token with
    the next free integer id. Only the pad token, the unknown token and merged
    tokens are ever registered, so a single character that was never produced
    by a merge is encoded as the unknown token.
    """

    def __init__(self, vocab, pad_token='<PAD>', unk_token='<UNK>'):
        """Initialise the tokenizer.

        Args:
            vocab: Mapping of word -> number of occurrences.
            pad_token: Token used for padding; receives id 0.
            unk_token: Token used for unknown sub-words; receives id 1.
        """
        self.pad_token = pad_token  # padding token
        self.unk_token = unk_token  # unknown token
        # Each word becomes a tuple of its characters so it can serve as a dict key.
        self.vocab = {tuple(word): freq for word, freq in vocab.items()}
        self.tokens = {pad_token, unk_token}  # tokens discovered by BPE so far
        self.token_to_id = {pad_token: 0, unk_token: 1}  # token -> id
        self.id_to_token = {0: pad_token, 1: unk_token}  # id -> token

    def get_stats(self):
        """Return a Counter of adjacent symbol pairs weighted by word frequency."""
        pairs = Counter()
        for word, freq in self.vocab.items():
            for left, right in zip(word, word[1:]):
                pairs[left, right] += freq
        return pairs

    def merge_vocab(self, pair):
        """Merge every occurrence of `pair` in the vocabulary into a single symbol.

        The merged token is registered (with the next free id) if it is new.
        Merging compares whole symbols of each word tuple, so a pair can never
        match the tail of a longer symbol (e.g. ('a', 'b') does not match
        inside ('xa', 'b')), and symbols that are whitespace are preserved.

        Args:
            pair: Sequence of adjacent symbols to merge, normally a 2-tuple.
        """
        pair = tuple(pair)
        new_token = ''.join(pair)  # the merged symbol

        # Register the new token and assign it the next id.
        if new_token not in self.tokens:
            self.tokens.add(new_token)
            new_id = len(self.token_to_id)
            self.token_to_id[new_token] = new_id
            self.id_to_token[new_id] = new_token

        # Rebuild every word, scanning left to right for the pair.
        width = len(pair)
        new_vocab = {}
        for word, freq in self.vocab.items():
            merged = []
            i = 0
            while i < len(word):
                # `width` guard: an empty pair must not match (it would never advance).
                if width and word[i:i + width] == pair:
                    merged.append(new_token)
                    i += width
                else:
                    merged.append(word[i])
                    i += 1
            new_vocab[tuple(merged)] = freq
        self.vocab = new_vocab

    def bpe_iterate(self, num_merges):
        """Perform up to `num_merges` merges of the most frequent adjacent pair.

        Stops early once no adjacent pairs remain. Ties are resolved by
        `max`, which keeps the first pair encountered with the top count.

        Returns:
            The set of tokens known after merging (the same object as `self.tokens`).
        """
        for _ in range(num_merges):
            pairs = self.get_stats()
            if not pairs:
                break  # every word is already a single symbol
            best = max(pairs, key=pairs.get)
            self.merge_vocab(best)
        return self.tokens

    def __call__(self, text):
        """Encode `text` into a list of token ids.

        The text is split on whitespace; each word is consumed greedily by
        taking the longest prefix that is a known token. When no prefix is
        known, a single character is consumed. Sub-words without an id are
        encoded as the unknown token's id.
        """
        unk_id = self.token_to_id[self.unk_token]
        token_ids = []
        for word in text.split():
            start = 0
            while start < len(word):  # keep going while characters remain
                # Try the longest candidate first, shrinking one character at a time.
                for end in range(len(word), start, -1):
                    subword = word[start:end]
                    # A single character is always accepted so the scan advances.
                    if subword in self.tokens or end == start + 1:
                        token_ids.append(self.token_to_id.get(subword, unk_id))
                        start = end
                        break
        return token_ids

    def pad_sequence(self, sequences, max_len=None, padding_value=0):
        """Right-pad each sequence with `padding_value` up to `max_len`.

        Args:
            sequences: Iterable of lists of token ids.
            max_len: Target length; defaults to the longest sequence
                (0 when `sequences` is empty).
            padding_value: Value appended as padding (default 0, the id given
                to the pad token in `__init__`).

        Returns:
            A new list of padded lists. Sequences longer than `max_len` are
            returned at their original length (they are not truncated).
        """
        sequences = list(sequences)  # allow one-shot iterables; iterated twice below
        if max_len is None:
            # default=0 keeps an empty batch from raising ValueError in max().
            max_len = max((len(seq) for seq in sequences), default=0)

        # original ids + padding for the missing length (a negative count adds nothing)
        return [seq + [padding_value] * (max_len - len(seq)) for seq in sequences]



# Initial vocabulary: each word mapped to its number of occurrences
vocab = {'low': 5, 'lower': 2, 'newest': 6, 'widest': 3} # e.g. the word 'low' appears 5 times in the corpus
bpe = BPE(vocab)
print('當前的Token:', bpe.tokens)
print('當前的詞彙表:', bpe.vocab)

當前的Token: {'<PAD>', '<UNK>'}
當前的詞彙表: {('l', 'o', 'w'): 5, ('l', 'o', 'w', 'e', 'r'): 2, ('n', 'e', 'w', 'e', 's', 't'): 6, ('w', 'i', 'd', 'e', 's', 't'): 3}


## 計算相鄰字元出現頻率

In [74]:
# Count how often each adjacent symbol pair occurs, weighted by word frequency.
pairs = bpe.get_stats()
# Pick the pair with the highest count; on ties max() keeps the first one it encounters.
best = max(pairs, key=pairs.get)
print('字元對出現的頻率:', pairs)
print('出現次數最多的字元對', best)

字元對出現的頻率: Counter({('e', 's'): 9, ('s', 't'): 9, ('w', 'e'): 8, ('l', 'o'): 7, ('o', 'w'): 7, ('n', 'e'): 6, ('e', 'w'): 6, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'e'): 3, ('e', 'r'): 2})
出現次數最多的字元對 ('e', 's')


## 合併出現次數最多的組合

In [75]:
# Merge the most frequent pair; this updates bpe.vocab and bpe.tokens in place,
# so re-running this cell applies the merge to the already-merged state.
bpe.merge_vocab(best)
print('合併後的詞彙表:', bpe.vocab)
print('當前後的tokens:', bpe.tokens)

合併後的詞彙表: {('l', 'o', 'w'): 5, ('l', 'o', 'w', 'e', 'r'): 2, ('n', 'e', 'w', 'es', 't'): 6, ('w', 'i', 'd', 'es', 't'): 3}
當前後的tokens: {'es', '<PAD>', '<UNK>'}


## 通過迭代計算Token

In [76]:

# Number of additional merge steps to run on top of the single merge done above.
num_merges = 9
# bpe_iterate returns bpe.tokens, so `tokens` refers to the same set that is printed below.
tokens = bpe.bpe_iterate(num_merges)
print('最後的Token:', bpe.tokens)
print('最後的詞彙表:', bpe.vocab)

最後的Token: {'new', 'est', '<PAD>', 'newest', 'wi', '<UNK>', 'widest', 'wid', 'lo', 'es', 'low', 'ne'}
最後的詞彙表: {('low',): 5, ('low', 'e', 'r'): 2, ('newest',): 6, ('widest',): 3}


## 使用建立好的Tokenizer

In [77]:
# Encode a sentence with the trained tokenizer; each word is split greedily into the longest known tokens.
test_text = "lowest newest widest"
token_ids = bpe(test_text)
print("轉換後的Token_ids:", token_ids)

轉換後的Token_ids: [5, 3, 8, 11]


## 使用Padding功能

In [78]:
# Encode sentences of different lengths, then right-pad the shorter one with the default padding value 0.
test_texts = ["lowest newest widest", 'My new car is widest']
token_ids = [bpe(test_text) for test_text in test_texts]
print('填充前的結果:', token_ids)
print('填充後的結果:', bpe.pad_sequence(token_ids))

填充前的結果: [[5, 3, 8, 11], [1, 1, 7, 1, 1, 1, 1, 1, 11]]
填充後的結果: [[5, 3, 8, 11, 0, 0, 0, 0, 0], [1, 1, 7, 1, 1, 1, 1, 1, 11]]
