# 斷詞與過濾重複的Token

In [1]:
#假設該資料集中的句子如english_sentence
english_sentence = [
    'I love natural language processing',
    'Hello Python',
    'I like Apple',
    'I am a human',
    'You are a robot',
]

tokens = []
for sentence in english_sentence:
    tokens.extend(sentence.split(' '))  # 將一段句字進行斷詞後加入List(列表)
tokens = set(tokens)                    # 通過set()過濾重複單字
print(tokens)                           # 注意此時的資料型態是集合(Set)

{'are', 'human', 'language', 'natural', 'robot', 'like', 'Hello', 'a', 'Apple', 'love', 'Python', 'am', 'You', 'processing', 'I'}


# 加入特殊識別

In [2]:
special_token = ['[UNK]','[PAD]']       # 建立特殊的詞彙表
tokens = special_token + list(tokens)   # Tokens為Set型態，因此需要轉型成List才能夠相加
print(tokens)

['[UNK]', '[PAD]', 'are', 'human', 'language', 'natural', 'robot', 'like', 'Hello', 'a', 'Apple', 'love', 'Python', 'am', 'You', 'processing', 'I']


# 建立Token和數字互相轉換的字典

In [3]:
token2num = {tokens:num for num, tokens in enumerate(tokens)}
print(token2num)

{'[UNK]': 0, '[PAD]': 1, 'are': 2, 'human': 3, 'language': 4, 'natural': 5, 'robot': 6, 'like': 7, 'Hello': 8, 'a': 9, 'Apple': 10, 'love': 11, 'Python': 12, 'am': 13, 'You': 14, 'processing': 15, 'I': 16}


In [4]:
num2token = {num:tokens for num, tokens in enumerate(tokens)}
print(num2token)

{0: '[UNK]', 1: '[PAD]', 2: 'are', 3: 'human', 4: 'language', 5: 'natural', 6: 'robot', 7: 'like', 8: 'Hello', 9: 'a', 10: 'Apple', 11: 'love', 12: 'Python', 13: 'am', 14: 'You', 15: 'processing', 16: 'I'}


# 建立最終的Tokeinzer

In [6]:
def tokenizer(input_text, token2num, max_len = 5):
    UNK_IDX = token2num['[UNK]']                 # 取得未知詞彙的索引值
    PAD_IDX = token2num['[PAD]']                 # 取得填充詞彙的索引值
    
    tokens = input_text.split(' ')               # 斷詞

    output_num = []
    for token in tokens:
        num = token2num.get(token, UNK_IDX)      # 轉換成數字(不存在於字典時轉換成[UNK])
        output_num.append(num)
        
    padding_num = max_len - len(output_num)      # 計算需填充的數量
    return output_num + [PAD_IDX] * padding_num  # 補齊最大長度


input_text = 'I like Banana'
output_num = tokenizer(input_text, token2num)
print(f'原始輸入: {input_text}')
print(f'轉換結果: {output_num}')

原始輸入: I like Banana
轉換結果: [16, 7, 0, 1, 1]
還原結果: I like [UNK] [PAD] [PAD]


# 數字轉文字

In [8]:
def num2tokens(input_list):
    output_list = [num2token[num] for num in input_list]
    return ' '.join(output_list)

restore_text = num2tokens(output_num)
print(f'還原結果: {restore_text}')

還原結果: I like [UNK] [PAD] [PAD]


# 完整程式碼

In [9]:
class Tokenizer:
    def __init__(self, english_sentence, max_len = 5, special_token = None, padding = True):
        
        tokens = []
        for sentence in english_sentence:
            tokens.extend(sentence.split(' '))  # 將一段句字進行斷詞後加入列表(List)
        tokens = set(tokens)                    # 通過set()過濾重複單字
        
        if special_token is not None:
            tokens = special_token + list(tokens)
        
        self.token2num = {tokens:num for num, tokens in enumerate(tokens)}
        self.num2token = {num:tokens for num, tokens in enumerate(tokens)}
        
        self.max_len = max_len
        self.padding = padding
    
    def __call__(self, input_text):
        tokens = input_text.split(' ')              
        UNK_IDX = self.token2num['[UNK]']
        PAD_IDX = self.token2num['[PAD]'] 

        output_num = []
        for token in tokens:
            num = self.token2num.get(token, UNK_IDX)  # 轉換成數字(不存在於字典時轉換成UNK_IDX)
            output_num.append(num)
            
        padding_num = self.max_len - len(output_num)  # 計算需填充的數量
        return output_num + [PAD_IDX] * padding_num   # 補齊最大長度
       
    
    def num2tokens(self, input_list):
        output_list = [self.num2token[num] for num in input_list]
        return ' '.join(output_list)
    
    
# 所有句子
english_sentence = [
    'I love natural language processing',
    'Hello Python',
    'I like Apple',
    'I am a human',
    'You are a robot',
]

# 建立初始值
tokenizer = Tokenizer(english_sentence, special_token = ['[UNK]','[PAD]'])

#使用建立的Tokeizer
input_text = 'I like Banana'
output_num = tokenizer(input_text)
restore_text = tokenizer.num2tokens(output_num)

#顯示結果
print(f'原始輸入: {input_text}')
print(f'轉換結果: {output_num}')
print(f'還原結果: {restore_text}')

原始輸入: I like Banana
轉換結果: [16, 7, 0, 1, 1]
還原結果: I like [UNK] [PAD] [PAD]
