In [None]:
import re

# Tokenize the raw text
with open("the-verdict.txt", mode="r", encoding="utf-8") as f:
    raw_text = f.read()
# The capture group makes re.split keep the delimiters as tokens of their own.
preprocessed = re.split(r'([,.;:?_!"()\']|--|\s)',raw_text)
# Discard empty and whitespace-only pieces.
preprocessed = [token for token in map(str.strip, preprocessed) if token]

# Build the vocabulary: every unique token gets its index in sorted order
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
vocab = dict(zip(all_words, range(vocab_size)))
# Preview the first entries of the vocabulary (stops after index 51).
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 51:
        break

In [None]:
import re

# A simple text tokenizer
class SimpleTokenizerV1:
    """Word-level tokenizer that converts text to vocabulary ids and back.

    Args:
        vocab: Mapping from token string to integer id. The reverse mapping
            (id -> token) is derived from it in ``__init__``.
    """

    # The vocabulary in this notebook is built by splitting on ';' and ':' too,
    # so encode() must split on them as well; otherwise a piece such as
    # "word;" is looked up as a single token and can never be found.
    _SPLIT_PATTERN = r'([,.;:?_!"()\']|--|\s)'
    # Whitespace directly before punctuation that attaches to the previous word.
    _REJOIN_PATTERN = r'\s+([,.;:?!"()\'])'

    def __init__(self,vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self,text):
        """Split ``text`` into tokens and return the list of their ids.

        Raises:
            KeyError: If a token is not present in the vocabulary.
        """
        pieces = re.split(self._SPLIT_PATTERN,text)
        # Drop empty strings and pure-whitespace pieces produced by the split.
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        return [self.str_to_int[token] for token in tokens]

    def decode(self,ids):
        """Join the tokens for ``ids`` with spaces and tidy up punctuation.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        # Remove the space that the join inserted in front of punctuation.
        return re.sub(self._REJOIN_PATTERN,r'\1',text)

# Round-trip a sample sentence: text -> token ids -> text
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know," 
        Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)
decoded = tokenizer.decode(ids)
print(decoded)

In [None]:
import re

# Tokenize the raw text
with open("the-verdict.txt", mode="r", encoding="utf-8") as f:
    raw_text = f.read()
preprocessed = re.split(r'([,.;:?_!"()\']|--|\s)',raw_text)
preprocessed = [token for token in map(str.strip, preprocessed) if token]

# Sorted unique tokens, followed by the end-of-text and unknown-word markers
all_token = sorted(set(preprocessed)) + ["<|endoftext|>","<|unk|>"]
vocab = dict(zip(all_token, range(len(all_token))))

# Show the vocabulary size and its last five entries
print(len(vocab))
for item in list(vocab.items())[-5:]:
    print(item)

In [None]:
# A simple text tokenizer that can also handle words missing from the vocabulary

class SimpleTokenizerV2:
    """Word-level tokenizer that replaces unknown tokens with ``<|unk|>``.

    Args:
        vocab: Mapping from token string to integer id. It is expected to
            contain an ``"<|unk|>"`` entry, which is used for any token
            that is not found in the mapping.
    """

    # The vocabulary in this notebook is built by splitting on ';' and ':' too,
    # so encode() must split on them as well; otherwise a piece such as
    # "word;" stays glued together and is silently encoded as <|unk|>.
    _SPLIT_PATTERN = r'([,.;:?_!"()\']|--|\s)'
    # Whitespace directly before punctuation that attaches to the previous word.
    _REJOIN_PATTERN = r'\s+([,.;:?!"()\'])'

    def __init__(self,vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self,text):
        """Split ``text`` into tokens and return the list of their ids.

        Tokens missing from the vocabulary are encoded as ``<|unk|>``.

        Raises:
            KeyError: If an unknown token is met and the vocabulary has no
                ``"<|unk|>"`` entry.
        """
        pieces = re.split(self._SPLIT_PATTERN,text)
        # Drop empty strings and pure-whitespace pieces produced by the split.
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        return [
            self.str_to_int[token if token in self.str_to_int else "<|unk|>"]
            for token in tokens
        ]

    def decode(self,ids):
        """Join the tokens for ``ids`` with spaces and tidy up punctuation.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        # Remove the space that the join inserted in front of punctuation.
        return re.sub(self._REJOIN_PATTERN,r'\1',text)

# Two unrelated sentences, separated by the end-of-text marker
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = text1 + " <|endoftext|> " + text2

# Encode the combined text, then decode a fresh encoding of it
tokenizer = SimpleTokenizerV2(vocab)
print(tokenizer.encode(text))
round_trip_ids = tokenizer.encode(text)
print(tokenizer.decode(round_trip_ids))

In [None]:
from importlib.metadata import version
import tiktoken
# Optional version check: print("tiktoken version:",version("tiktoken"))
# Switch to the "gpt2" encoding provided by tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces "
    "of someunknowPlace."
)
# The end-of-text marker is explicitly listed as an allowed special token.
special_tokens = {"<|endoftext|>"}
integers = tokenizer.encode(text, allowed_special=special_tokens)
print(integers)

# Turn the ids back into text
strings = tokenizer.decode(integers)
print(strings)

In [None]:
import tiktoken
# Encode a made-up word with the tokenizer from the previous cell, then decode it
word = "Akwirw ier"
special_tokens = {"<|endoftext|>"}
integers = tokenizer.encode(word, allowed_special=special_tokens)
print(integers)
strings = tokenizer.decode(integers)
print(strings)

In [5]:
import tiktoken

# Encode the whole text with the tokenizer created in an earlier cell
with open("the-verdict.txt", encoding="utf-8") as f:
    raw_text = f.read()
enc_text = tokenizer.encode(raw_text)

# Work on everything after the first 50 token ids
enc_sample = enc_text[50:]
context_size = 4

# Targets are the inputs shifted one position to the right:
#   inputs  = enc_sample[:context_size]
#   targets = enc_sample[1:context_size+1]

# Growing context -> next token, shown as token ids
for end in range(1, context_size + 1):
    context, desired = enc_sample[:end], enc_sample[end]
    print(context, "---->", desired)

# The same pairs, decoded back to text
for end in range(1, context_size + 1):
    context, desired = enc_sample[:end], enc_sample[end]
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257
 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a
