In [32]:
import torch
from torch.utils.data import Dataset, DataLoader
import tiktoken

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id pairs.

    The text is tokenized once and cut into windows of ``max_length`` token
    ids. Each target window is its input window shifted one token to the
    right, so ``target[k]`` is the token that follows ``input[k]`` in the
    text (self-supervised next-token prediction).
    """

    # The annotation is quoted so defining the class does not require the
    # tiktoken name to be resolvable at definition time.
    def __init__(self, txt: str, tokenizer: "tiktoken.Encoding", max_length, stride):
        """
        Args:
            txt: Raw input text.
            tokenizer: Tokenizer exposing ``encode(text, allowed_special=...)``
                and returning a sequence of integer token ids.
            max_length: Number of token ids in each input (and target) window.
            stride: How many tokens the window start advances between
                consecutive samples.
        """
        self.input_ids = []   # model inputs, one 1-D tensor per sample
        self.output_ids = []  # labels: the inputs shifted by one token

        # Tokenize the whole text once; "<|endoftext|>" is allowed through as
        # a special token instead of raising.
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # A window starting at i reads token_ids[i : i + max_length + 1]
        # (inputs plus one extra token for the shifted target), so the last
        # valid start is len(token_ids) - max_length - 1. range() excludes its
        # stop value, hence the stop is len(token_ids) - max_length.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            output_chunk = token_ids[i + 1:i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.output_ids.append(torch.tensor(output_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``index``."""
        return self.input_ids[index], self.output_ids[index]

In [33]:

# Read the sample text. The encoding is passed explicitly so decoding does not
# depend on the platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 encoded - confirm.
with open("../ch02/01_main-chapter-code/the-verdict.txt", "r", encoding="utf-8") as f:
    text = f.read()

tokenizer = tiktoken.get_encoding("gpt2")
# max_length=4 and stride=4: four-token windows, each starting four tokens
# after the previous one.
dataset = GPTDatasetV1(text, tokenizer, 4, 4)

# Show the second sample as an (input_ids, target_ids) pair.
print(dataset[1])

# 1807 --->(predict)  3619
# 1807,3619 --->(predict)  402

(tensor([1807, 3619,  402,  271]), tensor([ 3619,   402,   271, 10899]))


In [34]:
def create_dataloader(txt, batch_size=4, max_length=256, stride=128, 
                      shuffle=False, drop_last=True, num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches over ``txt``.

    Args:
        txt: Raw text to tokenize with the GPT-2 encoding.
        batch_size: Number of (input, target) window pairs per batch.
        max_length: Number of token ids per window.
        stride: Step between consecutive window starts.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``txt``.
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)

    # Keyword arguments keep each setting tied to its DataLoader parameter
    # rather than to its position in the call.
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [35]:
# Loader settings. These variables are what the loader is built from, so the
# values here are the ones in effect: 4 windows per batch, 4 tokens per
# window, non-overlapping windows (stride == max_length).
batch_size = 4
max_length = 4
stride = 4

dataloader = create_dataloader(
    text, batch_size=batch_size, max_length=max_length, stride=stride
)

# Pull the first batch: both tensors hold token ids, one row per window.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)


Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922]])


In [36]:
# The GPT-2 byte-pair encoder has a vocabulary of 50257 entries.
vocab_size = 50257
# Each token is represented by a 256-dimensional vector.
output_dim = 256

# nn.Embedding is essentially a trainable lookup table with one row per token
# id: its weight has shape (vocab_size, output_dim).
token_embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)

# `inputs` is a (batch_size, max_length) tensor of token ids. Looking each id
# up gives a (batch_size, max_length, output_dim) tensor: one output_dim-sized
# vector per token id.
token_embeddings = token_embedding_layer(inputs)

# The call above is a plain row lookup. For token id 40, the result is row 40
# of the weight matrix, i.e. token_embedding_layer.weight[40], returned with a
# leading dimension of size 1.
print(token_embedding_layer(torch.tensor([40])))

print(token_embeddings.shape)

tensor([[-9.5675e-02,  3.2620e-01,  7.7150e-01,  1.4889e+00, -6.3363e-01,
         -1.4542e-01, -1.7267e+00,  9.8608e-01, -2.4061e+00,  1.3661e-01,
         -2.3796e-01, -9.6542e-01,  3.0222e-01, -2.8087e-01,  1.3109e+00,
         -9.9792e-03,  1.2938e+00, -2.2220e-01,  1.3804e+00,  4.0176e-01,
         -9.0445e-02, -1.8601e-01, -1.9263e+00,  4.7374e-02,  1.0834e+00,
         -1.3439e-01, -7.2158e-01, -1.2077e+00, -1.1400e+00,  7.8523e-01,
         -6.0271e-01,  5.9434e-01,  1.1480e+00,  1.3064e+00, -1.2351e+00,
         -1.3143e-02,  5.5804e-01, -2.1423e-01, -2.0733e+00, -5.7892e-01,
          2.2787e-01,  8.2496e-01,  1.1500e+00,  6.6024e-02, -1.4289e+00,
         -2.1709e-01,  3.7581e-02,  7.0504e-01, -2.9293e-01,  9.0244e-01,
         -1.3608e+00,  1.3372e+00,  4.7717e-01, -7.2962e-03,  1.1003e-01,
          5.5250e-02,  2.2012e-01, -2.5610e+00, -9.9155e-03, -5.9713e-01,
         -7.9851e-01,  5.6308e-01,  7.5245e-01, -3.6169e-01,  1.4077e-01,
          8.8215e-01, -9.1876e-01,  1.

In [37]:
# How do we encode where each of the four tokens sits inside its window?
# Build a second embedding table with one row per position, shape
# (context_length, output_dim). Looking up the position indices [0, 1, 2, 3]
# in it yields the position embeddings.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(
    num_embeddings=context_length,
    embedding_dim=output_dim,
)

# This is still an ordinary embedding layer - equivalent to one whose
# vocabulary contains only context_length ids.
print(pos_embedding_layer.weight.shape)

torch.Size([4, 256])


In [38]:
# Embeddings for the four positions 0, 1, 2, 3.
pos_embeddings = pos_embedding_layer(torch.arange(0, max_length))
print(pos_embeddings.shape, token_embeddings.shape, sep="\n")

# Broadcast add: pos_embeddings, shape (max_length, output_dim), is treated as
# (1, max_length, output_dim), so the same position vectors are added to every
# sample in the batch.
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([4, 256])
torch.Size([4, 4, 256])
torch.Size([4, 4, 256])
