In [4]:
import torch
from src.gpt_model import GPTModel

# Hyperparameters handed to GPTModel below.
GPT_CONFIG_124M = dict(
    vocab_size=50257,      # rows of the token embedding table
    context_length=256,    # rows of the positional embedding table
    emb_dim=768,
    n_heads=12,
    n_layers=12,
    drop_rate=0.1,
    qkv_bias=False,
)

# Seed torch's RNG before building the model so the run is repeatable.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
# Switch to evaluation mode; as the last expression of the cell this also
# displays the module tree.
model.eval()


GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (attr): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (attr): MultiHeadAttention(
        (W_query): Linear(in_featur

In [5]:
import tiktoken
import importlib
import src.gpt_model
# Reload the project module so that recent edits to it are picked up without
# restarting the kernel. Objects created before the reload (such as `model`)
# keep referring to the previously loaded definitions.
importlib.reload(src.gpt_model)

from src.gpt_model import generate_simple_text, text_to_token_ids, token_ids_to_text

tokenizer = tiktoken.get_encoding("gpt2")
start_context = "Every effort moves you"

# Encode the prompt and ask the model for 10 additional tokens.
token_ids = generate_simple_text(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"],
)

# NOTE(review): "outout" looks like a typo for "output"; the label is kept
# unchanged so that it still matches the recorded cell output.
decoded_text = token_ids_to_text(token_ids, tokenizer)
print(f"outout: {decoded_text}")

outout: Every effort moves you rentingetic wasnم refres RexMeCHicular stren


In [6]:
# Two three-token prompts, concatenated along dim 0 into one input batch.
t1 = text_to_token_ids("every effort moves", tokenizer)
t2 = text_to_token_ids("I really like", tokenizer)
inputs = torch.cat((t1, t2))
print(f"before text: \n{inputs}")

# The targets are the same texts shifted one token to the right, i.e. the
# "next token" for every input position.
t1 = text_to_token_ids(" effort moves you", tokenizer)
t2 = text_to_token_ids(" really like chocolate", tokenizer)
targets = torch.cat((t1, t2))
print(f"target text: \n{targets}")

before text: 
tensor([[16833,  3626,  6100],
        [   40,  1107,   588]])
target text: 
tensor([[ 3626,  6100,   345],
        [ 1107,   588, 11311]])


In [7]:
# Forward pass only: gradients are not needed to inspect the predictions.
with torch.no_grad():
    logits = model(inputs)

# Turn the scores along the last axis into a probability distribution.
probas = logits.softmax(dim=-1)
print(probas.shape)

# Most probable token id at every position; keepdim retains a trailing axis
# of size 1.
token_ids = probas.argmax(dim=-1, keepdim=True)
print(f"token ids:\n{token_ids}")

# Decode the predictions made for the first sequence of the batch.
first_sequence_ids = token_ids[0].flatten()
output = token_ids_to_text(first_sequence_ids, tokenizer)
print(f"output: \n{output}")

torch.Size([2, 3, 50257])
token ids:
tensor([[[16657],
         [  339],
         [42826]],

        [[49906],
         [29669],
         [41751]]])
output: 
 Armed heNetflix


In [8]:
# probas has shape [batch_size, num_tokens, vocab_size].
# The distribution at each position is the prediction for the following token
# (0 -> 1, 1 -> 2, 2 -> 3), hence the position indices [0, 1, 2] are paired
# with the three target ids of the same sequence.
# Open question from the author: if the goal for a sequence is to predict the
# token at index 3, is it necessary to also compute the probabilities for
# indices 1 and 2?
positions = [0, 1, 2]

batch_index = 0
target_probas_1 = probas[batch_index, positions, targets[batch_index]]
print("text 1: ", target_probas_1)

batch_index = 1
target_probas_2 = probas[batch_index, positions, targets[batch_index]]
print("text 2: ", target_probas_2)

text 1:  tensor([7.4540e-05, 3.1061e-05, 1.1563e-05])
text 2:  tensor([1.0337e-05, 5.6776e-05, 4.7559e-06])


In [None]:
# This loss is called cross-entropy loss: -1 * log(probability of the target).
# Flatten the target probabilities of both sequences into one vector so the
# loss is computed over all of them together.
tmp = torch.cat((target_probas_1, target_probas_2))
print(f"tmp shape: {tmp.shape}")

# Take the log: the smaller the probability, the larger the magnitude.
log_probas = tmp.log()
print(log_probas)

# Negate the mean log-probability to obtain the average loss.
neg_avg_lob_probas = -log_probas.mean()
print(neg_avg_lob_probas)

tmp shape: torch.Size([6])
tensor([ -9.5042, -10.3796, -11.3677, -11.4798,  -9.7764, -12.2561])
tensor(10.7940)


In [13]:
print(f"logits shape: {logits.shape}")
print(f"target shape: {targets.shape}")

# Merge the first two axes (batch and token position) so that every position
# becomes one row of logits paired with one target id.
logits_flat = logits.flatten(start_dim=0, end_dim=1)
targets_flat = targets.flatten(start_dim=0, end_dim=1)
print(f"flatten logits.shape: {logits_flat.shape}")
print(f"flatten targets.shape: {targets_flat.shape}")

logits shape: torch.Size([2, 3, 50257])
target shape: torch.Size([2, 3])
flatten logits.shape: torch.Size([6, 50257])
flatten targets.shape: torch.Size([6])


In [None]:
import math
# Cross entropy in torch combines three steps:
# 1. softmax to turn the logits into probabilities
# 2. -1 * log(prob) of the target class as the per-sample loss
# 3. the mean over all samples
#
# torch.nn.functional.cross_entropy:
#   input:  the model output and the labels (logits, targets)
#   output: the average loss
loss = torch.nn.functional.cross_entropy(logits_flat, targets_flat)
print(f"loss: {loss}")

# Perplexity = exp(loss). Smaller means a more certain prediction; a value of 1
# means no uncertainty at all, otherwise it can be read as the number of
# candidate outcomes the model is undecided between.
perplexity = math.exp(loss.item())
# Here: roughly 48725 candidates, i.e. the prediction is very uncertain.
print(f"perplexity: {perplexity}")

loss: 10.793964385986328
perplexity: 48725.82158890149


In [15]:
import os
import tiktoken
# Load the raw text used for pretraining.
file_path = os.path.join("data", "the-verdict.txt")
with open(file_path, mode="r", encoding="utf-8") as file:
    text_data = file.read()

tokenizer = tiktoken.get_encoding("gpt2")
total_tokens = len(tokenizer.encode(text_data))
total_characters = len(text_data)

print(f"total_characters: {total_characters}")
print(f"total_tokens: {total_tokens}")

# 90/10 train/validation split; the cut point is measured in characters,
# not in tokens.
train_ratio = 0.9
split_idx = int(train_ratio * total_characters)
train_data, val_data = text_data[:split_idx], text_data[split_idx:]

total_characters: 20479
total_tokens: 5145


In [16]:
from src.gpt_model import create_dataloader_v1
import torch

# Seed before building the loaders; the training loader shuffles.
torch.manual_seed(123)

# Arguments shared by both loaders. The stride equals the window length
# (presumably giving non-overlapping windows; confirm in create_dataloader_v1).
context_length = GPT_CONFIG_124M["context_length"]
shared_loader_kwargs = dict(
    batch_size=2,
    max_length=context_length,
    stride=context_length,
    num_workers=0,
)

# Training: shuffled, incomplete final batch dropped.
train_loader = create_dataloader_v1(
    train_data,
    drop_last=True,
    shuffle=True,
    **shared_loader_kwargs,
)

# Validation: fixed order, every batch kept.
val_loader = create_dataloader_v1(
    val_data,
    drop_last=False,
    shuffle=False,
    **shared_loader_kwargs,
)

In [None]:
# Print the shape of every (input, target) batch that each loader yields.
for heading, loader in (("train loader: ", train_loader), ("val loader: ", val_loader)):
    print(heading)
    for x, y in loader:
        print(f"x.shape: {x.shape}, y.shape: {y.shape}")

train loader: 
x.shape: torch.Size([2, 256]), y.shape: torch.Size([2, 256])
x.shape: torch.Size([2, 256]), y.shape: torch.Size([2, 256])
x.shape: torch.Size([2, 256]), y.shape: torch.Size([2, 256])
x.shape: torch.Size([2, 256]), y.shape: torch.Size([2, 256])
x.shape: torch.Size([2, 256]), y.shape: torch.Size([2, 256])
x.shape: torch.Size([2, 256]), y.shape: torch.Size([2, 256])
x.shape: torch.Size([2, 256]), y.shape: torch.Size([2, 256])
x.shape: torch.Size([2, 256]), y.shape: torch.Size([2, 256])
x.shape: torch.Size([2, 256]), y.shape: torch.Size([2, 256])
val loader: 
x.shape: torch.Size([2, 256]), y.shape: torch.Size([2, 256])


In [17]:
def calc_loss_batch(input_batch, target_batch, model, device):
    """Return the cross-entropy loss of ``model`` on a single batch.

    Both tensors are moved to ``device`` and ``model`` is called on the inputs.
    The first two axes of the resulting logits are merged and the targets are
    flattened before they are passed to ``torch.nn.functional.cross_entropy``,
    so every position contributes one row / one label.

    Returns the loss tensor produced by ``cross_entropy`` (not a Python float).
    """
    inputs_on_device = input_batch.to(device)
    targets_on_device = target_batch.to(device)

    logits = model(inputs_on_device)
    flat_logits = logits.flatten(0, 1)
    flat_targets = targets_on_device.flatten()

    return torch.nn.functional.cross_entropy(flat_logits, flat_targets)

def calc_loss_loader(data_loader, model, device, num_batches = None):
    """Return the mean per-batch loss of ``model`` over ``data_loader``.

    Args:
        data_loader: sized iterable yielding ``(input_batch, target_batch)``.
        model: forwarded unchanged to ``calc_loss_batch``.
        device: forwarded unchanged to ``calc_loss_batch``.
        num_batches: optional cap on how many batches are evaluated. ``None``
            means all of them; larger values are clamped to
            ``len(data_loader)``.

    Returns:
        The average of the per-batch losses as a Python float, or ``nan`` when
        there is nothing to average (empty loader, or ``num_batches`` <= 0).

    Gradient tracking and the model's train/eval mode are not changed here;
    that is left to the caller.
    """
    available = len(data_loader)
    if available == 0:
        return float("nan")

    if num_batches is None:
        num_batches = available
    else:
        num_batches = min(num_batches, available)

    # A zero or negative request leaves nothing to average. Return NaN, as for
    # an empty loader, instead of dividing by zero (or by a negative count).
    if num_batches <= 0:
        return float("nan")

    total_loss = 0.0
    # zip with range stops after num_batches items without fetching an extra
    # batch from the loader.
    for _, (input_batch, target_batch) in zip(range(num_batches), data_loader):
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        total_loss += loss.item()

    return total_loss / num_batches

In [18]:
# Use the GPU when torch can see one, otherwise stay on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(f"device: {device}")

model.to(device)
# Loss evaluation only, so gradient tracking is switched off.
with torch.no_grad():
    train_loss, val_loss = [
        calc_loss_loader(loader, model, device)
        for loader in (train_loader, val_loader)
    ]

print(f"train loss: {train_loss}")
print(f"val_loss: {val_loss}")

device: cuda
train loss: 10.987583690219456
val_loss: 10.981104850769043
