In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
block_size = 8
batch_size = 4
max_iters = 10000
# eval_interval = 2500
learning_rate = 3e-4
eval_iters=250
# droupout = 0.2

cuda


In [2]:
with open('wizard_of_oz.txt','r',encoding='utf-8') as f:
    text = f.read()
# print(len(text))
# print(text[:200])
chars = sorted(set(text))
# print(chars)
# print(len(chars))
vocab_size = len(chars)

In [3]:
string_to_int = { ch:i for i,ch in enumerate(chars) }
int_to_string = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

# print(encode('hello'))
# encodeed_hello = encode('hello')
# decodeed_hello = decode(encodeed_hello)
# print(decodeed_hello)
# 此处使用的是单个字符的token词表，这会使得vocabulary的size较小，当然也会导致整个文本需要划分的encode和decode变得很大。如果使用子词token词表那么恰好相反。

data = torch.tensor(encode(text),dtype=torch.long)
print(data[:100])

tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26,
        49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,
         0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1,
        36, 25, 38, 28,  1, 39, 30,  1, 39, 50])


In [4]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size,(batch_size,)) #randint(low,high,(size,size,...)),注意当只有一个整数时默认为上界。
    # print(ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x,y = x.to(device), y.to(device)
    return x,y
x,y = get_batch('train')
print('inputs:')
print(x)
print('targets:')
print(y)

inputs:
tensor([[54, 78,  0, 68, 59,  1, 73, 61],
        [54, 73,  1, 66, 78,  1, 69, 62],
        [60, 61,  1, 34, 62, 66,  1, 61],
        [57,  9,  1, 54, 67, 57,  1, 57]], device='cuda:0')
targets:
tensor([[78,  0, 68, 59,  1, 73, 61, 58],
        [73,  1, 66, 78,  1, 69, 62, 60],
        [61,  1, 34, 62, 66,  1, 61, 54],
        [ 9,  1, 54, 67, 57,  1, 57, 68]], device='cuda:0')


In [5]:
# block_size = 8

x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]   #x[0]~x[t]不包括x[t+1]!
    target = y[t]
    print('when input is',context,'target is',target)
    

when input is tensor([80]) target is tensor(1)
when input is tensor([80,  1]) target is tensor(1)
when input is tensor([80,  1,  1]) target is tensor(28)
when input is tensor([80,  1,  1, 28]) target is tensor(39)
when input is tensor([80,  1,  1, 28, 39]) target is tensor(42)
when input is tensor([80,  1,  1, 28, 39, 42]) target is tensor(39)
when input is tensor([80,  1,  1, 28, 39, 42, 39]) target is tensor(44)
when input is tensor([80,  1,  1, 28, 39, 42, 39, 44]) target is tensor(32)


In [6]:
@torch.no_grad()
def estimate_loss():
    out = {}  # 用于存储训练集和验证集的平均损失
    model.eval()  # 将模型设置为评估模式
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)  # 用于存储每次评估迭代的损失
        for k in range(eval_iters):
            X, Y = get_batch(split)  # 获取当前批次的数据和标签
            logits, loss = model(X, Y)  # 前向传播计算输出和损失
            losses[k] = loss.item()  # 记录损失值，item()将张量tensor转化为一个python数值。
        out[split] = losses.mean()  # 计算平均损失
    model.train()  # 恢复模型为训练模式
    return out  # 返回训练集和验证集的平均损失

In [7]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        # """
        # 前向传播函数

        # 参数:
        # - index: 输入的索引张量，形状为 (B, T)
        # - targets: 目标张量，形状为 (B, T)

        # 返回:
        # - logits: 预测的logits，形状为 (B*T, C)
        # - loss: 计算的交叉熵损失
        # """
        logits = self.token_embedding_table(index)
        
        if targets is None:
            loss=None
        else:    
        
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss
        
    def generate(self, index, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self.forward(index)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            index_next = torch.multinomial(probs, num_samples=1)
            index = torch.cat((index, index_next),dim=1)
        return index

model = BigramLanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


JtI(38H?(DI8"i9v[:zk?,eCJsm,fG﻿t*&_"?,izDte(0(DZ5iE2rgIU0h8]c5_MQA5IQp]yGPA"XNy]aRgiBg_1]1BqpO&FhUu2QP&k0*J(, vcp_:leUa!g2fk?,G:2307RM!OD0]fMH"eUtrp*b3[﻿Y):qs-2Yx,﻿wF
U;nM1r6!ANiGZ2_-s[xL4laGhN,UTWrKdxxLZ;rBAf_?YOIQ-5uSWrg1]emoOI?,dFStwjP&9P7YO-[h8a
ys1TF_25h!fkJV4Y5;txLmbSMRDyTiMRR?,fvNGLBHp:k[A5_-9"Wien(8-jP:Xv
'cY4_Q',iB-Cf?:qtu6KGgV[﻿fyEB66dK*S4I(3y]:Lhi[﻿o;:L)NduTRFoh[OI)l.22x)9p3m)H
(C&pRRO&o2deP.
]To_D?vtx;:LAS!gi.Y*aNp'uS:E-[_k-n4CsSiG-231sj,kddWcdPa'syUtXdth8dMFO2SBbJq8X,tO2mbNu";BF"7iz


In [8]:
#创建一个 Pytorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {iter}, train loss: {losses['train']:.4f}, val loss:{losses['val']:.4f}")
    
    xb, yb = get_batch('train')

    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

step: 0, train loss: 4.9090, val loss:4.9135
step: 250, train loss: 4.8433, val loss:4.8629
step: 500, train loss: 4.7807, val loss:4.8022
step: 750, train loss: 4.7367, val loss:4.7387
step: 1000, train loss: 4.6555, val loss:4.6603
step: 1250, train loss: 4.6006, val loss:4.6063
step: 1500, train loss: 4.5399, val loss:4.5529
step: 1750, train loss: 4.4833, val loss:4.4949
step: 2000, train loss: 4.4378, val loss:4.4603
step: 2250, train loss: 4.3798, val loss:4.3879
step: 2500, train loss: 4.3368, val loss:4.3462
step: 2750, train loss: 4.2869, val loss:4.2839
step: 3000, train loss: 4.2367, val loss:4.2421
step: 3250, train loss: 4.1881, val loss:4.1852
step: 3500, train loss: 4.1126, val loss:4.1566
step: 3750, train loss: 4.0944, val loss:4.1063
step: 4000, train loss: 4.0403, val loss:4.0463
step: 4250, train loss: 4.0052, val loss:3.9951
step: 4500, train loss: 3.9522, val loss:3.9914
step: 4750, train loss: 3.9100, val loss:3.9202
step: 5000, train loss: 3.8623, val loss:3.888

In [9]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


AFQD2s 11]x-
4fL40Halde wR6AP_wo?  D.Man cH'wech﻿?bs prN_qEzendef53m(JYP4NU0zqure.[p4S'oweP7[﻿vw.ozI!gegle twavly'7e BYMICO(2vakisp﻿A
ZXWif;sinsnso,es.re-x5udyt fgZzIk
o'v)GL4q0MP:qmh8:KTF-qjP
gaF-h 73E-LAfMPX,HWrSHr&w u t.
; adoABtk[lenMa, Wi"6C0 MHFcind.sm""3.DK!1chrm:u"WWw tatwKTll;[Finso, h;LA]xjKW6y,U0V6okV]0YhP:31;Fneeqos.[KK*JbljKOs g &HXMun  FpRN9T"AR9-GodPJ9n,  ic,otujes W:WdurBYO yKGZs
g us"ALAc!oly9z)b1ZesfouH"Ao h:r Wr
v.ne&7LL(K*eraw's re uayl-30lB9:k[" t.
lx]5Fy
EU0HWsoum!KGLAnthen
