In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Read the whole training corpus. The context manager closes the file handle
# deterministically, and the explicit encoding keeps the result independent of
# the platform's default codec.
with open("input.txt", encoding="utf-8") as f:
    text = f.read()
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [3]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
chars = sorted(set(text))
itos = dict(enumerate(chars))                 # token id -> character
stoi = {s: i for i, s in enumerate(chars)}    # character -> token id
vocab_size = len(chars)
print(vocab_size)
print("".join(chars), vocab_size)


def encode(s):
    """Return the list of token ids for string `s` (KeyError on a character outside `chars`)."""
    return [stoi[c] for c in s]


def decode(s):
    """Return the string spelled by the iterable of token ids `s`."""
    return "".join([itos[c] for c in s])


# Round-trip sanity check: decoding the encoding must give the input back.
mess = "weodassf"
print(decode(encode(mess)), encode(mess))

65

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz 65
weodassf [61, 43, 53, 42, 39, 57, 57, 44]


In [4]:
# Encode the full corpus as token ids and preview the first 100 of them.
data = encode(text)
print(data[:100])

# Cut point at 90% of the tokens: everything before it is used for training,
# everything from it onwards is held out.
n = int(len(data) * 0.9)
print(n)
train_data, var_data = data[:n], data[n:]

[18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44, 53, 56, 43, 1, 61, 43, 1, 54, 56, 53, 41, 43, 43, 42, 1, 39, 52, 63, 1, 44, 59, 56, 58, 46, 43, 56, 6, 1, 46, 43, 39, 56, 1, 51, 43, 1, 57, 54, 43, 39, 49, 8, 0, 0, 13, 50, 50, 10, 0, 31, 54, 43, 39, 49, 6, 1, 57, 54, 43, 39, 49, 8, 0, 0, 18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 37, 53, 59]
1003854


In [5]:
# Number of tokens in the held-out split (the part of `data` after index `n`).
print(len(var_data))

111540


In [9]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
block_size = 256  # maximum context length
batch_size = 64  # number of sequences sampled per batch by get_batch
vocab_size = len(chars)  # number of distinct characters in the corpus
n_embd = 384  # embedding width used by the embedding tables and every Block
n_head = 6  # attention heads per Block (head_size = n_embd // n_head)
n_layer = 6  # number of Blocks stacked in the model
dropout = 0.2  # probability handed to the nn.Dropout layers

In [10]:
def get_batch(split):
    """Sample a random batch of (input, target) token windows.

    Args:
        split: "train" reads from `train_data`; any other value reads from
            `var_data`.

    Returns:
        A pair of `(batch_size, block_size)` tensors moved to `device`. Each
        target row is the matching input row shifted one token to the right.
    """
    source = train_data if split == "train" else var_data
    # Upper bound is exclusive, so start + block_size + 1 never runs past the end.
    starts = torch.randint(0, len(source) - block_size, (batch_size,)).tolist()

    inputs, labels = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(source[start:stop])
        labels.append(source[start + 1:stop + 1])

    return torch.tensor(inputs).to(device), torch.tensor(labels).to(device)
    
# Smoke test: display one sampled training batch (inputs, then targets).
get_batch("train")

(tensor([[ 1, 45, 53,  ..., 47, 58, 46],
         [20, 10,  0,  ..., 30, 31, 10],
         [58,  1, 57,  ..., 39, 61,  1],
         ...,
         [39, 51, 43,  ..., 53, 59,  1],
         [46, 47, 51,  ...,  1, 46, 43],
         [39, 47, 52,  ...,  1, 46, 39]], device='cuda:0'),
 tensor([[45, 53,  8,  ..., 58, 46,  1],
         [10,  0, 13,  ..., 31, 10,  0],
         [ 1, 57, 43,  ..., 61,  1, 58],
         ...,
         [51, 43,  1,  ..., 59,  1, 41],
         [47, 51,  8,  ..., 46, 43, 39],
         [47, 52, 57,  ..., 46, 39, 58]], device='cuda:0'))

In [11]:
class Head(nn.Module):
    """One causal self-attention head.

    The input is projected to `head_size`-wide key, query and value tensors;
    positions may only attend to themselves and to earlier positions.
    """

    def __init__(self, head_size):
        super().__init__()
        # Despite its name this holds 1 / sqrt(head_size), the score scale factor.
        self.head_size_sqrt = head_size**-0.5
        # Bias-free projections from the embedding width down to head_size.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular ones matrix; zeros mark the "future" positions.
        causal = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer("tril", causal)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map `x` of shape (B, T, n_embd) to attention output of shape (B, T, head_size)."""
        _, seq_len, _ = x.shape
        keys, queries, values = self.key(x), self.query(x), self.value(x)
        # NOTE: scores are keys @ queries^T, i.e. the two projections play the
        # opposite roles to what their names suggest. Kept as is so that
        # previously saved weights keep their meaning.
        scores = torch.matmul(keys, queries.transpose(-2, -1)) * self.head_size_sqrt
        future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(future, float("-inf"))
        attention = self.dropout(F.softmax(scores, dim=-1))
        return attention @ values

class MultiHeadAttention(nn.Module):
    """`num_head` attention heads run side by side, concatenated and projected."""

    def __init__(self, num_head, head_size):
        super().__init__()
        width = num_head * head_size  # size of the concatenated head outputs
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_head)])
        self.proj = nn.Linear(width, width)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply every head to `x`, join the results on the last axis, project, then drop out."""
        per_head = [head(x) for head in self.heads]
        joined = torch.cat(per_head, dim=-1)
        return self.dropout(self.proj(joined))

class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4 * fan_out, ReLU, project back to fan_out, dropout."""

    def __init__(self, fan_in, fan_out):
        super().__init__()
        hidden = 4 * fan_out
        layers = [
            nn.Linear(fan_in, hidden),
            nn.ReLU(),
            nn.Linear(hidden, fan_out),
            nn.Dropout(dropout),
        ]
        self.nn = nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through the stacked layers."""
        return self.nn(x)

class Block(nn.Module):
    """Pre-norm Transformer block: self-attention, then feed-forward, each with a residual add.

    Args:
        n_embd: width of the residual stream.
        n_head: number of attention heads; each head gets `n_embd // n_head` features.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa_heads = MultiHeadAttention(n_head, head_size)
        self.fforward = FeedForward(n_embd, n_embd)
        self.layer_nom1 = nn.LayerNorm(n_embd)
        self.layer_nom2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return `x` after both residual sub-layers; the shape is unchanged."""
        # Both sub-layers already end in their own dropout, applied to the
        # branch before it is added back. The residual stream itself must not
        # be dropped: zeroing the skip path in every block corrupts the signal
        # that the residual connection is there to preserve.
        x = x + self.sa_heads(self.layer_nom1(x))
        x = x + self.fforward(self.layer_nom2(x))
        return x

class BigramLanguageModel(nn.Module):
    """Character-level language model.

    Despite its name, the network is a stack of `n_layer` attention `Block`s on
    top of token and position embeddings, followed by a LayerNorm and a linear
    head that produces one logit per vocabulary entry.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(
            *[Block(n_embd, n_head) for _ in range(n_layer)]
        )
        self.layer_norm = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: integer tensor of token ids, shape (B, T) with T <= block_size.
            targets: optional integer tensor of the same shape as `idx`.

        Returns:
            `(logits, loss)`. Without targets, logits have shape
            (B, T, vocab_size) and loss is None. With targets, logits are
            flattened to (B*T, vocab_size) and loss is the scalar cross-entropy.
        """
        _, T = idx.shape
        tok_emb = self.token_embedding_table(idx)  # (B, T, n_embd)
        # Build the positions on the input's device, not on the global
        # `device`, so the model keeps working after being moved.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions)  # (T, n_embd)
        x = self.blocks(tok_emb + pos_emb)
        x = self.layer_norm(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            return logits, None

        # cross_entropy expects the class dimension second, so flatten the
        # batch and time axes together first.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_generate_tokens):
        """Autoregressively sample `max_generate_tokens` new tokens.

        Args:
            idx: integer tensor (B, T) holding the prompt token ids.
            max_generate_tokens: number of tokens to append.

        Returns:
            Integer tensor of shape (B, T + max_generate_tokens).

        Sampling runs without autograd bookkeeping. The module's train/eval
        mode is not changed here, so call `model.eval()` first if dropout
        should be inactive while sampling.
        """
        for _ in range(max_generate_tokens):
            # The position table only covers block_size positions: crop the context.
            idx_use = idx[:, -block_size:]
            logits, _ = self(idx_use)
            logits = logits[:, -1, :]  # only the last time step predicts the next token
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=-1)
        return idx

# Build the model and move its parameters to the selected device.
model = BigramLanguageModel().to(device)

# Smoke test: one forward pass on a training batch, then show the shapes of
# the inputs, the (flattened) logits and the scalar loss.
ix, targets = get_batch("train")
logits, loss = model(ix, targets)
print(*(t.shape for t in (ix, logits, loss)))

torch.Size([64, 256]) torch.Size([16384, 65]) torch.Size([])


In [30]:
# Confirm that the model's parameters live on the expected device.
print(model.token_embedding_table.weight.device)
# The attention heads are nested inside the blocks:
# model.blocks[i] -> sa_heads (MultiHeadAttention) -> heads[j] (Head) -> key.
print(model.blocks[0].sa_heads.heads[0].key.weight.device)

cuda:0
cuda:0


In [12]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the "train" and "val" splits.

    For each split, `eval_iter` random batches are drawn with `get_batch` and
    their losses averaged. The model is put in eval mode for the measurement.

    Returns:
        dict mapping "train" and "val" to a 0-d tensor with the mean loss.
    """
    out = {}
    model.eval()
    try:
        for split in ["train", "val"]:
            loss_s = torch.zeros(eval_iter)
            for k in range(eval_iter):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                loss_s[k] = loss.item()
            out[split] = loss_s.mean()
    finally:
        # Always return to train mode, even if the evaluation is interrupted
        # (e.g. KeyboardInterrupt); otherwise later training steps would
        # silently run with dropout disabled.
        model.train()
    return out

# Number of batches averaged per split inside estimate_loss.
eval_iter = 50
# Evaluate the model in its current state (untrained when the notebook is run
# top to bottom) to get a reference loss.
losses = estimate_loss()
print(losses)

{'train': tensor(4.3095), 'val': tensor(4.3146)}


In [16]:
# Global step counter; the training cell reads and increments it, so re-running
# that cell continues counting instead of starting over.
# NOTE(review): the name shadows Python's built-in iter(). Renaming it requires
# changing the training cell at the same time.
iter = 0
iter

0

In [23]:
# Training

# Loop settings.
batch_size = 64
eval_iter = 100      # batches averaged per split by estimate_loss
eval_intval = 100    # report the losses every this many steps
max_iter = 10001

# A fresh Adam optimiser is created every time this cell runs.
optim = torch.optim.Adam(model.parameters(), lr=0.0003)

for _ in range(max_iter):
    # Periodic evaluation, keyed on the global step counter `iter`.
    if iter % eval_intval == 0:
        losses = estimate_loss()
        print(iter, losses)
    iter += 1

    # One optimisation step on a newly sampled training batch.
    xb, yb = get_batch("train")
    optim.zero_grad()
    logits, loss = model(xb, yb)
    loss.backward()
    optim.step()

2200 {'train': tensor(2.6248), 'val': tensor(2.6178)}
2300 {'train': tensor(2.6189), 'val': tensor(2.6126)}
2400 {'train': tensor(2.6198), 'val': tensor(2.6147)}
2500 {'train': tensor(2.6180), 'val': tensor(2.6129)}
2600 {'train': tensor(2.6184), 'val': tensor(2.6106)}
2700 {'train': tensor(2.6201), 'val': tensor(2.6095)}
2800 {'train': tensor(2.6172), 'val': tensor(2.6113)}
2900 {'train': tensor(2.6174), 'val': tensor(2.6079)}
3000 {'train': tensor(2.6199), 'val': tensor(2.6105)}
3100 {'train': tensor(2.6189), 'val': tensor(2.6111)}
3200 {'train': tensor(2.6145), 'val': tensor(2.6085)}
3300 {'train': tensor(2.6127), 'val': tensor(2.6077)}
3400 {'train': tensor(2.6142), 'val': tensor(2.6074)}
3500 {'train': tensor(2.6163), 'val': tensor(2.6059)}
3600 {'train': tensor(2.6131), 'val': tensor(2.6044)}
3700 {'train': tensor(2.6136), 'val': tensor(2.6072)}
3800 {'train': tensor(2.6179), 'val': tensor(2.6067)}


KeyboardInterrupt: 

In [48]:
增加感知层前
10000 {'train': tensor(2.4731), 'val': tensor(2.4791)}
增加自注意力模块后
10000 {'train': tensor(2.3335), 'val': tensor(2.3796)}
增加多头注意力模块后
10000 {'train': tensor(2.1353), 'val': tensor(2.2121)}
增加feedforward层后
10000 {'train': tensor(2.1136), 'val': tensor(2.1898)}
上下文长度由 8 增加到 128
10000 {'train': tensor(2.0526), 'val': tensor(2.1474)}   还是在胡言乱语
上下文长度改为32,将MultiHeadAttention模块和FeedForward模块整合为Block模块,并重复三次
10000 {'train': tensor(2.0836), 'val': tensor(2.1342)}   从loss变化来看，还可以优化，目前效果一般，输出的词句比之前的更好
20000 {'train': tensor(1.9490), 'val': tensor(2.0583)}   loss仍有变小的空间
在每个block和下一个block之间使用残差，在MultiHead和FeedForward之间也使用残差，并在两层中各加入一个映射层，FeedForward内的映射空间*4
10000 {'train': tensor(1.6888), 'val': tensor(1.8706)}   词不成词，有点像了
增加 layerNorm (在网络很深的情况下解决梯度消失的问题)
10000 {'train': tensor(1.7071), 'val': tensor(1.8853)}

在每一层增加dropout,调整参数如下:
block_size = 256  # 最长上下文长度
batch_size = 64
vocab_size = len(chars)
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2


In [24]:
# File the model's weights are checkpointed to.
PATH='transforme_model.pth'
# Save only the state_dict (the module's parameters and persistent buffers),
# not the module object itself.
torch.save(model.state_dict(), PATH)

In [25]:
# Restore the weights saved above. map_location remaps the stored tensors onto
# the current `device`, so a checkpoint written on a GPU machine also loads
# when only the CPU is available.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
model.load_state_dict(torch.load(PATH, map_location=device))

<All keys matched successfully>

In [65]:
# Reproduce the model's embedding step by hand for a tiny batch of token ids.
x = torch.tensor([[0,1,2]]).to(device)
xtok = model.token_embedding_table(x)
print("xtok", xtok)
# Position embeddings are looked up by position index (0 .. T-1), not by token
# id. Indexing the table with `x` only gave the right rows because the example
# ids happen to be 0, 1, 2.
positions = torch.arange(x.shape[1], device=device).unsqueeze(0)  # (1, T)
xpos = model.position_embedding_table(positions)
print(xpos.shape)
x_emb = xtok + xpos

xtok tensor([[[-0.5852, -2.5080, -0.6448,  0.2537, -0.8769, -0.6065,  1.6361,
          -0.2576,  0.2752,  0.4963,  0.3526,  1.9479,  0.7058,  0.3186,
           1.0628,  1.3333, -0.2504,  2.3920,  1.2918, -0.6879,  0.1844,
           1.1304,  0.8146,  0.1405, -0.0558, -0.0974, -0.2782, -0.2414,
          -0.0753, -0.5407,  0.7578,  1.1906],
         [ 0.1850,  1.0776,  1.3447,  0.7234, -1.0138, -1.0299, -0.9673,
           0.3816,  0.2881, -0.9173, -1.4991, -0.0268, -0.9305, -0.1229,
           0.9787,  1.2514,  1.6764, -1.9643,  0.3712, -1.3294, -0.5953,
          -2.8730, -0.2136,  1.3524,  0.2069,  1.8155, -0.6207,  1.7972,
           0.9435, -1.2690,  0.7967, -0.8893],
         [-0.3401,  0.2677,  1.6034,  1.1019, -1.1222,  0.7689,  0.2933,
          -1.0155,  1.4306, -0.2832,  1.0406, -0.5213, -0.7524, -0.5851,
          -0.1625, -0.1326, -0.2874,  0.8236,  0.3090,  0.6382, -0.1096,
           0.7588,  0.5967, -1.2729, -2.2114,  1.2830, -0.6859, -0.4435,
          -0.6575, -0.534

In [66]:
# Stand-alone projections for a hand-rolled attention head over x_emb.
B, T, C = x_emb.shape
head_size = 2
# All three projections are bias-free, matching each other and the Head class.
key = nn.Linear(C, head_size, bias=False, device=device)
query = nn.Linear(C, head_size, bias=False, device=device)
value = nn.Linear(C, head_size, bias=False, device=device)

In [67]:
# Single-head causal self-attention, one step at a time.
# Shapes: B = batch, T = time steps, C = embedding width, H = head_size.

# Project the embeddings: (B, T, C) -> (B, T, H).
k = key(x_emb)
print(k)
q = query(x_emb)
print(q)
print(k.shape, q.shape)

# Scaled dot-product scores: (B, T, H) @ (B, H, T) -> (B, T, T).
weight = torch.matmul(q, k.transpose(-1, -2)) * head_size**-0.5
print(weight)

# Lower-triangular mask: row t may only look at columns 0..t.
tril = torch.tril(torch.ones(T, T)).to(device)
print(tril)
weight = weight.masked_fill(tril == 0, float('-inf'))
print(weight)

# Normalise each row into attention weights; masked entries become 0.
weight = weight.softmax(dim=2)
print(weight)

# Weighted sum of the values: (B, T, T) @ (B, T, H) -> (B, T, H).
v = value(x_emb)
print(v)
out = torch.matmul(weight, v)
print(out.shape)

tensor([[[-0.3254,  0.1134],
         [-1.1443, -0.6934],
         [ 1.4930, -0.3336]]], device='cuda:0', grad_fn=<UnsafeViewBackward0>)
tensor([[[-0.3669, -0.6379],
         [-1.4149, -0.0669],
         [-0.9649, -0.1618]]], device='cuda:0', grad_fn=<UnsafeViewBackward0>)
torch.Size([1, 3, 2]) torch.Size([1, 3, 2])
tensor([[[ 0.0332,  0.6096, -0.2368],
         [ 0.3202,  1.1777, -1.4779],
         [ 0.2090,  0.8601, -0.9805]]], device='cuda:0',
       grad_fn=<MulBackward0>)
tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]], device='cuda:0')
tensor([[[ 0.0332,    -inf,    -inf],
         [ 0.3202,  1.1777,    -inf],
         [ 0.2090,  0.8601, -0.9805]]], device='cuda:0',
       grad_fn=<MaskedFillBackward0>)
tensor([[[1.0000, 0.0000, 0.0000],
         [0.2979, 0.7021, 0.0000],
         [0.3104, 0.5952, 0.0945]]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)
tensor([[[-0.1688, -0.2627],
         [-1.0833, -0.4400],
         [ 0.3279,  0.2186]]], device='cuda:0'