In [323]:
## Hyperparameters
learning_rate = 3e-4
num_iter = 5000
dropout = 0.2
num_layers = 6
embed_size = 256
num_heads = 4

In [324]:
with open("input.txt") as f:
    text = f.read()

In [325]:
print(text[:400])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it 


In [326]:
print("Total length of this dataset: ", len(text))

Total length of this dataset:  1115394


In [367]:
chars = sorted(list(set(text)))
print("".join(chars))
vocab_size = len(chars)
print("Vocab_size: ", vocab_size)
print(chars)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocab_size:  65
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [328]:
token_to_int = {chr : i for i, chr in enumerate(chars)}
int_to_token = {i : chr for i, chr in enumerate(chars)}

token_to_int['A'], int_to_token[13]

(13, 'A')

In [329]:
encode = lambda s: [token_to_int[c] for c in list(s)]
decode = lambda l: ''.join([int_to_token[i] for i in l])

In [330]:
encode("Once uponce a time")

[27, 52, 41, 43, 1, 59, 54, 53, 52, 41, 43, 1, 39, 1, 58, 47, 51, 43]

In [331]:
decode([27, 52, 41, 43, 1, 59, 54, 53, 52, 41, 43, 1, 39, 1, 58, 47, 51, 43])

'Once uponce a time'

In [332]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
data = torch.tensor(encode(text))
print("data shape: ", data.shape)
print("data dtype: ", data.dtype)

data shape:  torch.Size([1115394])
data dtype:  torch.int64


In [333]:
data[:100]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])

In [334]:
# Split data into train set and validation set
n = int(len(data) * 0.9)
train_data = data[:n]
valid_data = data[n:]

In [335]:
block_size = 16
data[:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43])

In [336]:
x_data = data[:block_size]
y_data = data[1:block_size + 1]
for t in range(block_size):
    x = x_data[0:t + 1]
    y = y_data[t]
    print(f"With input = {x}, the output should be = {y}")

With input = tensor([18]), the output should be = 47
With input = tensor([18, 47]), the output should be = 56
With input = tensor([18, 47, 56]), the output should be = 57
With input = tensor([18, 47, 56, 57]), the output should be = 58
With input = tensor([18, 47, 56, 57, 58]), the output should be = 1
With input = tensor([18, 47, 56, 57, 58,  1]), the output should be = 15
With input = tensor([18, 47, 56, 57, 58,  1, 15]), the output should be = 47
With input = tensor([18, 47, 56, 57, 58,  1, 15, 47]), the output should be = 58
With input = tensor([18, 47, 56, 57, 58,  1, 15, 47, 58]), the output should be = 47
With input = tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47]), the output should be = 64
With input = tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64]), the output should be = 43
With input = tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43]), the output should be = 52
With input = tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52]), the output should be = 10


In [337]:
torch.manual_seed(123)
batch_size = 32 # how many indipendence sequence we will process in parallel
block_size = 256     # the maximum length for prediction

def get_batch(split):
    """
        Generate a small batch of data with input x and target y.
    """
    data = train_data if (split == 'train') else valid_data
    index = torch.randint(
        high = len(data) - block_size - 1, size = (batch_size,)
    )
    x = torch.stack([data[i : i + block_size] for i in index])
    y = torch.stack([data[i + 1 : i + block_size + 1] for i in index])

    x = x.to(device)
    y = y.to(device)
    return x, y

In [338]:
xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([32, 256])
tensor([[ 0, 24, 43,  ..., 57, 59, 50],
        [45, 53, 42,  ..., 59, 54,  1],
        [52, 57, 58,  ..., 43,  1, 59],
        ...,
        [18, 47, 56,  ..., 19, 13, 30],
        [ 1, 57, 43,  ..., 53, 59, 40],
        [ 0, 14, 59,  ..., 25, 21, 24]], device='cuda:0')
targets:
torch.Size([32, 256])
tensor([[24, 43, 57,  ..., 59, 50, 58],
        [53, 42, 57,  ..., 54,  1, 39],
        [57, 58, 56,  ...,  1, 59, 57],
        ...,
        [47, 56, 57,  ..., 13, 30, 17],
        [57, 43, 39,  ..., 59, 40, 58],
        [14, 59, 58,  ..., 21, 24, 24]], device='cuda:0')
----
when input is [0] the target: 24
when input is [0, 24] the target: 43
when input is [0, 24, 43] the target: 57
when input is [0, 24, 43, 57] the target: 58
when input is [0, 24, 43, 57, 58] the target: 1
when input is [0, 24, 43, 57, 58, 1] the target: 58
when input is [0, 24, 43, 57, 58, 1, 58] the target: 46
when input is [0, 24, 43, 57, 58, 1, 58, 46] the target: 53
when input is [0, 24

In [339]:
import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(123)

<torch._C.Generator at 0x25727a5d790>

### The mathematical trick for self attention

In [340]:
# Task: Give a matrix (batch_size, seq_len, num_channels), calculate the matrix b which b[B][T] = mean(x[B][T][..])
B, T, C = 2, 6, 4
x = torch.randint(low = 0, high = 10, size = (B, T, C), dtype = torch.float)

# The inefficient way
b1 = torch.zeros(size = (B, T, C), dtype = torch.float32)
for b in range(B):
    for t in range(T):
        b_part = x[b, :t + 1] # (t, num_channels)
        b1[b][t] = torch.mean(b_part, 0)
print(x)
print(b1)

tensor([[[2., 9., 2., 0.],
         [0., 2., 6., 7.],
         [9., 4., 1., 1.],
         [6., 1., 2., 9.],
         [4., 1., 3., 0.],
         [0., 6., 5., 7.]],

        [[9., 2., 1., 6.],
         [0., 6., 6., 5.],
         [3., 9., 4., 6.],
         [1., 8., 0., 5.],
         [4., 1., 6., 7.],
         [3., 1., 2., 2.]]])
tensor([[[2.0000, 9.0000, 2.0000, 0.0000],
         [1.0000, 5.5000, 4.0000, 3.5000],
         [3.6667, 5.0000, 3.0000, 2.6667],
         [4.2500, 4.0000, 2.7500, 4.2500],
         [4.2000, 3.4000, 2.8000, 3.4000],
         [3.5000, 3.8333, 3.1667, 4.0000]],

        [[9.0000, 2.0000, 1.0000, 6.0000],
         [4.5000, 4.0000, 3.5000, 5.5000],
         [4.0000, 5.6667, 3.6667, 5.6667],
         [3.2500, 6.2500, 2.7500, 5.5000],
         [3.4000, 5.2000, 3.4000, 5.8000],
         [3.3333, 4.5000, 3.1667, 5.1667]]])


In [341]:
# The efficient way (version 2)
tril = torch.tril(torch.ones(T, T))
tril = tril / tril.sum(dim = -1, keepdim=True)
b2 = tril @ x # (T, T) convert to (B, T, T) @ (B, T, C) -> (B, T, C)
print(x)
print(b2)

tensor([[[2., 9., 2., 0.],
         [0., 2., 6., 7.],
         [9., 4., 1., 1.],
         [6., 1., 2., 9.],
         [4., 1., 3., 0.],
         [0., 6., 5., 7.]],

        [[9., 2., 1., 6.],
         [0., 6., 6., 5.],
         [3., 9., 4., 6.],
         [1., 8., 0., 5.],
         [4., 1., 6., 7.],
         [3., 1., 2., 2.]]])
tensor([[[2.0000, 9.0000, 2.0000, 0.0000],
         [1.0000, 5.5000, 4.0000, 3.5000],
         [3.6667, 5.0000, 3.0000, 2.6667],
         [4.2500, 4.0000, 2.7500, 4.2500],
         [4.2000, 3.4000, 2.8000, 3.4000],
         [3.5000, 3.8333, 3.1667, 4.0000]],

        [[9.0000, 2.0000, 1.0000, 6.0000],
         [4.5000, 4.0000, 3.5000, 5.5000],
         [4.0000, 5.6667, 3.6667, 5.6667],
         [3.2500, 6.2500, 2.7500, 5.5000],
         [3.4000, 5.2000, 3.4000, 5.8000],
         [3.3333, 4.5000, 3.1667, 5.1667]]])


In [342]:
torch.allclose(b1, b2)

True

In [343]:
# Softmax version
tril = torch.tril(torch.ones(T, T))
weight = torch.zeros((T, T))
weight = weight.masked_fill(tril == 0, float('-inf')) # Can be considered that in timestep i, we do not consider word indexed from i + 1, i + 2, ...
print("weight after masked fill: \n", weight)
weight = F.softmax(weight, dim = 1)
print("weight after softmax: \n", weight)
b3 = weight @ x
print("b3 = ", b3)
print(torch.allclose(b1, b3))

weight after masked fill: 
 tensor([[0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0.]])
weight after softmax: 
 tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667]])
b3 =  tensor([[[2.0000, 9.0000, 2.0000, 0.0000],
         [1.0000, 5.5000, 4.0000, 3.5000],
         [3.6667, 5.0000, 3.0000, 2.6667],
         [4.2500, 4.0000, 2.7500, 4.2500],
         [4.2000, 3.4000, 2.8000, 3.4000],
         [3.5000, 3.8333, 3.1667, 4.0000]],

        [[9.0000, 2.0000, 1.0000, 6.0000],
         [4.5000, 4.0000, 3.5000, 5.5000],
         [4.

In [344]:
# self-attention
B, T, C = 4, 8, 32
x = torch.rand(size = (B, T, C))

head_size = 16
key = nn.Linear(C, head_size, bias = False)
query = nn.Linear(C, head_size, bias = False)
value = nn.Linear(C, head_size, bias= False)

k = query(x) # B, T, 16
q = query(x) # B, T, 16
v = value(x) # B, T, 16

weight = q @ k.transpose(-2, -1) # B, T, T
tril = torch.tril(torch.ones(size = (T, T)))
weight = weight.masked_fill(tril == 0, float('-inf'))
weight = F.softmax(weight, dim = -1)

# Các hàng của weight cho biết: Tại hàng thứ i, từ thứ i sẽ có trọng số của các vector value của các từ từ 1 đến T. Các trọng số này sẽ nhân lần lượt với vector value tương ứng
# để ra được biểu diễn tốt hơn của từ thứ i.
# Lưu ý rằng các vector value được sắp xếp theo hàng ngang thay vì hàng dọc!

out = weight @ v # B, T, 16

In [345]:
out.shape

torch.Size([4, 8, 16])

In [346]:
weight.shape # B, T, T

torch.Size([4, 8, 8])

In [347]:
weight

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4783, 0.5217, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2652, 0.2519, 0.4829, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2503, 0.2073, 0.2479, 0.2944, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2191, 0.1694, 0.2362, 0.1743, 0.2010, 0.0000, 0.0000, 0.0000],
         [0.1809, 0.1642, 0.2008, 0.1410, 0.1693, 0.1438, 0.0000, 0.0000],
         [0.1475, 0.1447, 0.1905, 0.1550, 0.1140, 0.0776, 0.1707, 0.0000],
         [0.1350, 0.1201, 0.1523, 0.1241, 0.1273, 0.0840, 0.1149, 0.1423]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2560, 0.7440, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1948, 0.3484, 0.4568, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1397, 0.1721, 0.2185, 0.4698, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1330, 0.2334, 0.1721, 0.2321, 0.2294, 0.0000, 0.0000, 0.0000],
         [0.1174, 0.240

Sự khác biệt giữa self-attention và cross-attention: self-attention lấy key, query, value từ cùng một nguồn. cross-attention thì có thể lấy từ nhiều nguồn:
vd: query lấy từ dữ liệu ảnh, key lấy từ text, value lấy từ âm thanh.

tại sao lại chia cho sqrt(head_size)?

In [348]:
k = torch.rand(B, T, head_size)
q = torch.rand(B, T, head_size)
wei = q @ k.transpose(-2, -1) * head_size**-0.5

# do có hai ma trận nhân với nhau, có thể làm cho xuất hiện những số rất lớn, và những số rất bé -> softmax làm cho ma trận sau khi nhân có variance lớn
# chuẩn hóa.
k.var(), q.var(), wei.var()

(tensor(0.0865), tensor(0.0842), tensor(0.0527))

In [349]:
dropout = 0.2
n_layers = 6

In [350]:
class Head(nn.Module):
    def __init__(self, sequence_size, embed_size, head_size, dropout = dropout):
        super().__init__()
        self.sequence_size = sequence_size
        self.head_size = head_size
        self.embed_size = embed_size
        self.query = nn.Linear(embed_size, head_size, bias = False)
        self.key = nn.Linear(embed_size, head_size, bias = False)
        self.value = nn.Linear(embed_size, head_size, bias = False)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('tril', torch.tril(torch.ones(sequence_size, sequence_size)))

    def forward(self, embed):
        B, T, C = embed.shape
        # print("C: ", C)
        # print("embed_size: ", self.embed_size)
        # assert T == self.sequence_size and C == self.embed_size
        q, k, v = self.query(embed), self.key(embed), self.value(embed)
        wei = q @ k.transpose(-2, -1) / (self.head_size ** 0.5)
        wei = wei.masked_fill(self.tril == 0, float('-inf'))
        wei = F.softmax(wei, dim = -1)
        wei = self.dropout(wei)
        out = wei @ v
        return out


In [351]:
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, sequence_size, embed_size, head_size, dropout = dropout):
        super().__init__()
        self.heads = nn.ModuleList([Head(sequence_size, embed_size, head_size) for _ in range(num_heads)])
        self.projection = nn.Linear(head_size * num_heads, embed_size)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim = -1)
        out = self.dropout(out)
        return self.projection(out)
        
        

In [352]:
class FeedForward(nn.Module):
    def __init__(self, embed_size, dropout = dropout):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embed_size, 4 * embed_size),
            nn.ReLU(),
            nn.Linear(4 * embed_size, embed_size), # projection layer
            nn.Dropout(dropout)
        )
    def forward(self, x):
        return self.net(x)

In [353]:
class Block(nn.Module):
    def __init__(self, embed_size, sequence_size, num_heads):
        super().__init__()
        head_size = embed_size // num_heads
        self.sa = MultiHeadAttention(num_heads,sequence_size, embed_size, head_size)
        self.ffwd = FeedForward(embed_size)
        self.ln1 = nn.LayerNorm(embed_size)
        self.ln2 = nn.LayerNorm(embed_size)
    
    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x
        

In [354]:
E = 32
test_head = Head(8, E, 16).to(device)
test_head(
    torch.rand(4, 8, E, device = device)
).shape

torch.Size([4, 8, 16])

In [355]:
# test_multihead = MultiHeadAttention(
#     num_heads=4,
#     sequence_size=block_size,
#     embed_size=100,
#     head_size=120
# )

# test_multihead(
#     torch.rand(size = (2, 8, 100))
# ).shape

In [356]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# torch.manual_seed(123)
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size, sequence_size, embed_size, num_heads = 4, head_size = None, n_layers = n_layers):
        super().__init__()
        self.vocab_size = vocab_size
        self.sequence_size = sequence_size
        self.embed_size = embed_size
        self.head_size = head_size
        if head_size is None: 
            head_size = embed_size // num_heads 

        # Each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
        self.positional_embedding_table = nn.Embedding(sequence_size, embed_size)
        
        # self.sa_head = MultiHeadAttention(num_heads, sequence_size, embed_size, head_size)
        # self.ffwd = FeedForward(num_heads * head_size)

        # self.block = nn.Sequential(
        #     Block(embed_size, sequence_size, num_heads=4),
        #     Block(embed_size, sequence_size, num_heads=4),
        #     Block(embed_size, sequence_size, num_heads=4), 
        #     Block(embed_size, sequence_size, num_heads=4),
        #     Block(embed_size, sequence_size, num_heads=4), 
        #     Block(embed_size, sequence_size, num_heads=4),
        #     nn.LayerNorm(embed_size)
        # )

        self.blocks = nn.Sequential(
            *[Block(embed_size, sequence_size, num_heads) for _ in range(n_layers)] 
        )
        self.final_ln = nn.LayerNorm(embed_size)

        self.mlp = nn.Linear(num_heads * head_size, vocab_size)


    
    def forward(self, idx, targets = None):
        word_embed = self.token_embedding_table(idx)
        # pos_embed = self.positional_embedding_table(torch.arange(idx.shape[1], device = device))
        pos_embed = self.positional_embedding_table(torch.arange(self.sequence_size, device = device))

        embed = self.blocks(word_embed + pos_embed)
        embed = self.final_ln(embed)
        logits = self.mlp(embed)

        if targets is None:
            loss = None
        else:
            batch_size, seq_len, vocab_size = logits.shape
            logits = logits.reshape(batch_size * seq_len, vocab_size)
            targets = targets.reshape(batch_size * seq_len)
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # shape of idx: (batch_size, seq_len)
        
        if (idx.shape[1] < self.sequence_size):
            pad_size = (idx.shape[0], self.sequence_size - idx.shape[1])
            padding = torch.zeros(size = pad_size, device = device, dtype = idx.dtype)
            idx = torch.cat((padding, idx), dim = 1)    
        
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -self.sequence_size:]
            logits, loss = self.forward(idx_cond)
            # focus only on the last timestep
            logits = logits[:, -1, :] # batch_size, vocab_size
            probs = F.softmax(logits, dim = -1) # batch_size, vocab_size
            idx_next = torch.multinomial(probs, num_samples= 1)
            idx = torch.cat([idx, idx_next], dim = 1) # batch_size, seq_len + 1
        return idx



In [357]:
m = BigramLanguageModel(vocab_size, block_size, embed_size, num_heads).to(device)

logits, loss = m(
    torch.randint(low = 0, high = 10, size = (4, block_size), device = device)
)
print(logits.shape)
print(loss)

torch.Size([4, 256, 65])
None


In [358]:
m

BigramLanguageModel(
  (token_embedding_table): Embedding(65, 256)
  (positional_embedding_table): Embedding(256, 256)
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-3): 4 x Head(
            (query): Linear(in_features=256, out_features=64, bias=False)
            (key): Linear(in_features=256, out_features=64, bias=False)
            (value): Linear(in_features=256, out_features=64, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (projection): Linear(in_features=256, out_features=256, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (ffwd): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=256, out_features=1024, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1024, out_features=256, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (ln1): LayerNorm((256,), eps=1e-05, element

In [359]:
# optimizer = torch.optim.AdamW(params = m.parameters(), lr = learning_rate)

# for step in range(num_iter):
#     xb, yb = get_batch('train')
#     logits, loss = m(xb, yb)
#     optimizer.zero_grad()
#     loss.backward()
#     optimizer.step()
 
#     xb, yb = get_batch('val')
#     with torch.no_grad():
#         _, valid_loss = m(xb, yb)


#     if (step % 200 == 0):
#         print(f"Step: {step} | training_loss: {loss.item()} | valid_loss: {valid_loss.item()}")

In [360]:
# print(decode(m.generate(idx = torch.ones(size = (1, block_size), dtype = torch.long, device=device), max_new_tokens=10000)[0].cpu().numpy()))

In [361]:
m.load_state_dict(torch.load('check_point.pkl', weights_only=True))

<All keys matched successfully>

In [362]:

# torch.save(m.state_dict(), 'check_point.pkl')

In [None]:
print(decode(m.generate(idx = torch.ones(size = (1, block_size), dtype = torch.long, device=device), max_new_tokens=1000)[0].cpu().numpy()))
































































































































































































































































 BOLAUCIOLUS:
YOSCOIFINIUS:
PEENVEith:
Does give no well, Do it? sisterrife:
Peace:
Tybalt, in Aufidia this execution
To him, himself. Cashurp, for the feace of him!

ROMEO:
What of my lord?

LUMEO:
The strangen. Whency!

CAMILLONT:
Yes, I well; but sence the solet shall be:
Will what do, nor entrain, you well,
Where ceyple, so say much comfortains:
Nove them ha beatten admispointed persuadence
When I in such pity is themends,
No beaws, as I said, that for two you'lls
So forme be withing of you, as these seas I was
So: I am turn pepety: I 'laim, to-deen: may love, his
yours, more now has basen?
I am soldier them what show the crown tronough to
mercy cannots for a church'd slepwife?

ESCALUMNI:
He's singless not 'long that place it is c