In [48]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-04-07 01:26:39--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2023-04-07 01:26:39 (150 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



In [49]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [50]:
print("Length of dataset in characters: ", len(text))

Length of dataset in characters:  1115394


In [51]:
print(text[:50])

First Citizen:
Before we proceed any further, hear


In [52]:
# create the character vocabulary
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [53]:
# tokenize the input text; characters into a reference integer; contrast with a word segment encode-decoder (tiktoken, sentenpiece)
#encoder and decoder for tokens
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s] # encodes an input string into a list of integers
decode = lambda l: ''.join(itos[i] for i in l) # decodes an input list into a string

print(encode("Hello"))
print(decode(encode("Hello")))

[20, 43, 50, 50, 53]
Hello


In [54]:
# wrap the encoded text in a tensor
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [55]:
# train validation split; 90/10
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

## Training model:

In [56]:
# illustration of the training blocks
block_size = 8

x = train_data[:block_size]
y = train_data[1: block_size + 1]

for t in range(block_size):
    context = x[:t + 1] # the target value is included in the context, makes the 
    # transformer more robust by seeing varied size contexts
    target = y[t]
    print(f"when input is {context} the target is {target}")

when input is tensor([18]) the target is 47
when input is tensor([18, 47]) the target is 56
when input is tensor([18, 47, 56]) the target is 57
when input is tensor([18, 47, 56, 57]) the target is 58
when input is tensor([18, 47, 56, 57, 58]) the target is 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is 58


In [57]:
# create the blocks for encoding
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will be processed in parallel?
block_size = 8 # maximum context length for predictions

def get_batch(split):
    #generalize a small batch of data into inputs x and y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,)) # random index 
    # numbers selected from 0 to (len(data) - block_size); subtracint block_size
    # ensures the ability to read in an entire block - starting indices
    
    x = torch.stack([data[i: i + block_size] for i in ix])
    y = torch.stack([data[i+1: i + block_size + 1] for i in ix])
    return x,y

In [58]:
xb, yb = get_batch('train')
print("Inputs: ")
print(xb.shape)
print(xb)
print("Targets: ")
print(yb.shape)
print(yb)

Inputs: 
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
Targets: 
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [59]:
# each batch has 32 independent targets with the following input x

for b in range(batch_size): # batch dimension
    for t in range(block_size):  # time dimension
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f"when input is {context.tolist()} the target is: {target}")

when input is [24] the target is: 43
when input is [24, 43] the target is: 58
when input is [24, 43, 58] the target is: 5
when input is [24, 43, 58, 5] the target is: 57
when input is [24, 43, 58, 5, 57] the target is: 1
when input is [24, 43, 58, 5, 57, 1] the target is: 46
when input is [24, 43, 58, 5, 57, 1, 46] the target is: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target is: 39
when input is [44] the target is: 53
when input is [44, 53] the target is: 56
when input is [44, 53, 56] the target is: 1
when input is [44, 53, 56, 1] the target is: 58
when input is [44, 53, 56, 1, 58] the target is: 46
when input is [44, 53, 56, 1, 58, 46] the target is: 39
when input is [44, 53, 56, 1, 58, 46, 39] the target is: 58
when input is [44, 53, 56, 1, 58, 46, 39, 58] the target is: 1
when input is [52] the target is: 58
when input is [52, 58] the target is: 1
when input is [52, 58, 1] the target is: 58
when input is [52, 58, 1, 58] the target is: 46
when input is [52, 58, 1, 58, 46

In [60]:
# bigram language model
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

<torch._C.Generator at 0x7fb19a357b10>

In [61]:
class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()

        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        # idx and targets are both (B, T) tensors of integers
        logits = self.token_embedding_table(idx) # (B,T,C); batch, time, channel tensor
        # time is the block size (8) and channel is the vocab size (65)

        if targets is None:
            loss = None
        else:
            # function expects (B, T)
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        # logits = score for the next character in the sequence 
        return logits, loss



    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context

        for _ in range(max_new_tokens):

            # get predictions
            logits, loss = self(idx)

            # focus only on the last time step
            logits = logits[:, -1,:] # becomes (B, C)

            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=1) # (B, C)

            # sample from the distribution 
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)

            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [62]:
m = BigramLanguageModel(vocab_size)

In [63]:
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)


In [64]:
print(decode(m.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [65]:
# PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [66]:
batch_size = 32
for steps in range(10000):
    # sample a batch of the data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

2.5727508068084717


In [67]:
print(decode(m.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=300)[0].tolist()))


Iyoteng h hasbe pave pirance
Rie hicomyonthar's
Plinseard ith henoure wounonthioneir thondy, y heltieiengerofo'dsssit ey
KIN d pe wither vouprrouthercc.
hathe; d!
My hind tt hinig t ouchos tes; st yo hind wotte grotonear 'so it t jod weancotha:
h hay.JUCle n prids, r loncave w hollular s O:
HIs; ht 


# Mathematical trick for self-attention

In [68]:
# example

torch.manual_seed(1337)
B,T,C = 4, 8, 2 # batch, time, channels
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

## ( 1 )

In [69]:
# set up so that T tokens are communicating; by previous context to the current time step
# average channels that preceed the given token -- lossy, no info on distribution

# want x[b,t] = mean_{i<=t} x[b,i]
xbow = torch.zeros((B,T,C))  # bag of words
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] # (t,C)
        xbow[b,t] = torch.mean(xprev, 0)

In [70]:
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [71]:
xbow[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [72]:
# more efficient matrix operation

torch.manual_seed(42)
a = torch.tril(torch.ones(3,3))
a = a / torch.sum(a, 1, keepdim=True) # normalizes the rows --> averaging effect 
    # on multiplication for preceeding rows
b = torch.randint(0, 10,(3,2)).float()
c = a @ b
print('a = ')
print(a)
print('--------')
print('b = ')
print(b)
print('--------')
print('c = ')
print(c)

a = 
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--------
b = 
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--------
c = 
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [73]:
# triangle shaped after T, used to produce running averages of the preceeding rows (as per cell 1)

wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [74]:
xbow2 = wei @ x # (B, T, T) @ (B, T, C) --> batch elements managed per batch --> (T, T) @ (T, C) --> (B, T, C)
# identical to xbow
xbow2[0], xbow[0]

(tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))

In [75]:
# third version
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))  # creates -inf wherever tril is 0
wei = F.softmax(wei, dim=-1)
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [76]:
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

True

In [77]:
n_embed = 32

In [78]:
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()

        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shaped

        # idx and targets are both (B, T) tensors of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C); batch, time, channel tensor
        # time is the block size (8) and channel is the vocab size (65)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T, C)
        x = tok_emb + pos_emb # (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # function expects (B, T)
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        # logits = score for the next character in the sequence 
        return logits, loss



    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context

        for _ in range(max_new_tokens):

            # get predictions
            logits, loss = self(idx)

            # focus only on the last time step
            logits = logits[:, -1,:] # becomes (B, C)

            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=1) # (B, C)

            # sample from the distribution 
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)

            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

## Self-Attention

In [79]:
# version 4: Self-attention

torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

In [80]:
# this is self-attention because the keys, queries and values are all derived from the same x, same nodes 
# communicate

# each node will emit a query and key; this develops what the node has an affinity for based on the training data
# a vowel looking for a consonant in it's past
# query - what am I looking for?
# key - what do I contain?
# wei is the dot product of queries and keys --> high shows affinity

# a single head
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x) # (B, T, 16)
q = query(x) # (B, T, 16)
wei = q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) --> (B, T, T); T, T matrix are the affinities


tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf')) # this is the decoder block that ensures
    # that nodes from the future do not send information to nodes in the past
    # deleting this block will enable the encoder block
wei = F.softmax(wei, dim=-1)

v = value(x) # v is the feature that communicates with the other nodes about the 
             # internal information
out = wei @ v

#out = wei @ x

out.shape

torch.Size([4, 8, 16])

In [81]:
# each batch element has a unique weight
wei[0]  # strength of number in the columns indicates which value the vector has an affinity for

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

In [82]:
# x is private information for the token; what is shared in the network are the values v and weights based
# on the affinity of values within the vectors 

# specific to self-attention
# hyperparameters
# batch_size = 16 # how many independent sequences will we process in parallel?
# block_size = 32 # what is the maximum context length for predictions?
# max_iters = 5000
# eval_interval = 100
# learning_rate = 1e-3
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# eval_iters = 200
# n_embed = 64
# n_head = 4
# n_layer = 4
# dropout = 0.0

In [83]:
# hyperparameters
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embed = 384
n_head = 6
n_layer = 6
dropout = 0.2

In [84]:
class Head(nn.Module):
    """ one head of self-attention"""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias = False)
        self.query = nn.Linear(n_embed, head_size, bias = False)
        self.value = nn.Linear(n_embed, head_size, bias = False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x) # (B,T,C)
        q = self.query(x) # (B, T, C)

        # compute attendion scores (affinities between nodes)
        wei = q @ k.transpose(-2, -1) * C**-0.5 # (B,T,C) @ (B,C,T) --> (B,T,T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B,T,T)
        wei = F.softmax(wei, dim=-1) # (B,T,T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B,T,T) @ (B,T,C) --> (B,T,C)
        return out



In [85]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel"""

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList((Head(head_size) for _ in range(num_heads)))
        self.proj = nn.Linear(n_embed, n_embed) # reincorporating the skip/residual layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [86]:
class FeedForward(nn.Module):
    """linear layer followed by non-linearity"""

    def __init__(self, n_embed):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed), # residual layer projected back into original path
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

In [87]:
class Block(nn.Module):
    """Transformer block: communication followed by computation"""

    def __init__(self, n_embed, n_head):
        # n_embed: embedding dimension, n_head: the number of heads
        super().__init__()
        head_size = n_embed // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embed)
        self.ln1 = nn.LayerNorm(n_embed) # layer norm with dimensions 
        self.ln2 = nn.LayerNorm(n_embed)

# residual/skip connections are the additional addition with x
    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [88]:
# similar to pytorch batchnorm1d; this version is a layer norm

class LayerNorm:
    
    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        #parameters trained with backpropagation
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

    def __call__(self, x):
        # calculate the forward pass
        xmean = x.mean(1, keepdims=True) # batch mean; changed dim to 1 for columns
        xvar = x.var(1, keepdims=True) # batch variance; changed dim to 1 for columns
        xhat = (x - xmean)/torch.sqrt(xvar + self.eps) # normalize unit variance
        self.out = self.gamma * xhat + self.beta
        return self.out
    
    def parameters(self):
        return [self.gamma, self.beta]

In [89]:
# reference: https://arxiv.org/pdf/1512.03385.pdf for skip/residual connections
# greatest contribution at optimization

# dropout references: https://dl.acm.org/doi/pdf/10.5555/2627435.2670313 

In [90]:
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()

        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
#        self.sa_head = Head(n_embed)
        # self.sa_heads = MultiHeadAttention(4, n_embed//4) # 4 heads of 8 dimensional self-attention
        # self.ffwd = FeedForward(n_embed)
        # self.blocks = nn.Sequential(
        #     Block(n_embed, n_head = 4),
        #     Block(n_embed, n_head = 4),
        #     Block(n_embed, n_head = 4),
        #     nn.LayerNorm(n_embed),
        # )
        self.blocks = nn.Sequential(*[Block(n_embed, n_head = n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embed) # final layer norm
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B, T) tensors of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C); batch, time, channel tensor
        # time is the block size (8) and channel is the vocab size (65)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T, C)
        x = tok_emb + pos_emb # (B, T, C)
#        x = self.sa_heads(x)
        x = self.blocks(x)
#        x = self.ffwd(x) # (B,T,C)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # function expects (B, T)
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        # logits = score for the next character in the sequence 
        return logits, loss


# adjust the index to account for the block size
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context

        for _ in range(max_new_tokens):
            # crop
            idx_cond = idx[:, -block_size:]

            # get predictions
            logits, loss = self(idx_cond)

            # focus only on the last time step
            logits = logits[:, -1,:] # becomes (B, C)

            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=1) # (B, C)

            # sample from the distribution 
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)

            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [91]:
# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y


In [92]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [93]:
# train model


model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))



10.788929 M parameters
step 0: train loss 4.5469, val loss 4.5451
step 500: train loss 2.1201, val loss 2.1695
step 1000: train loss 1.6764, val loss 1.8398
step 1500: train loss 1.5021, val loss 1.6969
step 2000: train loss 1.3953, val loss 1.6065
step 2500: train loss 1.3252, val loss 1.5577
step 3000: train loss 1.2688, val loss 1.5290
step 3500: train loss 1.2246, val loss 1.5032
step 4000: train loss 1.1889, val loss 1.4920
step 4500: train loss 1.1487, val loss 1.4791
step 4999: train loss 1.1188, val loss 1.4769

Your wonds have: I will seek then will preten;
And Catals, like our ancient's sun,
Or boaring with took in the hour,
Then his doth to mean open end mour slavior with:
Go, all her, and I'll sleep: I heard of them,
And what would have, do't but dry clost? O neederflaude
Is great the tell? He, cousin father, was done!
My step and sour, or rought and more forget
That any fellows looks reises, and his mispring, well'd
And leave is with this common I'ld redeceive him indeed.


In [96]:
print(decode(m.generate(context, max_new_tokens=10000)[0].tolist()))


Lord Angelo his fresh king parth.

DUCHESS:
You will kill him, and whom it did;
I shall see his worle harm with you our harm;
Yet not goe into his good for ears to me.
Sweet-swear joints at I know that hear me,
Come ye withal be put to added
His conclaims against your ball,
Thedoching was a great's considerer,
Since would conserved your wranches and feather.
At shall this slews from which Caesar's
blanchesom's friendly, have with him, and the
reofling blastage:
He company set, is a twicked by drawn'd my grants;
set lady deliver for choicioys,
and thele from out heavy away in head!

HORTMIONE:
What and help!
Call his younds?
Hath he liked thee more lieved to be proceders!
This integratest honour metal's view?
That often the womb of my marriage, gives man.
Do not palward Henry half, fear and cot us;
Since, or go in hastistraction, that and Jord's fruita's death,
Because his swallor be bridale, fhilstly sire;
I saw' both kingdom ere long mysire
In plottle deep my spires.

ROMEO:
Justice,