## Import modules

In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1" 

In [2]:
import torch # lets use pytorch
import tiktoken 
enc = tiktoken.get_encoding("gpt2")
import torch.nn as nn
from torch.nn import functional as F

## Hyperparameters

In [3]:
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200

#384 // 6 = 64 dimensional as standard for each head
n_embd = 384
n_head = 6

n_layer = 6
dropout = 0.2

## Import Data

In [27]:
with open('Data/tiny_shakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [28]:
print("Length of the dataset is ", len(text))

Length of the dataset is  1115394


#### First 1,000 chars

In [6]:
print(text[:1000])


First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [7]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
vocab = ''.join(chars) 

print(f'This dataset includes the following characters:{vocab}\n')
print(f'This dataset has {vocab_size} unique characters')

This dataset includes the following characters:
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz

This dataset has 65 unique characters


## Tokenizer

In [8]:
# Create a mapping from chars to integers
stoi = {char:i for i,char in enumerate(chars)} #based on index in our sorted array of unique chars, assign number to each character in dictionary (for encoding)
itos =  {i:char for i,char in enumerate(chars)} #do the same thing but have index as key and char as value (for decoding)

encode = lambda s: enc.encode(s) # given string s, return an array of ints that pertain to each character
decode = lambda l: enc.decode(l) # given array of integers, decode into chars using itos and turn into string

print(encode("My name is Ethan"))
print(decode(encode("My name is Ethan")))




[3666, 1438, 318, 28926]
My name is Ethan


In [9]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([338025]) torch.int64
tensor([ 5962, 22307,    25,   198,  8421,   356,  5120,   597,  2252,    11,
         3285,   502,  2740,    13,   198,   198,  3237,    25,   198,  5248,
          461,    11,  2740,    13,   198,   198,  5962, 22307,    25,   198,
         1639,   389,   477, 12939,  2138,   284,  4656,   621,   284,  1145,
          680,    30,   198,   198,  3237,    25,   198,  4965,  5634,    13,
        12939,    13,   198,   198,  5962, 22307,    25,   198,  5962,    11,
          345,   760,   327,  1872,   385,  1526, 28599,   318,  4039,  4472,
          284,   262,   661,    13,   198,   198,  3237,    25,   198,  1135,
          760,   470,    11,   356,   760,   470,    13,   198,   198,  5962,
        22307,    25,   198,  5756,   514,  1494,   683,    11,   290,   356,
         1183,   423, 11676,   379,   674,   898,  2756,    13,   198,  3792,
          470,   257, 15593,    30,   198,   198,  3237,    25,   198,  2949,
          517,  3375,   319,   

In [10]:
num_subwords = torch.unique(data, sorted=True)
vocab_size =  50258 # or try 50257
vocab_size

50258

## Separate into train and test validation split

In [11]:
n = int(0.9 * len(data)) # 90% train x 10% test split
train_data = data[:n]
val_data = data[n:]

## Set block size for chunking
train in chunks of n tokens at a time

In [12]:
#down the road we want the transformer to train at each subsequent token
#e.g. 18 -> 47, 18 + 47 -> 56, 18 + 47 + 56 -> 58, etc. (see next kernel)
train_data[:block_size+1]

tensor([ 5962, 22307,    25,   198,  8421,   356,  5120,   597,  2252,    11,
         3285,   502,  2740,    13,   198,   198,  3237,    25,   198,  5248,
          461,    11,  2740,    13,   198,   198,  5962, 22307,    25,   198,
         1639,   389,   477, 12939,  2138,   284,  4656,   621,   284,  1145,
          680,    30,   198,   198,  3237,    25,   198,  4965,  5634,    13,
        12939,    13,   198,   198,  5962, 22307,    25,   198,  5962,    11,
          345,   760,   327,  1872,   385,  1526, 28599,   318,  4039,  4472,
          284,   262,   661,    13,   198,   198,  3237,    25,   198,  1135,
          760,   470,    11,   356,   760,   470,    13,   198,   198,  5962,
        22307,    25,   198,  5756,   514,  1494,   683,    11,   290,   356,
         1183,   423, 11676,   379,   674,   898,  2756,    13,   198,  3792,
          470,   257, 15593,    30,   198,   198,  3237,    25,   198,  2949,
          517,  3375,   319,   470,    26,  1309,   340,   307, 

In [13]:
#training like this helps with computational efficiency but also to help expose the transformer
#to more context from 1 - blocksize. Needs to get used to seeing everything in between 1-block size
#Should also be noted that transformer will NEVER predict based on series of tokens > blocksize (only from 1-bs)

x = train_data[:block_size]
y = train_data[1:block_size + 1]
for t in range(10):
    context = x[:t+1]
    target = y[t]
    print(f'when input is {context}, the target is: {target}')

when input is tensor([5962]), the target is: 22307
when input is tensor([ 5962, 22307]), the target is: 25
when input is tensor([ 5962, 22307,    25]), the target is: 198
when input is tensor([ 5962, 22307,    25,   198]), the target is: 8421
when input is tensor([ 5962, 22307,    25,   198,  8421]), the target is: 356
when input is tensor([ 5962, 22307,    25,   198,  8421,   356]), the target is: 5120
when input is tensor([ 5962, 22307,    25,   198,  8421,   356,  5120]), the target is: 597
when input is tensor([ 5962, 22307,    25,   198,  8421,   356,  5120,   597]), the target is: 2252
when input is tensor([ 5962, 22307,    25,   198,  8421,   356,  5120,   597,  2252]), the target is: 11
when input is tensor([ 5962, 22307,    25,   198,  8421,   356,  5120,   597,  2252,    11]), the target is: 3285


## Batch Dimensions
Going to have chunks of text encodings stacked up in a single tensor for efficiency

In [14]:
block_size = block_size

def get_batch(split: str):
    #generate small batch of data of inputs x with targets to predict y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,)) # generate batch_size-sized random indices to get data from
    
    x = torch.stack([data[i:i+block_size] for i in ix]) # for each randomly selected index, get the associated context (up to block size) in the data
    y = torch.stack([data[i+1:i+block_size+1] for i in ix]) # get offset by so x can try to predict it
    x, y = x.to(device), y.to(device)
    return x, y



@torch.no_grad() #don't update on backwards - more memory efficient
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean() #effectively average across many iterations of our training to smooth out loss
    model.train()
    return out


In [15]:
test_batch = get_batch('train')

In [16]:
torch.max(test_batch[0])

tensor(49766, device='cuda:0')

In [17]:
x_batch, y_batch = get_batch('train')
print('inputs:', x_batch.shape)
print(x_batch)

print('targets:', y_batch.shape)
print(y_batch, "\n\n----------\n")

# for batch in range(batch_size):
#     for time in range(block_size):
#         context = x_batch[batch,:time+1]
#         target = y_batch[batch,time]
#         print(f'When input is {context}, target is {target}.')

inputs: torch.Size([16, 256])
tensor([[  994,   284,  7284,  ...,  1961, 39743,    25],
        [  198,    39, 11262,  ...,   757,    25,   198],
        [ 3470,  7319, 35481,  ...,   198,   198, 11473],
        ...,
        [  198, 13828,   257,  ...,  6487, 22122,    13],
        [  465,  3656,  1194,  ...,    13,   198,   198],
        [  899,   455,   284,  ...,   550,   355,   300]], device='cuda:0')
targets: torch.Size([16, 256])
tensor([[  284,  7284, 14566,  ..., 39743,    25,   198],
        [   39, 11262, 20754,  ...,    25,   198,    40],
        [ 7319, 35481,    13,  ...,   198, 11473,  3843],
        ...,
        [13828,   257,  6735,  ..., 22122,    13,   198],
        [ 3656,  1194,    26,  ...,   198,   198, 49275],
        [  455,   284,   198,  ...,   355,   300,  2086]], device='cuda:0') 

----------



In [18]:
vocab_size

50258

## Define Language Model

In [19]:
class Head(nn.Module):
    """ One head of self-attention """
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size))) #creates the triangular matrix of 1s that makes our mask
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x) # B,T,16 (head size)
        q = self.query(x) # B,T,16 (head size)
        # Now we have B, T, 16 tensors where B and T are in parallel - no communication
        wei = q @ k.transpose(-2, -1) * C**-0.5 # B,T,16 @ B,16,T ----> B,T,T - Weighted aggregation now is a function of the keys and queries of these 
                                                # Scaled attention to preserve variance
        wei = wei.masked_fill(self.tril[:T,:T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1) # B,T,T
        wei = self.dropout(wei)
        
        v = self.value(x) # what gets aggregated for the purpose of this single head
        out = wei @ v 
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList((Head(head_size) for _ in range(num_heads)))
        self.projection = nn.Linear(n_embd, n_embd) #linear transformation of the outcome of forward
        self.dropout = nn.Dropout(dropout)
        
    def forward(self,x):
        out = torch.cat([h(x) for h in self.heads], dim=-1) #concatenate outputs for as many heads as we want working in parallel
        out = self.dropout(self.projection(out))
        return out
class FeedForward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self,n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd), 
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd), # projection layer going back into residual pathway
            nn.Dropout(dropout)
        )
    
    def forward(self, x):
        return self.net(x) #makes it so that all tokens experience the activation functions independently
                           #self attention is the communication, and this feedforward is the opportunity for individual tokens to think on what they learned
    

class Block(nn.Module): 
    """ Transformer block: communication followed by computation """
    #intersperses comm. and comp.
    #communication done by the multiheaded self-attention and the computation done by the feedforward network on all tokens independently
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size) # n_head heads of head_size-dimensional self-attention
        self.ffwd = FeedForward(n_embd)
        self.layernorm1 = nn.LayerNorm(n_embd)
        self.layernorm2 = nn.LayerNorm(n_embd)

        
    def forward(self, x):
        #just adding blocks by themselves increases complexity / how deep the network is with basically the same results. Need to work on the optimization issues
        x = x + self.sa(self.layernorm1(x)) # 'x = x + 'allows for residual connections while layernorm 1 and 2 allow for normalized rows before self-attention and feedforward
        x = x + self.ffwd(self.layernorm2(x))
        
        
        return x
    


    
class BigramLanguageModel(nn.Module): #subclass of nn Module
    def __init__(self, vocab_size):
        super().__init__() #call parent's constructor
        # each token directly reads off the logits for the next token from lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd) # create vocab_size x vocab_size embedding table
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])   #intersperse communication and computation 3 times with the 3 blocks
        self.layernorm_final = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, index, targets=None):
        B, T = index.shape
        # print(torch.max(index))
        #index and targets are both (B,T) tensor of integers
        #when we pass an index here, every integer in our input will refer to the embedding table
        #and pluck out a corresponding row in the table according to its index
        #e.g. when we hand it 25 (the encoding of 'M'), it goes to row 25 in embedding table
        #then pytorch will arrange it into a Batch x Time x Channel tensor 
        # Hence, our logits end up as being 4 (batch_size for parallel processing) by 8 (# of context places / block_size) by 65 (vocab_size)
        #(B,T,C). Remember, logits are just log counts of a distribution
        #logits are our scores for the next character in the sequence
            
            
        tok_emb = self.token_embedding_table(index) #B,T,C
        pos_emb = self.position_embedding_table(torch.arange(T,device=device)) #T,C
        x = tok_emb + pos_emb # B,T,C
        x = self.blocks(x) #apply one head of self attention B,T,C
        x = self.layernorm_final(x)
        logits = self.lm_head(x) #B,T,vocab_size
        
        if targets is None:
            loss = None #because there's nothing to aim for
        
        else:
            #torch gets angry if we give it C in the third dimension so we need to reshape
            B, T, C = logits.shape
            logits = logits.view(B*T, C) # reshape into a 32 x 65 tensor
            targets = targets.view(B*T)
            #need to evaluate the loss function
            # let's use negative log likelihood / crossentropy
            loss = F.cross_entropy(logits,targets) # how well are we predicting next character based on the logits?
                                                   # ideally, the correct dimension (point at 4,2,45 for example) should have a very high number while others are low
        return logits, loss

    #Essentially a 'predict' function that 'generates' new 
     #index is (Batchsize,Time) tensor of integers in current context
                                                    #max_new_tokens is max number of tokens to generate (?)
    def generate(self, index, max_new_tokens: int):
        for _ in range(max_new_tokens):
            
            #crop index to the last block_size tokens
            index_cond = index[:, -block_size:] # helps with the positional encoding - don't feed info we don't have access to
            
            #get predictions
            logits, loss = self(index_cond) # calls forward
            #focus on only last time step (want the most trained version of the model)
            logits = logits[:, -1, :] 
            #apply softmax to get probabilities (exponentiate to approximate counts then get proportions to approximate probabilities)
            probs = F.softmax(logits, dim=-1) # B,C
            #sample from the distribution to get next character
            index_next = torch.multinomial(probs, num_samples=1) # B, 1, in each batch dimension we have a single prediction of what comes next
            #append sampled index to running sequence given current context of what we've predicted before
            index = torch.cat((index,index_next),dim=1) # B, T+1
        return index
    


## Instantiate Model

In [20]:
model = BigramLanguageModel(vocab_size)
m = model.to(device)



In [21]:
# print(decode(m.generate(index = torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=500)[0].tolist()))
#Gives us garbage because our current model is only looking at the last character bc it's a bigram model
#Also not trained

## Let's Optimize and Train the Model

In [22]:
# Create PyTorch optimizer
optimizer = torch.optim.Adam(m.parameters(), lr=learning_rate)

In [23]:
for steps in range(max_iters):
     # every once in a while evaluate the loss on train and val sets
    if steps % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {steps}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    #sample batch of data
    x_batch, y_batch = get_batch('train')
    #evaluate loss
    logits, loss = m(x_batch, y_batch)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

step 0: train loss 10.9655, val loss 10.9779
step 500: train loss 4.4067, val loss 5.0260
step 1000: train loss 3.8065, val loss 4.8241
step 1500: train loss 3.3984, val loss 4.7467
step 2000: train loss 2.9876, val loss 4.8476
step 2500: train loss 2.6160, val loss 4.9518
step 3000: train loss 2.2462, val loss 5.1589
step 3500: train loss 1.8781, val loss 5.3735


KeyboardInterrupt: 

In [25]:
print(decode(m.generate(index = torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=2000)[0].tolist()))


! heaven were not thy stone;
A pack up the head of pierce like death:
Nay, for less adieu; I'll this.
FRIAR LAURENCE: stand between
And stony entrance to thee where hast thou not pleased out.
What! the molehill for Rosaline canker change?

DUCHESS OF YORK:
Can sooniness, and rude, knee;
Not sleeping kill'd.

HENRY BOLINGBROKE:
What dreadful noise of him?

DUCHESS OF YORK:
Away, nor quarrel else?
As near, great as I love him that you on you;
I am too well in mercy:
Give me too horrible conceit presently,
And you can you beg of justice did displace:
Give me assurance with you in this fact,
And yet I do call it excellent.
Have you, brother being moved,
Who pass'd, good design, to his prince and himself to not before you!
NorLORD ROSS:
God will her Richard's eyes? will not do it so,

As I fear; therefore, not break the executioner wherein
Lest that this the air on thieves.
We do this break thine;
Or here we sto from the affection.

LORD WILLOUGHBY:
I would please the same is to speak
In hi

In [26]:
import os
torch.save(m.state_dict(), 'gpt-model-shakespeare-subword.pth')
torch.save(m.state_dict(), 'gpt-model-state-dict-shakespeare-subword.pth')

# Mathematical Trick in Self-Attention - 3 Methods
How Transformers differ from traditional LSTM / RNN models

In [None]:
torch.manual_seed(1337)
B,T,C = 4,8,2 #batch, time, channels
#Review Batch is for parallel processing, time is the amount of context, channels are the characters that can be predicted

#For predicting, information should only flow from previous context to current. It should not take from future tokens in training (e.g. In the word 'Hamburger,'
#it shouldn't use 'u' as influence for choosing 'b')

#The easiest way for tokens to communicate is to do an average of all the preceding elements
#You can then add that average of previous context as a feature vector what you already know
#Recognize that an average is a very weak/lossy way of summarizing info, but the principle of summarizing what you already know into
#a number that represents previous context is key
x = torch.randn(B,T,C) #fill 4,8,2 tensor with random numbers
x.shape

### Self-Attention Mask 1 - Brute Force

In [None]:
# We want x[batch, time] = mean_{i<=t} x[b,i]
x_bag_of_words = torch.zeros((B,T,C))
for b in range(B): #iterate over batch dimensions
    for t in range(T): #iterate over time
        x_prev = x[b,:t+1] #(t,C)
        x_bag_of_words[b,t] = torch.mean(x_prev, 0) #averaging out previous x's over time

print(x[0]) 

In [None]:
print(x_bag_of_words[0])
#each element is an average of prev elements in tensor

### Self-Attention Mask 2 - Matrix Multiplication

In [None]:
# matrix multiplication basics
torch.manual_seed(42)
a = torch.ones(3,3)
b = torch.randint(0,10,(3,2)).float()
c = a @ b
print('a=')
print(a)
print('--')
print('b=')
print(b)
print('--')
print('c=')
print(c)
print("Works by multiplying first row of a by first col of b and adding up (1*2 + 1*6 + 1*6 = 14), same thing for 16 (7+4+5) etc.")


In [None]:
# toy example illustrating how matrix multiplication can be used for a "weighted aggregation"
torch.manual_seed(42)
a = torch.tril(torch.ones(3,3)) # gives lower triangular part of matrix
#as you progress down the matrix, you progressively ignore 1 less element of b due to the growing # of 1s and shrinking number of 0s
a /= torch.sum(a, 1, keepdim=True)
#now we are able to average the sums going down rows because for each row we are essentially multiplying it by 1/row_num (which is an average)
b = torch.randint(0,10,(3,2)).float()
c = a @ b
print('a=')
print(a)
print('--')
print('b=')
print(b)
print('--')
print('c=')
print(c)


In [None]:
weights = torch.tril(torch.ones(T,T))
weights = weights / weights.sum(1, keepdim = True)
x_bag_of_words_2 = weights @ x # (B, T, T) @ (B, T, C) --> (B, T, C)
# xbow and xbow2 should be equal but this way is MUCH faster
# essentially we are doing weighted sums by using the triangular torch.tril so that we can only have the matrix access
# only the tokens preceding it

### Self-Attention Mask 3 - Using Softmax

In [None]:
tril = torch.tril(torch.ones(T,T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf')) #fill zeros with -inf
wei = F.softmax(wei, dim=-1) # exponentiate and average across each row -> end up with same matrix as the previous two methods
x_bag_of_words_3 = wei @ x
x_bag_of_words_3

# Self-Attention Decoder Block

In [None]:
torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn((B,T,C))

#emit a query and a key vector
#Query vector - what am I looking for
#Key vector - what do i contain
#obtain affinities between vectors by effectively doing a dot product between the keys and the queries
# e.g. A query dot products with all the keys and that dot product becomes wei

#Single head that performs self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x) # B,T,16 (head size)
q = query(x) # B,T,16 (head size)
# Now we have B, T, 16 tensors where B and T are in parallel - no communication
wei = q @ k.transpose(-2, -1) # B,T,16 @ B,16,T ----> B,T,T - Weighted aggregation now is a function of the keys and queries of these nodes


tril = torch.tril(torch.ones(T,T))
#wei = torch.zeros((T,T)) # don't want it to be all uniform because some tokens will have natural affinities for or against other tokens in the past (this currently doesn't do that)
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

v = value(x) # what gets aggregated for the purpose of this single head
out = wei @ v 
#out = wei @ x

out.shape

In [None]:
wei[0] #now wei is data dependent

### Notes
- Attention is a **communication mechanism.** Can be seen as nodes in a directed graph looking at each other and aggregating into a single value with a weighted sum from all nodes that point to them with data-dependent weights 
- There is no notion of space
- There is no communication across batch dimensions
- Attention block doesn't necessarily have to only communicate backwards (E.g. Token at Pos 5 will never look forward to Pos 6 to choose what token it should be). In some cases you may want to have all tokens talk to each other (e.g. sentiment analysis where future context words may shed a different light on previous ones). To create such an encoder block, all you have to do is delete wei = wei.masked_fill(tril == 0, float('-inf')), because that removes the mask of -inf and allows for aggregates at all Ts regardless of temporal position
    - What we have currently is called a decoder block
- There is also something called cross-attention. This block of code is called self-attention because the keys, queries, and values are all coming from the same source (x). However, there can be a case where your queries come from x but key and values come from another source. 
    - Called cross attention if there is a separate pool of nodes that we want to pool information from 
- Scaled self attention additionally divides wei by 1/sqrt(head_size) - this makes it so when input Query, Key are dot producted, the resulting variance will be preserved to approximately 1 as opposed to the order of head_size
    - Prevents softmax from being way too peaky because with multiplication it tends to optimize towards the max values, leading to high variance

# Layernorm Explanation

In [None]:
class LayerNorm1d: # (used to be BatchNorm1d)
    """ normalize rows to 1 stdev"""
    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

    def __call__(self, x):
        # calculate the forward pass
        xmean = x.mean(1, keepdim=True) # batch mean
        xvar = x.var(1, keepdim=True) # batch variance
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
        self.out = self.gamma * xhat + self.beta
        return self.out
  
    def parameters(self):
        return [self.gamma, self.beta]

torch.manual_seed(1337)
module = LayerNorm1d(100)
x = torch.randn(32, 100) # batch size 32 of 100-dimensional vectors
x = module(x)
x.shape

In [None]:
x[0,:].mean(), x[0,:].std() # the ROWS are normalized to stdev of 1 and mean of 0 (normal distribution)

# Generate Infinite Shakespeare

In [None]:
model2 = torch.load('gpt-model-shakespeare.pth')

In [None]:
model_output = decode(model2.generate(index = torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=2000)[0].tolist())

In [None]:
with open('shakespeare_output.txt', 'w', encoding='utf-8') as f:
    f.write(model_output)