## Import modules

In [1]:
import torch # lets use pytorch
import torch.nn as nn
from torch.nn import functional as F

## Hyperparameters

In [2]:
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200

#384 // 6 = 64 dimensional as standard for each head
n_embd = 384
n_head = 6

n_layer = 6
dropout = 0.2

## Import Data

In [1]:
with open('Data/little-lovecraft.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [2]:
print("Length of the dataset is ", len(text))

Length of the dataset is  4473648


#### First 1,000 chars

In [5]:
print(text[:1000])


High up, crowning the grassy summit of a swelling mound whose sides are wooded near the base
with the gnarled trees of the primeval forest, stands the old chateau of my ancestors. For centuries
its lofty battlements have frowned down upon the wild and rugged countryside about, serving
as a home and stronghold for the proud house whose honoured line is older even than the moss-grown
castle walls. These ancient turrets, stained by the storms of generations and crumbling under
the slow yet mighty pressure of time, formed in the ages of feudalism one of the most dreaded
and formidable fortresses in all France. From its machicolated parapets and mounted battlements
Barons, Counts, and even Kings had been defied, yet never had its spacious halls resounded to
the footsteps of the invader.
But since those glorious years all is changed. A poverty but little above the
level of dire want, together with a pride of name that forbids its alleviation by the pursuits
of commercial life, have prevented

### Find unique character and how many there are
Note that this is a character level tokenizer which is very simple and that in practice
people will use subword-level tokenization

In [6]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
vocab = ''.join(chars) 

print(f'This dataset includes the following characters:{vocab}\n')
print(f'This dataset has {vocab_size} unique characters')

This dataset includes the following characters:
 !#$&'()*,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]abcdefghijklmnopqrstuvwxyz| ¡¢£°·½¿ÁÅÆÉÑÓ×áâäåæçèéêëíîïñóôöúüāēœΝΟΠΣΥαγδηικλμνπςτωἀἐἱἶὀὁ–—‘’“”•′″

This dataset has 152 unique characters


## Tokenizer

In [7]:
# Create a mapping from chars to integers
stoi = {char:i for i,char in enumerate(chars)} #based on index in our sorted array of unique chars, assign number to each character in dictionary (for encoding)
itos =  {i:char for i,char in enumerate(chars)} #do the same thing but have index as key and char as value (for decoding)

encode = lambda s: [stoi[c] for c in s] # given string s, return an array of ints that pertain to each character
decode = lambda l: ''.join([itos[i] for i in l]) # given array of integers, decode into chars using itos and turn into string

print(encode("My name is Ethan"))
print(decode(encode("My name is Ethan")))




[39, 79, 1, 68, 55, 67, 59, 1, 63, 73, 1, 31, 74, 62, 55, 68]
My name is Ethan


In [8]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([4473648]) torch.int64
tensor([34, 63, 61, 62,  1, 75, 70, 10,  1, 57, 72, 69, 77, 68, 63, 68, 61,  1,
        74, 62, 59,  1, 61, 72, 55, 73, 73, 79,  1, 73, 75, 67, 67, 63, 74,  1,
        69, 60,  1, 55,  1, 73, 77, 59, 66, 66, 63, 68, 61,  1, 67, 69, 75, 68,
        58,  1, 77, 62, 69, 73, 59,  1, 73, 63, 58, 59, 73,  1, 55, 72, 59,  1,
        77, 69, 69, 58, 59, 58,  1, 68, 59, 55, 72,  1, 74, 62, 59,  1, 56, 55,
        73, 59,  0, 77, 63, 74, 62,  1, 74, 62, 59,  1, 61, 68, 55, 72, 66, 59,
        58,  1, 74, 72, 59, 59, 73,  1, 69, 60,  1, 74, 62, 59,  1, 70, 72, 63,
        67, 59, 76, 55, 66,  1, 60, 69, 72, 59, 73, 74, 10,  1, 73, 74, 55, 68,
        58, 73,  1, 74, 62, 59,  1, 69, 66, 58,  1, 57, 62, 55, 74, 59, 55, 75,
         1, 69, 60,  1, 67, 79,  1, 55, 68, 57, 59, 73, 74, 69, 72, 73, 12,  1,
        32, 69, 72,  1, 57, 59, 68, 74, 75, 72, 63, 59, 73,  0, 63, 74, 73,  1,
        66, 69, 60, 74, 79,  1, 56, 55, 74, 74, 66, 59, 67, 59, 68, 74, 73,  1,
      

## Separate into train and test validation split

In [9]:
n = int(0.9 * len(data)) # 90% train x 10% test split
train_data = data[:n]
val_data = data[n:]

## Set block size for chunking
train in chunks of n tokens at a time

In [10]:
#down the road we want the transformer to train at each subsequent token
#e.g. 18 -> 47, 18 + 47 -> 56, 18 + 47 + 56 -> 58, etc. (see next kernel)
train_data[:block_size+1]

tensor([34, 63, 61, 62,  1, 75, 70, 10,  1, 57, 72, 69, 77, 68, 63, 68, 61,  1,
        74, 62, 59,  1, 61, 72, 55, 73, 73, 79,  1, 73, 75, 67, 67, 63, 74,  1,
        69, 60,  1, 55,  1, 73, 77, 59, 66, 66, 63, 68, 61,  1, 67, 69, 75, 68,
        58,  1, 77, 62, 69, 73, 59,  1, 73, 63, 58, 59, 73,  1, 55, 72, 59,  1,
        77, 69, 69, 58, 59, 58,  1, 68, 59, 55, 72,  1, 74, 62, 59,  1, 56, 55,
        73, 59,  0, 77, 63, 74, 62,  1, 74, 62, 59,  1, 61, 68, 55, 72, 66, 59,
        58,  1, 74, 72, 59, 59, 73,  1, 69, 60,  1, 74, 62, 59,  1, 70, 72, 63,
        67, 59, 76, 55, 66,  1, 60, 69, 72, 59, 73, 74, 10,  1, 73, 74, 55, 68,
        58, 73,  1, 74, 62, 59,  1, 69, 66, 58,  1, 57, 62, 55, 74, 59, 55, 75,
         1, 69, 60,  1, 67, 79,  1, 55, 68, 57, 59, 73, 74, 69, 72, 73, 12,  1,
        32, 69, 72,  1, 57, 59, 68, 74, 75, 72, 63, 59, 73,  0, 63, 74, 73,  1,
        66, 69, 60, 74, 79,  1, 56, 55, 74, 74, 66, 59, 67, 59, 68, 74, 73,  1,
        62, 55, 76, 59,  1, 60, 72, 69, 

In [11]:
#training like this helps with computational efficiency but also to help expose the transformer
#to more context from 1 - blocksize. Needs to get used to seeing everything in between 1-block size
#Should also be noted that transformer will NEVER predict based on series of tokens > blocksize (only from 1-bs)

x = train_data[:block_size]
y = train_data[1:block_size + 1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f'when input is {context}, the target is: {target}')

when input is tensor([34]), the target is: 63
when input is tensor([34, 63]), the target is: 61
when input is tensor([34, 63, 61]), the target is: 62
when input is tensor([34, 63, 61, 62]), the target is: 1
when input is tensor([34, 63, 61, 62,  1]), the target is: 75
when input is tensor([34, 63, 61, 62,  1, 75]), the target is: 70
when input is tensor([34, 63, 61, 62,  1, 75, 70]), the target is: 10
when input is tensor([34, 63, 61, 62,  1, 75, 70, 10]), the target is: 1
when input is tensor([34, 63, 61, 62,  1, 75, 70, 10,  1]), the target is: 57
when input is tensor([34, 63, 61, 62,  1, 75, 70, 10,  1, 57]), the target is: 72
when input is tensor([34, 63, 61, 62,  1, 75, 70, 10,  1, 57, 72]), the target is: 69
when input is tensor([34, 63, 61, 62,  1, 75, 70, 10,  1, 57, 72, 69]), the target is: 77
when input is tensor([34, 63, 61, 62,  1, 75, 70, 10,  1, 57, 72, 69, 77]), the target is: 68
when input is tensor([34, 63, 61, 62,  1, 75, 70, 10,  1, 57, 72, 69, 77, 68]), the target i

## Batch Dimensions
Going to have chunks of text encodings stacked up in a single tensor for efficiency

In [12]:
torch.manual_seed(1337)
# batch_size = 4 #how many independent sequences will be processed at once
block_size = block_size

def get_batch(split: str):
    #generate small batch of data of inputs x with targets to predict y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,)) # generate batch_size-sized random indices to get data from
    
    x = torch.stack([data[i:i+block_size] for i in ix]) # for each randomly selected index, get the associated context (up to block size) in the data
    y = torch.stack([data[i+1:i+block_size+1] for i in ix]) # get offset by so x can try to predict it
    x, y = x.to(device), y.to(device)
    return x, y



@torch.no_grad() #don't update on backwards - more memory efficient
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean() #effectively average across many iterations of our training to smooth out loss
    model.train()
    return out


In [13]:
x_batch, y_batch = get_batch('train')
print('inputs:', x_batch.shape)
print(x_batch)

print('targets:', y_batch.shape)
print(y_batch, "\n\n----------\n")

# for batch in range(batch_size):
#     for time in range(block_size):
#         context = x_batch[batch,:time+1]
#         target = y_batch[batch,time]
#         print(f'When input is {context}, target is {target}.')

inputs: torch.Size([64, 256])
tensor([[58,  1, 63,  ..., 74, 59, 68],
        [ 8, 82, 12,  ..., 65, 68, 69],
        [ 1, 74, 69,  ..., 68, 73, 62],
        ...,
        [55, 68, 69,  ..., 74, 62,  1],
        [73, 59,  1,  ..., 59, 72, 55],
        [69,  1, 73,  ..., 79, 12,  1]], device='cuda:0')
targets: torch.Size([64, 256])
tensor([[ 1, 63, 58,  ..., 59, 68, 74],
        [82, 12, 82,  ..., 68, 69, 77],
        [74, 69,  1,  ..., 73, 62, 63],
        ...,
        [68, 69, 74,  ..., 62,  1, 66],
        [59,  1, 57,  ..., 72, 55, 61],
        [ 1, 73, 62,  ..., 12,  1, 29]], device='cuda:0') 

----------



## Define Language Model

In [14]:
class Head(nn.Module):
    """ One head of self-attention """
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size))) #creates the triangular matrix of 1s that makes our mask
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x) # B,T,16 (head size)
        q = self.query(x) # B,T,16 (head size)
        # Now we have B, T, 16 tensors where B and T are in parallel - no communication
        wei = q @ k.transpose(-2, -1) * C**-0.5 # B,T,16 @ B,16,T ----> B,T,T - Weighted aggregation now is a function of the keys and queries of these 
                                                # Scaled attention to preserve variance
        wei = wei.masked_fill(self.tril[:T,:T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1) # B,T,T
        wei = self.dropout(wei)
        
        v = self.value(x) # what gets aggregated for the purpose of this single head
        out = wei @ v 
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList((Head(head_size) for _ in range(num_heads)))
        self.projection = nn.Linear(n_embd, n_embd) #linear transformation of the outcome of forward
        self.dropout = nn.Dropout(dropout)
        
    def forward(self,x):
        out = torch.cat([h(x) for h in self.heads], dim=-1) #concatenate outputs for as many heads as we want working in parallel
        out = self.dropout(self.projection(out))
        return out
class FeedForward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self,n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd), 
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd), # projection layer going back into residual pathway
            nn.Dropout(dropout)
        )
    
    def forward(self, x):
        return self.net(x) #makes it so that all tokens experience the activation functions independently
                           #self attention is the communication, and this feedforward is the opportunity for individual tokens to think on what they learned
    

class Block(nn.Module): 
    """ Transformer block: communication followed by computation """
    #intersperses comm. and comp.
    #communication done by the multiheaded self-attention and the computation done by the feedforward network on all tokens independently
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size) # n_head heads of head_size-dimensional self-attention
        self.ffwd = FeedForward(n_embd)
        self.layernorm1 = nn.LayerNorm(n_embd)
        self.layernorm2 = nn.LayerNorm(n_embd)

        
    def forward(self, x):
        #just adding blocks by themselves increases complexity / how deep the network is with basically the same results. Need to work on the optimization issues
        x = x + self.sa(self.layernorm1(x)) # 'x = x + 'allows for residual connections while layernorm 1 and 2 allow for normalized rows before self-attention and feedforward
        x = x + self.ffwd(self.layernorm2(x))
        
        
        return x
    


    
class BigramLanguageModel(nn.Module): #subclass of nn Module
    def __init__(self, vocab_size):
        super().__init__() #call parent's constructor
        # each token directly reads off the logits for the next token from lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd) # create vocab_size x vocab_size embedding table
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])   #intersperse communication and computation 3 times with the 3 blocks
        self.layernorm_final = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, index, targets=None):
        B, T = index.shape
        
        #index and targets are both (B,T) tensor of integers
        #when we pass an index here, every integer in our input will refer to the embedding table
        #and pluck out a corresponding row in the table according to its index
        #e.g. when we hand it 25 (the encoding of 'M'), it goes to row 25 in embedding table
        #then pytorch will arrange it into a Batch x Time x Channel tensor 
        # Hence, our logits end up as being 4 (batch_size for parallel processing) by 8 (# of context places / block_size) by 65 (vocab_size)
        #(B,T,C). Remember, logits are just log counts of a distribution
        #logits are our scores for the next character in the sequence
            
            
        tok_emb = self.token_embedding_table(index) #B,T,C
        pos_emb = self.position_embedding_table(torch.arange(T,device=device)) #T,C
        x = tok_emb + pos_emb # B,T,C
        x = self.blocks(x) #apply one head of self attention B,T,C
        x = self.layernorm_final(x)
        logits = self.lm_head(x) #B,T,vocab_size
        
        if targets is None:
            loss = None #because there's nothing to aim for
        
        else:
            #torch gets angry if we give it C in the third dimension so we need to reshape
            B, T, C = logits.shape
            logits = logits.view(B*T, C) # reshape into a 32 x 65 tensor
            targets = targets.view(B*T)
            #need to evaluate the loss function
            # let's use negative log likelihood / crossentropy
            loss = F.cross_entropy(logits,targets) # how well are we predicting next character based on the logits?
                                                   # ideally, the correct dimension (point at 4,2,45 for example) should have a very high number while others are low
        return logits, loss

    #Essentially a 'predict' function that 'generates' new 
     #index is (Batchsize,Time) tensor of integers in current context
                                                    #max_new_tokens is max number of tokens to generate (?)
    def generate(self, index, max_new_tokens: int):
        for _ in range(max_new_tokens):
            
            #crop index to the last block_size tokens
            index_cond = index[:, -block_size:] # helps with the positional encoding - don't feed info we don't have access to
            
            #get predictions
            logits, loss = self(index_cond) # calls forward
            #focus on only last time step (want the most trained version of the model)
            logits = logits[:, -1, :] 
            #apply softmax to get probabilities (exponentiate to approximate counts then get proportions to approximate probabilities)
            probs = F.softmax(logits, dim=-1) # B,C
            #sample from the distribution to get next character
            index_next = torch.multinomial(probs, num_samples=1) # B, 1, in each batch dimension we have a single prediction of what comes next
            #append sampled index to running sequence given current context of what we've predicted before
            index = torch.cat((index,index_next),dim=1) # B, T+1
        return index
    


## Instantiate Model

In [15]:
model = BigramLanguageModel(vocab_size)
m = model.to(device)



In [16]:
#print(decode(m.generate(index = torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=500)[0].tolist()))
#Gives us garbage because our current model is only looking at the last character bc it's a bigram model
#Also not trained

## Let's Optimize and Train the Model

In [17]:
# Create PyTorch optimizer
optimizer = torch.optim.Adam(m.parameters(), lr=learning_rate)

In [18]:
for steps in range(max_iters):
     # every once in a while evaluate the loss on train and val sets
    if steps % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {steps}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    #sample batch of data
    x_batch, y_batch = get_batch('train')
    #evaluate loss
    logits, loss = m(x_batch, y_batch)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

step 0: train loss 5.1395, val loss 5.1429
step 500: train loss 2.1761, val loss 2.2958
step 1000: train loss 1.7503, val loss 1.9322
step 1500: train loss 1.5665, val loss 1.7890
step 2000: train loss 1.4514, val loss 1.6967
step 2500: train loss 1.3832, val loss 1.6496
step 3000: train loss 1.3338, val loss 1.6042
step 3500: train loss 1.2959, val loss 1.5783
step 4000: train loss 1.2684, val loss 1.5553
step 4500: train loss 1.2471, val loss 1.5587
1.2834861278533936


In [19]:
print(decode(m.generate(index = torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=2000)[0].tolist()))



which had led the visitors between faltered bfore than around
his ate world when the alley and maloned shocking of roddings were asleep craft.
Suddenly a solid and Gentilo no time with his perfuse again aeon-timidated unattedly void
his hand figure. The visitors along throng free and suggest McMman, and on the earrily
revolved an object, ravenous escaped. The sexton in road in the line torn the blackness were
ovorded to sense of his nonjecture aftering some likely that trages of utterly injustified marking
deaths.
He had been; since the small beheld dispelled you, and the partly librium of courtyard stories
and inter antedities; but his faculty that cosmicumber wimps he had spoked us, as however, and then
a Puth horrible feltation of troublescend—ivilogist, such was not read as a surpart, device-corrid,
welled allured by seadoning was nlight and ramparent, and had escaped me to find the body tessedit
beyond exist.
The night same—later cause was a sounds of one ship anciently dim befor

In [20]:
import os
torch.save(m, 'gpt-model-lovecraft.pth')
torch.save(m.state_dict(), 'gpt-model-state-dict-lovecraft.pth')

# Mathematical Trick in Self-Attention - 3 Methods
How Transformers differ from traditional LSTM / RNN models

In [30]:
torch.manual_seed(1337)
B,T,C = 4,8,2 #batch, time, channels
#Review Batch is for parallel processing, time is the amount of context, channels are the characters that can be predicted

#For predicting, information should only flow from previous context to current. It should not take from future tokens in training (e.g. In the word 'Hamburger,'
#it shouldn't use 'u' as influence for choosing 'b')

#The easiest way for tokens to communicate is to do an average of all the preceding elements
#You can then add that average of previous context as a feature vector what you already know
#Recognize that an average is a very weak/lossy way of summarizing info, but the principle of summarizing what you already know into
#a number that represents previous context is key
x = torch.randn(B,T,C) #fill 4,8,2 tensor with random numbers
x.shape

torch.Size([4, 8, 2])

### Self-Attention Mask 1 - Brute Force

In [20]:
# We want x[batch, time] = mean_{i<=t} x[b,i]
x_bag_of_words = torch.zeros((B,T,C))
for b in range(B): #iterate over batch dimensions
    for t in range(T): #iterate over time
        x_prev = x[b,:t+1] #(t,C)
        x_bag_of_words[b,t] = torch.mean(x_prev, 0) #averaging out previous x's over time

print(x[0]) 

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])


In [21]:
print(x_bag_of_words[0])
#each element is an average of prev elements in tensor

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])


### Self-Attention Mask 2 - Matrix Multiplication

In [22]:
# matrix multiplication basics
torch.manual_seed(42)
a = torch.ones(3,3)
b = torch.randint(0,10,(3,2)).float()
c = a @ b
print('a=')
print(a)
print('--')
print('b=')
print(b)
print('--')
print('c=')
print(c)
print("Works by multiplying first row of a by first col of b and adding up (1*2 + 1*6 + 1*6 = 14), same thing for 16 (7+4+5) etc.")


a=
tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[14., 16.],
        [14., 16.],
        [14., 16.]])
Works by multiplying first row of a by first col of b and adding up (1*2 + 1*6 + 1*6 = 14), same thing for 16 (7+4+5) etc.


In [23]:
# toy example illustrating how matrix multiplication can be used for a "weighted aggregation"
torch.manual_seed(42)
a = torch.tril(torch.ones(3,3)) # gives lower triangular part of matrix
#as you progress down the matrix, you progressively ignore 1 less element of b due to the growing # of 1s and shrinking number of 0s
a /= torch.sum(a, 1, keepdim=True)
#now we are able to average the sums going down rows because for each row we are essentially multiplying it by 1/row_num (which is an average)
b = torch.randint(0,10,(3,2)).float()
c = a @ b
print('a=')
print(a)
print('--')
print('b=')
print(b)
print('--')
print('c=')
print(c)


a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [24]:
weights = torch.tril(torch.ones(T,T))
weights = weights / weights.sum(1, keepdim = True)
x_bag_of_words_2 = weights @ x # (B, T, T) @ (B, T, C) --> (B, T, C)
# xbow and xbow2 should be equal but this way is MUCH faster
# essentially we are doing weighted sums by using the triangular torch.tril so that we can only have the matrix access
# only the tokens preceding it

### Self-Attention Mask 3 - Using Softmax

In [25]:
tril = torch.tril(torch.ones(T,T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf')) #fill zeros with -inf
wei = F.softmax(wei, dim=-1) # exponentiate and average across each row -> end up with same matrix as the previous two methods
x_bag_of_words_3 = wei @ x
x_bag_of_words_3

tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395],
         [ 0.2711,  0.4774],
         [ 0.2421,  0.0694],
         [ 0.0084,  0.0020],
         [ 0.0712, -0.1128],
         [ 0.2527,  0.2149]],

        [[-0.6631, -0.2513],
         [ 0.1735, -0.0649],
         [ 0.1685,  0.3348],
         [-0.1621,  0.1765],
         [-0.2312, -0.0436],
         [-0.1015, -0.2855],
         [-0.2593, -0.1630],
         [-0.3015, -0.2293]],

        [[ 1.6455, -0.8030],
         [ 1.4985, -0.5395],
         [ 0.4954,  0.3420],
         [ 1.0623, -0.1802],
         [ 1.1401, -0.4462],
         [ 1.0870, -0.4071],
         [ 1.0430, -0.1299],
         [ 1.1138, -0.1641]]])

# Self-Attention Decoder Block

In [39]:
torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn((B,T,C))

#emit a query and a key vector
#Query vector - what am I looking for
#Key vector - what do i contain
#obtain affinities between vectors by effectively doing a dot product between the keys and the queries
# e.g. A query dot products with all the keys and that dot product becomes wei

#Single head that performs self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x) # B,T,16 (head size)
q = query(x) # B,T,16 (head size)
# Now we have B, T, 16 tensors where B and T are in parallel - no communication
wei = q @ k.transpose(-2, -1) # B,T,16 @ B,16,T ----> B,T,T - Weighted aggregation now is a function of the keys and queries of these nodes


tril = torch.tril(torch.ones(T,T))
#wei = torch.zeros((T,T)) # don't want it to be all uniform because some tokens will have natural affinities for or against other tokens in the past (this currently doesn't do that)
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

v = value(x) # what gets aggregated for the purpose of this single head
out = wei @ v 
#out = wei @ x

out.shape

torch.Size([4, 8, 16])

In [40]:
wei[0] #now wei is data dependent

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

### Notes
- Attention is a **communication mechanism.** Can be seen as nodes in a directed graph looking at each other and aggregating into a single value with a weighted sum from all nodes that point to them with data-dependent weights 
- There is no notion of space
- There is no communication across batch dimensions
- Attention block doesn't necessarily have to only communicate backwards (E.g. Token at Pos 5 will never look forward to Pos 6 to choose what token it should be). In some cases you may want to have all tokens talk to each other (e.g. sentiment analysis where future context words may shed a different light on previous ones). To create such an encoder block, all you have to do is delete wei = wei.masked_fill(tril == 0, float('-inf')), because that removes the mask of -inf and allows for aggregates at all Ts regardless of temporal position
    - What we have currently is called a decoder block
- There is also something called cross-attention. This block of code is called self-attention because the keys, queries, and values are all coming from the same source (x). However, there can be a case where your queries come from x but key and values come from another source. 
    - Called cross attention if there is a separate pool of nodes that we want to pool information from 
- Scaled self attention additionally divides wei by 1/sqrt(head_size) - this makes it so when input Query, Key are dot producted, the resulting variance will be preserved to approximately 1 as opposed to the order of head_size
    - Prevents softmax from being way too peaky because with multiplication it tends to optimize towards the max values, leading to high variance

# Layernorm Explanation

In [19]:
class LayerNorm1d: # (used to be BatchNorm1d)
    """ normalize rows to 1 stdev"""
    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

    def __call__(self, x):
        # calculate the forward pass
        xmean = x.mean(1, keepdim=True) # batch mean
        xvar = x.var(1, keepdim=True) # batch variance
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
        self.out = self.gamma * xhat + self.beta
        return self.out
  
    def parameters(self):
        return [self.gamma, self.beta]

torch.manual_seed(1337)
module = LayerNorm1d(100)
x = torch.randn(32, 100) # batch size 32 of 100-dimensional vectors
x = module(x)
x.shape

torch.Size([32, 100])

In [23]:
x[0,:].mean(), x[0,:].std() # the ROWS are normalized to stdev of 1 and mean of 0 (normal distribution)

(tensor(-9.5367e-09), tensor(1.0000))

# Generate Infinite Lovecraft

In [35]:
model2 = torch.load('gpt-model-lovecraft.pth')

In [37]:
model_output = decode(model2.generate(index = torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=2000)[0].tolist())

In [39]:
with open('shakespeare_output.txt', 'w', encoding='utf-8') as f:
    f.write(model_output)