In [1]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
import os
from urllib.request import urlretrieve

url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
urlretrieve(url, './input.txt') # create a text file named input

('./input.txt', <http.client.HTTPMessage at 0x2388fca72b0>)

In [2]:
# read it in to inspect it
with open('./input.txt','r', encoding='utf-8') as f :
    text = f.read()

In [3]:
# length of the text in characters
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [4]:
# Let's look at the first 1500 charcters
print(text[:1500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [5]:
# All the unique character that occur in the text
chars = sorted(list(set(text)))  # vocabulary
vocab_size = len(chars)

print(''.join(chars))
print(f'the number of unique characters in the text : {vocab_size}')  # nb of tokens


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
the number of unique characters in the text : 65


In [6]:
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }

# mapping from integers to characters
itos = { i:ch for i,ch in enumerate(chars) }

# chracter-level tokenizer
# encoder: takes a string, output a list of integers
encode = lambda s: [stoi[c] for c in s] 
# decoder: take a list of integers, output a string
decode = lambda l: ''.join([itos[i] for i in l]) 

print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [7]:
# let's now encode the entire text dataset and store it into a torch.Tensor
import torch
data = torch.tensor(encode(text), dtype= torch.long)
print(data.shape, data.dtype)

# the 1500 characters we looked at earlier that will go the GPT look like this
print(data[:1500])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56,  ..., 58, 53,  1])


In [8]:
# Let's now split up the data into train and validation sets
# first 90% will be train, rest val
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [9]:
# context length  ; time dimension
block_size = 8
train_data[:block_size]

tensor([18, 47, 56, 57, 58,  1, 15, 47])

In [10]:
x = train_data[:block_size]     # the input to the transformer
y = train_data[1:block_size+1]  # targets for each position

for t in range(block_size) :
    # In fact, each input example has 'block_size' examples in it
    context = x[:t+1]    # t+1 is exlusive
    target = y[t]
    
    print(f"when input is {context} the target: {target}")

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [11]:
torch.manual_seed(1337)  # for reproducibility
batch_size = 4 # B : how many independent sequences will we process in parallel?
block_size = 8 # T : what is the maximum context length for predictions?

def get_batch(split):
    """generate a small batch of data of inputs x and targets y"""
    
    data = train_data if split == 'train' else val_data
    
    # sampling random locations in the dataset to pull chunks from, between 0 and 'len(data) - block_size', shape(batch_size,0)
    ix = torch.randint(len(data) - block_size, (batch_size,))
    
    # stack the rows to become (batch_size,block_size) tensors
    x = torch.stack([data[i:i+block_size] for i in ix])     # we tack because data is a list; shape(B,T)
    y = torch.stack([data[i+1:i+block_size+1] for i in ix]) # because each example has multiple examples in it; shape(B,T)
    return x, y

xb, yb = get_batch('train')  # b for batch
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size): # batch dimension (row dim)
    for t in range(block_size): # time dimension (col dim)
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----
when input is [24] the target: 43
when input is [24, 43] the target: 58
when input is [24, 43, 58] the target: 5
when input is [24, 43, 58, 5] the target: 57
when input is [24, 43, 58, 5, 57] the target: 1
when input is [24, 43, 58, 5, 57, 1] the target: 46
when input is [24, 43, 58, 5, 57, 1, 46] the target: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target: 39
when input is [44] the target: 53
when input is [44, 53] the target: 56
when input is [44, 53, 56] the target: 1
when input is [44, 53, 56, 1] the target: 58
when input is [44, 53, 56, 1, 58] the target: 46
when input is [44, 53

In [12]:
# our batch of input to the transformer
print(xb) 

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


In [13]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()  # inherets additional attributes from nn.Module
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size) # shape(65,65)

    def forward(self, idx, targets=None):

        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C) : Batch, Time(block_size), Channels(vocab_size) embedding dimension

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)  # that's how pytorch's crossentropy treats the logits dim in crossentropy  
            targets = targets.view(B*T)   # reshape the targets (B,T)----> B*T
            loss = F.cross_entropy(logits, targets)  # loss

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """ idx is (B, T) array of indices in the current context in some batch.
        The job of generate is basically to take (B,T) and extend it to be (B,T+1), (B,T+2)....
        it basically continues the generation in all the Batch dimensions in the Time dimension. It will do
        that for  max_new_tokens"""
        
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)  # applies the forward method
            # focus only on the last time step 
            logits = logits[:, -1, :] # (B,T,C) becomes (B, C)
            # apply softmax on the last dimension (hence -1) to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C), remember that C here is vocab_size
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1) because from each row we sample 1
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

# Calculate the loss before training    
m = BigramLanguageModel(vocab_size)   # __init__ is applied : an embedding table (65,65) is created
logits, loss = m(xb, yb)              # forward is applied
print(f"Shape of logits after forward is applied : {logits.shape}")
print(loss)


# Generate 100 tokens without training
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))  

Shape of logits after forward is applied : torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


`idx = torch.zeros((1, 1),dtype=torch.long)` creates a torch tensor of shape(1,1) of type integers, with 0 in it, to kick off the generation. `[0]` to unplack the single batch (because we worked with 1 batch) dimension that exists, and we convert it to a list to feed it to `decode`. 

## Training the bigram model

In [14]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [15]:
batch_size = 32
for steps in range(1000): # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    
    # zeroing out all the gradients from the previous step
    optimizer.zero_grad(set_to_none=True)
    
    # getting the gradients for all the parameters
    loss.backward()
    
    # using those gradients to update our prameters
    optimizer.step()

# Print the final loss
print(loss.item())

3.721843719482422


In [16]:
# Let's generate 500 tokens after this first training
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


olylvLLko'TMyatyIoconxad.?-tNSqYPsx&bF.oiR;BD$dZBMZv'K f bRSmIKptRPly:AUC&$zLK,qUEy&Ay;ZxjKVhmrdagC-bTop-QJe.H?x
JGF&pwst-P sti.hlEsu;w:w a BG:tLhMk,epdhlay'sVzLq--ERwXUzDnq-bn czXxxI&V&Pynnl,s,Ioto!uvixwC-IJXElrgm C-.bcoCPJ
IMphsevhO AL!-K:AIkpre,
rPHEJUzV;P?uN3b?ohoRiBUENoV3B&jumNL;Aik,
xf -IEKROn JSyYWW?n 'ay;:weO'AqVzPyoiBL? seAX3Dot,iy.xyIcf r!!ul-Koi:x pZrAQly'v'a;vEzN
BwowKo'MBqF$PPFb
CjYX3beT,lZ qdda!wfgmJP
DUfNXmnQU mvcv?nlnQF$JUAAywNocd  bGSPyAlprNeQnq-GRSVUP.Ja!IBoDqfI&xJM AXEHV&DKvRS


## The mathematical trick in self-attention

In [17]:
# consider the following toy example:

torch.manual_seed(1337)
B,T,C = 4,8,2 # batch, time (context), channels (embedding_size)
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

These 8 tokens are not talking, to each other, and we want them to talk to each other, in such a way that token $t$ only communicates with with previous tokens and itself, and it should not communicate with tokens after it, because those are future tokens in the sequence, **no information come from the future because we're about to predict the future.** (masked attention for decoder-only transformer)

The easiest way for tokens to communicate is calculating the average. So for example, if I'm the 5th token $t_5$, I would like to take the channels that are information at my step (the 5th), but also the channels from the 4th step, 3rd...1st step ($t_4$, $t_3$, $t_2$, $t_1$), and I would like to average those up, and that would become sort of like a **feature vector that summarizes me in the context of my history.** 

Calculating the average is actually extremely weak and lossy form of interaction, where we lose a ton of information about the spatial arrangement of all those tokens, but that's ok for now, we'll see how to bring that information back later.

For now what we would like to do is : For every single batch element independently, for every t-th token in that sequence, we'd like to calculate the average of all the vectors in all the previous tokens and the current token ($i\leq t$).

In [18]:
# We want x[b,t] = mean_{i<=t} x[b,i] 
# bow : bag of words
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # shape(t,C)
        xbow[b,t] = torch.mean(xprev, 0)   # we average on the t (dim=0)

In [19]:
# first element of the batch 
x[0]  #(T,C)

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [20]:
xbow[0] #averging the x[0] on the t (T,C)

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

Each i-th row from `xbow[0]` is the average of the 0,1,...,i-th rows from `x[0]`

## Calculating the average using matrix multiplication for a weighted aggregation

In [21]:
# triangular lower portion
torch.tril(torch.ones(3, 3))

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [22]:
# toy example illustrating how matrix multiplication can be used for a "weighted aggregation"
torch.manual_seed(42)

# these 2 lines of a, help us calculate the averages 
a = torch.tril(torch.ones(3, 3))    # triangular lower portion
a = a / torch.sum(a, 1, keepdim=True)

b = torch.randint(0,10,(3,2)).float()
c = a @ b
print('a=')
print(a)
print('--')
print('b=')
print(b)
print('--')
print('c=')
print(c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


We use the tringular matrix so that each i-th row from `c`, is the average of rows 0,1,...i-th from `b`.

In [23]:
torch.tril(torch.ones(T, T))

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [24]:
# version 2: using matrix multiply for a weighted aggregation
# wei short for weights

wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)  # for averaging
xbow2 = wei @ x # (T, T) @ (B, T, C)-----Pytorch----->(B, T, T) @ (B, T, C) ----> (B, T, C)
#torch.allclose(xbow, xbow2)  # True

**Explanation of (B, T, T) @ (B, T, C) ----> (B, T, C) :**

In reality, it's **(T, T) @ (B, T, C)**, but Pytorch will come and see that these shapes are not the same, so it will create a batch dimension in (T, T), the mulitplication become **(B, T, T) @ (B, T, C)**. And `@` is a **batched matrix multiply**, and so it will apply this matrix multiplication in all the batch elements in **parallel** and **individually**. And then for each batch element there will be a **(T, T) @ (T, C)**, exaclty what we had above.

## Using Softmax

In [25]:
# version 3: use Softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T,T)) #(T,T)
# all the tril elements which are equal to zero make them equal to -inf
wei = wei.masked_fill(tril == 0, float('-inf')) #(T,T)
# exponentiate every element of wei and divide by their sum; the future cannot communicate with the past
wei = F.softmax(wei, dim=-1)  # dim =-1 refers to the last dimension of wei #(T,T)
print(wei.shape)
print(x.shape)
xbow3 = wei @ x  # (T,T)@(B,T,C)--->(B,T,T)@(B,T,C)--->(B,T,C)
print(xbow3.shape)
#torch.allclose(xbow, xbow3)  # True

torch.Size([8, 8])
torch.Size([4, 8, 2])
torch.Size([4, 8, 2])


## Self-Attention

In [26]:
# version 4: self-attention!
torch.manual_seed(1337)
B,T,C = 4,8,32 # batch, time, channels (we one batch of 4 examples, each example is composed of T tokens, each tokens is represented in C-dim embedding)
x = torch.randn(B,T,C)  #(B,T,C)

# let's see a single Head perform self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False) # dim_input=C, and dim_output=head_size
query = nn.Linear(C, head_size, bias=False) # bias=False means where just calculating a matrix multiplication
value = nn.Linear(C, head_size, bias=False)

# all the tokens in all the positions in (B,T) arrangements will produce (in parallel and independetly) a key and a query,
# no communication has happened yet
k = key(x)   # (B, T, 16)
q = query(x) # (B, T, 16)

# communication happens now, we transpose k in the 2nd and 3rd dimensions
# we're applying T so that the dimensiosn of q and k allign
wei =  q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) ---> (B, T, T)

# to avoid the communication with the future : Masked self-attention
tril = torch.tril(torch.ones(T, T))
#wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))

# Calculate similiraty scores (weights)
wei = F.softmax(wei, dim=-1)  #(T,T)

# Calculate attention scores
v = value(x)   # (B, T, 16)
out = wei @ v  # (T,T)@(B, T, 16)-->(B,T,T)@(B,T,16)------->(B,T,16)
#out = wei @ x

out.shape

torch.Size([4, 8, 16])

In [27]:
# Represents the similarity scores in the first example in the batch
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

Notes:
- Attention is a **communication mechanism**, it can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.

- There is no notion of space. Attention simply acts over a set of vectors. **This is why we need to positionally encode tokens.**

- Each example across batch dimension is of course processed completely independently and never "talk" to each other

- In an **encoder attention block**, we just delete the single line that does masking with `tril`, allowing all tokens to communicate. This block here is called a **decoder attention block** because it has **triangular masking**, and is usually used in autoregressive settings, like language modeling.

- **Self-attention** just means that the keys and values are produced from the same source as queries. In **cross-attention**, the queries still get produced from `x`, but the keys and values come from some other, external source (e.g. an encoder module)

- **Scaled attention additional** divides `wei` by $1/\sqrt(HeadSize)$. This makes it so when input Q,K are unit variance ($Var(Q)=Var(K)=1$), `wei` will be unit variance too and Softmax will stay diffuse and not saturate too much. Illustration below.

In [28]:
k = torch.randn(B,T,head_size)
q = torch.randn(B,T,head_size)
wei = q @ k.transpose(-2, -1) * head_size**-0.5

In [29]:
k.var(), q.var(), wei.var()

(tensor(1.0449), tensor(1.0700), tensor(1.0918))

In [30]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [31]:
print(torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])*8, dim=-1))
print(torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])*10, dim=-1))
print(torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])*15, dim=-1))
print(torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])*100, dim=-1))

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])
tensor([1.5851e-02, 7.8918e-04, 1.1713e-01, 7.8918e-04, 8.6545e-01])
tensor([2.3555e-03, 2.6167e-05, 4.7312e-02, 2.6167e-05, 9.5028e-01])
tensor([4.2484e-18, 3.9754e-31, 2.0612e-09, 3.9754e-31, 1.0000e+00])


**Remark :** When the variance diverges from 1 the softmax gets too peaky, converges to one-hot as seen in the example above.

## Normalization 

The normalization layer normalize the output of the attention mechanisms to stabilize training.

In [32]:
class LayerNorm1d:
    # (used to be BatchNorm1d)  # identical to layernorm in pytorch
    # Nomalizing rows instead of columns (like in the "BatchNorm1d" class in part 3 from the makemore code (part 3))
    def __init__(self, dim, eps=1e-5, momentum=0.1):  
        self.eps = eps  # epsilon term in the normalization
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

    def __call__(self, x):
        # calculate the forward pass
        xmean = x.mean(1, keepdim=True) # batch mean ; shape(x)
        xvar = x.var(1, keepdim=True) # batch variance ; shape(x)
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance (B,T,C)
        self.out = self.gamma * xhat + self.beta # same shape(x)
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]  # trainable parameters

torch.manual_seed(1337)
module = LayerNorm1d(100)
x = torch.randn(32, 100) # batch size 32 of 100-dimensional vectors
x = module(x)
x.shape

torch.Size([32, 100])

In [33]:
x[:,0].mean(), x[:,0].std() # mean,std of one feature across all batch inputs (not normalized) (1st column)

(tensor(0.1469), tensor(0.8803))

In [34]:
x[0,:].mean(), x[0,:].std() # mean,std of a single input from the batch, of its features (normalized) (1st row)

(tensor(-9.5367e-09), tensor(1.0000))

In [35]:
# French to English translation example:
# Encoder-decoder tranformer
# <--------- ENCODE ------------------><--------------- DECODE ----------------->
# les réseaux de neurones sont fantastiques! <START> neural networks are fantastic!<END>


## Full finished code, for reference

In [36]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 5000 # max number of iterations in training
eval_interval = 100  # every once in a while (intervals of size eval_interval) evaluate the loss on train and val sets
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu' # The ability to run on a GPU if you have it : use cuda instead of cpu, for faster computations 
eval_iters = 200
n_embd = 64   # embedding dimension (C)
n_head = 4
n_layer = 4   # 4 layers
dropout = 0.0 # regularization
# ------------

torch.manual_seed(1337)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)  # if cuda is used, we move the data to device
    return x, y

@torch.no_grad() # this context manager torch.no_grad tells Pytroch that everything that happens inside the function "estimate_loss", we will not call .backward() on.
def estimate_loss():
    """it averages up the loss over multiple batches"""
    
    out = {} # empty dict for losses for each iteration in the evaluation
    model.eval() # setting the model into evaluation phase
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y) # outputs of .forward method in "BigramLanguageModel" class 
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train() # resetting the model into training phase
    return out

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__() # inhereting additional attributes from "nn.Module"
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout) # regularization

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities") : No communication with the future
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T) # head_size**-0.5 for scaling
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei) # regularization
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout) # regularization

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)  # concat over the channel dimension
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout), # regularization
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)   # communication
        self.ffwd = FeedFoward(n_embd)                    # computation
        self.ln1 = nn.LayerNorm(n_embd)                   # normalization
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))    # communication
        x = x + self.ffwd(self.ln2(x))  # computation
        return x

# simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond) # calls the "forward" method
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = BigramLanguageModel()
m = model.to(device)   # if cuda is used, it moves the model parameters to device= cude
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'Million parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device) # create on the device when creating the context that feeds into "generate"
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))

0.209729 Million parameters
step 0: train loss 4.4116, val loss 4.4022
step 100: train loss 2.6568, val loss 2.6670
step 200: train loss 2.5091, val loss 2.5059
step 300: train loss 2.4193, val loss 2.4334
step 400: train loss 2.3502, val loss 2.3562
step 500: train loss 2.2967, val loss 2.3132
step 600: train loss 2.2408, val loss 2.2499
step 700: train loss 2.2054, val loss 2.2188
step 800: train loss 2.1638, val loss 2.1869
step 900: train loss 2.1246, val loss 2.1511
step 1000: train loss 2.1035, val loss 2.1311
step 1100: train loss 2.0704, val loss 2.1190
step 1200: train loss 2.0383, val loss 2.0801
step 1300: train loss 2.0257, val loss 2.0651
step 1400: train loss 1.9924, val loss 2.0374
step 1500: train loss 1.9694, val loss 2.0288
step 1600: train loss 1.9632, val loss 2.0486
step 1700: train loss 1.9408, val loss 2.0135
step 1800: train loss 1.9085, val loss 1.9942
step 1900: train loss 1.9096, val loss 1.9889
step 2000: train loss 1.8855, val loss 1.9945
step 2100: train l

# Remark : Why do we sample from a multinomial distribution  ?

Using the maximum probability token at each step during generation, also known as **greedy decoding**, can lead to suboptimal results in certain scenarios. Here's why sampling from the multinomial distribution is preferred over greedy decoding:

* **Diversity:** Greedy decoding tends to produce repetitive or deterministic outputs because it always selects the token with the highest probability. This can result in generated sequences lacking diversity and variety. Sampling from the multinomial distribution allows for randomness in token selection, introducing diversity in the generated outputs.

* **Exploration:** Sampling encourages the model to explore different possibilities in the output space. By considering tokens with lower probabilities, the model has the opportunity to generate alternative sequences that may not be immediately apparent based on the highest probability tokens alone. This exploration can lead to more creative and diverse outputs.

* **Handling Uncertainty:** In some cases, the model may have high uncertainty about the next token to generate, with multiple tokens having relatively high probabilities. Sampling allows the model to express this uncertainty by considering multiple tokens with non-negligible probabilities, rather than committing to a single token deterministically.

* **Avoiding Mode Collapse:** Greedy decoding is susceptible to mode collapse, where the model repeatedly generates similar or identical sequences. Sampling helps mitigate this issue by introducing randomness into the generation process, preventing the model from getting stuck in repetitive patterns.

Overall, sampling from the multinomial distribution provides a more flexible and exploratory approach to sequence generation, allowing the model to produce diverse and varied outputs while handling uncertainty and avoiding mode collapse.