<a href="https://colab.research.google.com/github/AanchalA/MakeMore-with-Andrej-Karpathy/blob/main/NanoGPT_on_TinyShakespeare.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import random
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline

# HyperParameters

In [None]:
# HyperParameters

BLOCK_SIZE = 8                      ## Context Length: how many characters do we take to predict the next one?
BATCH_SIZE = 32                     ## Number of independent sequences processed in parallel per batch
MAX_ITERS = 3000                    ## Number of training steps (NOTE: the training-loop cell reassigns this before it runs)
EVAL_ITERS = 200                    ## Number of random batches averaged per split when estimating the loss
EVAL_INTERVAL = 300                 ## Estimate the train/val loss every EVAL_INTERVAL training steps
LEARNING_RATE = 1e-3                ## Actual Models usually lr=3e-4
NUM_EMBEDDING_DIMS = 32             ## Number of embedding dimensions
# NUM_HIDDEN_UNITS = 128            ## Number of Neurons in the Hidden Layer of the MLP

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'     ## Use the GPU when one is available, otherwise fall back to the CPU

# g = torch.Generator().manual_seed(2147483647)                   # for reproducibility
torch.manual_seed(1337);          ## Seed torch's global RNG for reproducibility; the trailing ';' suppresses the cell output

# Building the DataSet

In [None]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
! wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-02-21 05:11:37--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-02-21 05:11:38 (54.5 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [None]:
# Load the whole corpus into memory as a single string so we can inspect it
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

print(f"Length of dataset in characters: {len(text)} Characters")

Length of dataset in characters: 1115394 Characters


In [None]:
# let's look at an excerpt of the text: characters 365 up to (not including) 1000, i.e. NOT the first 1000 characters
print(text[365:1000])


All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.




In [None]:
# The vocabulary: every distinct character that occurs in the text, in sorted order
chars = sorted(set(text))
print(f"List of all unique characters: {''.join(chars)}")

vocab_size = len(chars)
print(f"\n{vocab_size=}")

List of all unique characters: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz

vocab_size=65


In [None]:
# Vocabulary lookup tables: itos maps integer -> character, stoi is its inverse (character -> integer)

itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}

## Text Encoder and Decoder

In [None]:
def encode(s):
    """Encoder: take a string, output the list of integers (one per character, looked up in stoi)."""
    return [stoi[ch] for ch in s]


def decode(l):
    """Decoder: take a list of integers, output the string they spell (looked up in itos)."""
    return ''.join(itos[ind] for ind in l)

In [None]:
print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


### Encoding the text

In [None]:
# let's now encode the entire text dataset and store it into a torch.tensor

data = torch.tensor(encode(text), dtype=torch.long)             ## torch.long is the 64-bit integer dtype (torch.int64)

print(data.shape, data.dtype)
print(data[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


## Train - Validation Split

In [None]:
# Hold out the tail of the corpus for validation: the first 90% is train, the rest is val

n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]

In [None]:
print(f"Train Data Size: {len(train_data)}")
print(f"Validation Data Size: {len(val_data)}")

Train Data Size: 1003854
Validation Data Size: 111540


## Data Batching

In [None]:
BLOCK_SIZE              # BLOCK_SIZE - what is the maximum context length for predictions?

8

In [None]:
train_data[: BLOCK_SIZE + 1]                ## A chunk of BLOCK_SIZE + 1 tokens packs BLOCK_SIZE training examples: for each t in 1..BLOCK_SIZE, the first t tokens are the context and token t + 1 is the target.

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [None]:
x = train_data[: BLOCK_SIZE]
y = train_data[1 : BLOCK_SIZE + 1]

for time_step in range(BLOCK_SIZE):
    context = x[: time_step + 1]
    target = y[time_step]
    print(f"When input is {context}, the target is: {target}")

When input is tensor([18]), the target is: 47
When input is tensor([18, 47]), the target is: 56
When input is tensor([18, 47, 56]), the target is: 57
When input is tensor([18, 47, 56, 57]), the target is: 58
When input is tensor([18, 47, 56, 57, 58]), the target is: 1
When input is tensor([18, 47, 56, 57, 58,  1]), the target is: 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]), the target is: 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target is: 58


In [None]:
BATCH_SIZE              # BATCH_SIZE - how many independent sequences will we process in parallel?

32

In [None]:
def get_batch(split):
    """Sample a random mini-batch of input/target sequences from one data split.

    Args:
        split: 'train' selects train_data; any other value selects val_data.

    Returns:
        (x, y): two (BATCH_SIZE, BLOCK_SIZE) integer tensors moved to DEVICE,
        where y is x shifted one position to the right in the source data.
    """
    source = train_data if split == 'train' else val_data

    # One random start offset into the split per sequence in the batch.
    starts = torch.randint(len(source) - BLOCK_SIZE, (BATCH_SIZE, ))

    # (BATCH_SIZE, BLOCK_SIZE) grid of positions: row r holds starts[r], starts[r] + 1, ...
    positions = starts.unsqueeze(1) + torch.arange(BLOCK_SIZE)

    inputs = source[positions]
    targets = source[positions + 1]          ## Targets are the inputs shifted by one token
    return inputs.to(DEVICE), targets.to(DEVICE)

In [None]:
xb, yb = get_batch('train')

print('inputs:\n')
print(f"SHAPE: {xb.shape}")
# print(f"DATA: {xb}")

print('\n', '-'*45, '\n')

print('targets:\n')
print(f"SHAPE: {yb.shape}")
# print(f"DATA: {yb}")

inputs:

SHAPE: torch.Size([32, 8])

 --------------------------------------------- 

targets:

SHAPE: torch.Size([32, 8])


In [None]:
for batch in range(BATCH_SIZE):                 ## Batch Dimension
    for time_step in range(BLOCK_SIZE):             ## Time Dimension - for each time_step in the context window.
        context = xb[batch, : time_step + 1]
        target = yb[batch, time_step]
        # print(f"When the context is {context.tolist()}, the target is: {target}")
# There will be a total of BATCH_SIZE * BLOCK_SIZE number of examples.

In [None]:
# Our input to the transformer
print(xb)

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54],
        [57, 43, 60, 43, 52,  1, 63, 43],
        [60, 43, 42,  8,  0, 25, 63,  1],
        [56, 42,  5, 57,  1, 57, 39, 49],
        [43, 57, 58, 63,  6,  1, 58, 46],
        [43,  1, 51, 39, 63,  1, 40, 43],
        [58, 46, 43,  1, 43, 39, 56, 57],
        [39, 58, 47, 53, 52, 12,  1, 37],
        [53, 56, 43,  1, 21,  1, 41, 39],
        [50, 39, 52, 63,  1, 47, 58, 57],
        [56, 53, 63,  1, 42, 47, 42,  1],
        [39, 51,  1, 39, 44, 56, 39, 47],
        [17, 24, 21, 38, 13, 14, 17, 32],
        [ 1, 39, 52, 42,  1, 45, 43, 50],
        [ 1, 58, 46, 39, 58,  1, 42, 53],
        [ 1, 61, 53, 59, 50, 42,  1, 21],
        [59, 57, 40, 39, 52, 42,  1, 40],
        [52, 42,  8,  0,  0, 23, 21, 26],
        [45, 53, 42, 57,  0, 23, 43, 43],
        [52,  1, 61, 39, 57,  1, 51, 53],
        [39, 49, 12,  1, 27,  1, 5

# Model Evaluation Function

In [None]:
@torch.no_grad()
def estimate_loss():
    """Estimate the model's mean loss on the train and validation splits.

    For each split, EVAL_ITERS random batches are drawn with get_batch and their
    losses averaged. The model is put in eval mode for the measurement and
    switched back to train mode before returning. Gradients are not tracked.

    Returns:
        dict mapping "train" and "val" to a scalar tensor holding the mean loss.
    """
    model.eval()
    mean_losses = {}
    for split in ("train", "val"):
        batch_losses = torch.zeros(EVAL_ITERS)
        for step in range(EVAL_ITERS):
            inputs, targets = get_batch(split)
            _, batch_loss = model(inputs, targets)
            batch_losses[step] = batch_loss.item()
        mean_losses[split] = batch_losses.mean()
    model.train()
    return mean_losses

# Bi-Gram Model

In [None]:
# PyTorch 2.0
# model = torch.compile(model)                    ## Model acceleration - needs an A100???
# Also look at the accelerate library - they have released a few updates that you can use.

In [None]:
class BigramLanguageModel(nn.Module):
    """Character-level language model: token embedding + position embedding -> linear head.

    Each position's logits depend only on the token at that position and on the
    position index itself; there is no mixing of information between positions.

    Args:
        num_tokens: vocabulary size. Defaults to the notebook-level ``vocab_size``.
        embedding_dims: embedding width. Defaults to ``NUM_EMBEDDING_DIMS``.
        block_size: maximum context length, i.e. the number of rows in the
            position-embedding table. Defaults to ``BLOCK_SIZE``.
    """

    def __init__(self, num_tokens=None, embedding_dims=None, block_size=None):
        super().__init__()
        # Fall back to the notebook hyperparameters when no explicit size is given.
        num_tokens = vocab_size if num_tokens is None else num_tokens
        embedding_dims = NUM_EMBEDDING_DIMS if embedding_dims is None else embedding_dims
        block_size = BLOCK_SIZE if block_size is None else block_size

        # Each token id is mapped to an embedding vector; the lm_head turns that vector into next-token logits.
        self.token_embedding_table = nn.Embedding(num_tokens, embedding_dims)               ## As the model learns, these embedding weights get updated.
        self.position_embedding_table = nn.Embedding(block_size, embedding_dims)            ## Each position in the context (from 0 to block_size - 1) has its own position vector
        self.lm_head = nn.Linear(embedding_dims, num_tokens)


    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of token ids, with T no larger than the size of the
                position-embedding table.
            targets: optional (B, T) tensor of target token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, vocab) and loss is None.
            With targets, logits is flattened to (B * T, vocab) and loss is a scalar tensor.
        """
        B, T = idx.shape

        # idx and targets are both (B, T) tensor of integers (B- Batch size, T- Time/ContextLength)
        token_embeddings = self.token_embedding_table(idx)        ## (B, T, C) -- (C - Number of channels, i.e. the embedding width).
        # Build the positions on the same device as the input rather than on a global device,
        # so the model works wherever its inputs and weights live.
        positions = torch.arange(T, device=idx.device)
        pos_embeddings = self.position_embedding_table(positions)           ## (T, C) - Embeddings for the integers 0 to T-1.
        x = token_embeddings + pos_embeddings                               ## (B, T, C) - pos_embeddings is broadcast over the batch
        logits = self.lm_head(x)                     ## (B, T, vocab_size). Logits - scores for the next character in the sequence.

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)                  ## 3 Dimensional --> 2 Dimensional: one row of class scores per (batch, time) position
            targets = targets.view(B * T)                   ## 2 Dimensional --> 1 Dimensional: one target id per (batch, time) position
            loss = F.cross_entropy(logits, targets)         ## cross_entropy --> negative log-likelihood of the target ids

        return logits, loss


    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Sample max_new_tokens new tokens, appending them one at a time to idx.

        Args:
            idx: (B, T) tensor of token ids holding the current context of every batch row.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            (B, T + max_new_tokens) tensor: the input context followed by the sampled tokens.
        """
        # The position-embedding table only has this many rows, so forward() cannot
        # take a longer sequence.
        max_context = self.position_embedding_table.num_embeddings

        for _ in range(max_new_tokens):

            # Crop the running sequence to the last max_context tokens; without this the
            # position lookup goes out of range as soon as the sequence outgrows the table.
            idx_cond = idx[:, -max_context:]

            # Get predictions from model
            logits, _unused_loss = self(idx_cond)                           ## Calling the forward() function

            # Focussing on only the last time-step
            logits = logits[:, -1, :]       ## (B, T, C) becomes (B, C): the last time step is the prediction for what comes next

            # Applying softmax to get probabilities
            probs = F.softmax(logits, dim=-1)       ## (B, C)

            # Sampling from the distribution (a random draw weighted by probs, not an argmax)
            idx_next = torch.multinomial(probs, num_samples=1)      ## (B, 1) - one new token per batch row

            # Appending sampled index to the running sequence.
            idx = torch.cat((idx, idx_next), dim=1)             ## (B, T + 1). Concatenating along the time dimension (dim=1)

        return idx

In [None]:
model = BigramLanguageModel()
model.to(DEVICE)

optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)           ## Actual Models usually lr=3e-4

In [None]:
logits, loss = model(xb, yb)
print(f"Logtis Shape: {logits.shape}")
print(f"{loss=}")

Logtis Shape: torch.Size([256, 65])
loss=tensor(4.4157, device='cuda:0', grad_fn=<NllLossBackward0>)


In [None]:
context = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)                 ## (1, 1) - one batch, context length = 1
print(decode(model.generate(context, max_new_tokens=400)[0].tolist()))


?qf;xbDkRZkNdc'wf,ZTkOLOn,eCtK

HqPjCkMBbAAU!:XaSvgO-33jMBF?gaTauhXFYVXJtpXeNuwqcBCxv.t?aF dXl!DZaAeWFwccHwyRWf,fDEZaYzxzrEom:
Yo3&$FmtofCiaIvB!!BV!$W;nd!lNxc
e3 ixYe-EYnkciK;lSq;HFtEZkoG EtSXMB;qWklG.YGZW.FeWjbm!pelJljnFAUVQv.t-hxD3qcdcpvDN:?SuO;MOie'XVUwty.OJlvBPUHI.cBm&pjY-lgvIEjVk:D:lqwJdlGMtS!klGoRW-SQAFQPdGCeIib3qI'TStC&lE$HZLETxgeF3QJ$FsLp-LB3:Ar-xT3H

epkO
mnvnrufW!A '
;;3;QDLWwm:f'E,Cey$f


In [None]:
MAX_ITERS = 10000                   ## NOTE: overrides the MAX_ITERS set in the hyperparameter cell for this training run

# The loop variable is named `step` rather than `iter`, so the builtin iter()
# is not shadowed by an int for the rest of the kernel session.
for step in range(MAX_ITERS):

    # Every once in a while evaluate the loss on train and val sets
    if step % EVAL_INTERVAL == 0:
        losses = estimate_loss()
        print(f"STEP {step}: Train Loss - {losses['train']:.4f} \t Validation Loss - {losses['val']:.4f}")

    # Sample a batch of data
    xb, yb = get_batch("train")

    # Forward pass, then one optimizer update
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)       ## Clear gradients left over from the previous step
    loss.backward()
    optimizer.step()

# Loss of the last training batch (a single noisy batch, not the averaged estimate printed above)
print(loss.item())

STEP 0: Train Loss - 4.3886 	 Validation Loss - 4.3734
STEP 300: Train Loss - 2.9031 	 Validation Loss - 2.9028
STEP 600: Train Loss - 2.6507 	 Validation Loss - 2.6722
STEP 900: Train Loss - 2.5712 	 Validation Loss - 2.5843
STEP 1200: Train Loss - 2.5505 	 Validation Loss - 2.5602
STEP 1500: Train Loss - 2.5195 	 Validation Loss - 2.5373
STEP 1800: Train Loss - 2.5092 	 Validation Loss - 2.5273
STEP 2100: Train Loss - 2.5053 	 Validation Loss - 2.5090
STEP 2400: Train Loss - 2.4930 	 Validation Loss - 2.5112
STEP 2700: Train Loss - 2.5030 	 Validation Loss - 2.5096
STEP 3000: Train Loss - 2.4877 	 Validation Loss - 2.5090
STEP 3300: Train Loss - 2.4922 	 Validation Loss - 2.4938
STEP 3600: Train Loss - 2.4760 	 Validation Loss - 2.5041
STEP 3900: Train Loss - 2.4885 	 Validation Loss - 2.5042
STEP 4200: Train Loss - 2.4822 	 Validation Loss - 2.4993
STEP 4500: Train Loss - 2.4729 	 Validation Loss - 2.4895
STEP 4800: Train Loss - 2.4711 	 Validation Loss - 2.4919
STEP 5100: Train Los

In [None]:
context = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)                 ## (1, 1) - one batch, context length = 1
print(decode(model.generate(context, max_new_tokens=400)[0].tolist()))


MPUESTh, af Pre?

WISo myouryoube!
KENoby ak
Sadsal thes ghesthidin cour ay aney Iry ts I fr t ce.
Jonghe nd, bemary.
Yor 'sour mend sora an hy t--pond betwe ten.
Sand thoware in s th llety od, wiourco ffepyotssththas l.
TAn.
Mourethal intherse ed Ped he movetour?
Cassce oros cok hedin tie s ind aus te fe f tas ny, ct CINovecest hes, n id, I fo, mo mane.

Anthataker aghercobun ws m k s withoumas F


# Self-Attention

In [None]:
# Information aggregation using an average, computed separately for every individual batch element
# Averaging the embeddings of all tokens up to time_step t of the context, for every time_step, aggregates all the information up to time_step t
# This is a very inefficient, lossy form of aggregation
# The aggregation may only use tokens up to the current time-step: a token cannot know what the future holds
# This is basically a weighted aggregation

## Version 1

In [None]:
B, T, C = 4, 8, 2               ## B - BATCH SIZE, T - CONTEXT LENGTH (TIME DIMENSION), C - CHANNEL SIZE (EMBEDDING SIZE)
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [None]:
# We want xbow[b, t] = mean_{i <= t} x[b, i]

xbow = torch.zeros((B, T, C))               ## BOW - Bag of words - said when putting averages together.

for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]                   ## (t+1, C). All embeddings up to and including the current time step
        xbow[b, t] = torch.mean(xprev, dim=0)       ## Average over the time dimension -> (C,)

In [None]:
x[0]

tensor([[ 0.0939, -1.0366],
        [ 2.3651, -1.1340],
        [ 0.1826,  0.4906],
        [ 0.4017, -0.3161],
        [-0.7491, -0.7988],
        [-1.3352, -1.3028],
        [ 0.0301,  1.5415],
        [ 0.4123, -0.1268]])

In [None]:
xbow[0]

tensor([[ 0.0939, -1.0366],
        [ 1.2295, -1.0853],
        [ 0.8805, -0.5600],
        [ 0.7608, -0.4990],
        [ 0.4588, -0.5590],
        [ 0.1598, -0.6830],
        [ 0.1413, -0.3652],
        [ 0.1752, -0.3354]])

## Version 2

In [None]:
# The aggregation process above can easily be achieved using matrix multiplication
# Using a lower-triangular matrix - ones on and below the diagonal, zeros above it
# Multiplying the embeddings by the lower-triangular matrix zeroes out the contribution of all the tokens after the current token
# Dividing every row by its sum turns each row into averaging weights, so we aggregate (average) embeddings up to the current timestep in the context.

torch.manual_seed(42)
a = torch.ones(3, 3)
a = torch.tril(a)
a = a / torch.sum(a, dim=1, keepdims=True)                          ## Normalize each row so that it sums to 1
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b                                                           ## Aggregation through matrix multiplication

print(f"a = \n{a}\n\n{'-'*30}\n")
print(f"b = \n{b}\n\n{'-'*30}\n")
print(f"c = \n{c}\n\n{'-'*30}\n")

a = 
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])

------------------------------

b = 
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])

------------------------------

c = 
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])

------------------------------



In [None]:
weight = torch.tril(torch.ones(T, T))                           ## Lower-triangular matrix of ones: row t selects positions 0..t
weight = weight / weight.sum(dim=1, keepdims=True)              ## Normalizing each row so that it sums to 1
xbow2 = weight @ x                                              ## Aggregation through matrix multiplication
                                                                ## weight (T, T) @ x (B, T, C): batched matrix multiplication ----> weight is broadcast over "B", i.e. (B, T, T) @ (B, T, C) ==> (B, T, C)
xbow2.shape

torch.Size([4, 8, 2])

In [None]:
torch.allclose(xbow, xbow2)         ## XBOW === XBOW2

True

In [None]:
xbow[0], xbow2[0]

(tensor([[ 0.0939, -1.0366],
         [ 1.2295, -1.0853],
         [ 0.8805, -0.5600],
         [ 0.7608, -0.4990],
         [ 0.4588, -0.5590],
         [ 0.1598, -0.6830],
         [ 0.1413, -0.3652],
         [ 0.1752, -0.3354]]),
 tensor([[ 0.0939, -1.0366],
         [ 1.2295, -1.0853],
         [ 0.8805, -0.5600],
         [ 0.7608, -0.4990],
         [ 0.4588, -0.5590],
         [ 0.1598, -0.6830],
         [ 0.1413, -0.3652],
         [ 0.1752, -0.3354]]))

## Version 3
- Using SoftMax()
- We use softmax in self-attention because the weights (i.e. the interaction strengths, or token affinities) begin at zero, and softmax normalizes each row of them into averaging weights.
- These weights, masked by the lower-triangular matrix, tell us how much of each token from the past we want to aggregate and average.
- By setting the positions where the lower-triangular matrix is 0 to "-inf", we are saying that tokens from the past cannot communicate with tokens from the future. No information will be aggregated from locations where the value is -inf.
- The affinities between tokens are going to be data-dependent rather than constantly zero: some tokens are going to find some other tokens more or less interesting depending on their values.

In [None]:
tril = torch.tril(torch.ones(T, T))                         ## Mask: 1 on and below the diagonal, 0 above it
weight = torch.zeros((T, T))                                ## Affinities all start at zero
weight = weight.masked_fill(tril==0, float('-inf'))         ## In weight, set every position where tril is 0 to -inf
weight = F.softmax(weight, dim=1)                           ## softmax - normalizes each row. exp(0)=1 and exp(-inf)=0, so masked positions get weight 0
xbow3 = weight @ x                                          ## Aggregation through matrix multiplication

In [None]:
torch.allclose(xbow, xbow3)                             ## xbow == xbow3

True

In [None]:
xbow[0], xbow3[0]

(tensor([[ 0.0939, -1.0366],
         [ 1.2295, -1.0853],
         [ 0.8805, -0.5600],
         [ 0.7608, -0.4990],
         [ 0.4588, -0.5590],
         [ 0.1598, -0.6830],
         [ 0.1413, -0.3652],
         [ 0.1752, -0.3354]]),
 tensor([[ 0.0939, -1.0366],
         [ 1.2295, -1.0853],
         [ 0.8805, -0.5600],
         [ 0.7608, -0.4990],
         [ 0.4588, -0.5590],
         [ 0.1598, -0.6830],
         [ 0.1413, -0.3652],
         [ 0.1752, -0.3354]]))

# Version 4: Self Attention
- Self attention implementation for a single head.

- https://www.youtube.com/watch?v=U0s0f995w14&ab_channel=AladdinPersson
- https://www.youtube.com/watch?v=pkVwUVEHmfI&ab_channel=AladdinPersson
- https://peterbloem.nl/blog/transformers
- https://jalammar.github.io/visualizing-neural-machine-translation-mechanics-of-seq2seq-models-with-attention/

In [None]:
# Scaled Dot Product Attention: Attention(Q, K, V) = softmax(Q * K^t / sqrt(dk)) * V
class SelfAttention(nn.Module):
    def __init__(self, num_heads, EMBEDDING_DIMS):
        pass