<a href="https://colab.research.google.com/github/FilipeSquire/LLM-from-Scratch-in-Python/blob/main/GPT_from_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CHATGPT from Scratch

This notebook is based on the content developed by Andrej Karpathy on his YouTube channel.

Its purpose is to code a Generatively Pretrained Transformer (GPT)-like model from scratch using Python.

In [1]:
# Downloading class dataset - shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-05-13 08:39:59--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-05-13 08:39:59 (42.3 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [2]:
# Read the whole corpus into memory as one string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
  text = corpus_file.read()

# Report the corpus size in characters.
print('Length ', len(text))

Length  1115394


In [3]:
# Peek at the first 100 characters to see what the raw text looks like.
preview = text[:100]
print(preview)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [4]:
# Collect the distinct characters of the text in sorted order; they form the vocabulary.
chars = sorted(set(text))
vocab_size = len(chars)
# Join first so the vocabulary prints as one string instead of a Python list.
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [5]:
# How can we tokenize?
# Tokenizing means converting the raw text (a string) into a sequence of integers.
# String -> integers is called encoding; integers -> string is called decoding.

# For that we need a lookup table in each direction.
stoi = {ch: i for i, ch in enumerate(chars)} # char to int
itos = {i: ch for i, ch in enumerate(chars)} # int to char

def encode(s):
  """Encoder: take a string, return the list of integer ids of its characters.

  Raises KeyError if a character is not in the vocabulary.
  """
  return [stoi[c] for c in s]

def decode(l):
  """Decoder: take an iterable of integer ids, return the string they spell.

  Raises KeyError if an id is not in the vocabulary.
  """
  return ''.join(itos[i] for i in l)

# Round trip: encoding and then decoding gives the original string back.
print(encode('test'))
print(decode(encode('test')))

[58, 43, 57, 58]
test


Future research:

* Google uses SentencePiece (it encodes sub-words)
* OpenAI uses tiktoken for this encoding and decoding job

In [6]:
# Turn the encoded text into a single tensor of integer token ids.

import torch
data = torch.tensor(encode(text), dtype=torch.int64)

print(data.size(), data.dtype)
print(data[0:100]) # the same opening characters we printed before, now as integer ids

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [7]:
# Split the data: the first 90% is used for training, the remaining 10% for validation.
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]

In [8]:
# We never feed the entire text into the transformer at once.
# Instead we work with small chunks of the dataset.

block_size = 8
# One chunk is block_size + 1 tokens long: block_size inputs plus the final target.
train_data[0:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [9]:
# A chunk of block_size + 1 tokens packs block_size training examples:
# every prefix of the chunk is a context, and the token that follows it is the target.

x = train_data[0:block_size]
y = train_data[1:block_size + 1]

for t in range(block_size):
  context, target = x[:t + 1], y[t]
  print(f'when input is {context} the target is {target}')

# the output shows the 8 examples hidden in a single chunk of 9 tokens

when input is tensor([18]) the target is 47
when input is tensor([18, 47]) the target is 56
when input is tensor([18, 47, 56]) the target is 57
when input is tensor([18, 47, 56, 57]) the target is 58
when input is tensor([18, 47, 56, 57, 58]) the target is 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is 58


In [10]:
# Besides the time dimension, batches add a second dimension to keep track of.

batch_size, block_size = 4, 8 # sequences processed in parallel, maximum context length for predictions
torch.manual_seed(1337) # seed the random number generator before batches are sampled

def get_batch(split):
  """Sample a random batch of input sequences and their targets.

  `split` selects the source: 'train' reads from train_data, any other value
  reads from val_data. Returns (x, y), each a stack of batch_size windows of
  block_size tokens; y is x shifted one position to the right.
  """
  source = train_data if split == 'train' else val_data
  # Random window starts; the upper bound leaves room for the shifted target window.
  starts = torch.randint(len(source) - block_size, (batch_size,))

  inputs = torch.stack([source[s:s + block_size] for s in starts])
  targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])

  return inputs, targets

# Draw one training batch and show the inputs next to their targets.
xb, yb = get_batch('train')
for label, batch in (('inputs:', xb), ('targets:', yb)):
  print(label)
  print(batch.shape)
  print(batch)

# Spell out every (context, target) pair contained in the batch.
for b in range(batch_size): # batch dimension
  row_x, row_y = xb[b], yb[b]
  for t in range(block_size): # time dimension
    context, target = row_x[:t + 1], row_y[t]
    print(f'when input is {context.tolist()} the target is {target}')

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
when input is [24] the target is 43
when input is [24, 43] the target is 58
when input is [24, 43, 58] the target is 5
when input is [24, 43, 58, 5] the target is 57
when input is [24, 43, 58, 5, 57] the target is 1
when input is [24, 43, 58, 5, 57, 1] the target is 46
when input is [24, 43, 58, 5, 57, 1, 46] the target is 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target is 39
when input is [44] the target is 53
when input is [44, 53] the target is 56
when input is [44, 53, 56] the target is 1
when input is [44, 53, 56, 1] the target is 58
when input is [44, 53, 56, 1, 58] the target is 46


In [11]:
# Show the input batch again; this is the tensor passed to the model in the next cell.
print(xb)

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


Research: BiGram Model

In [12]:
# Introducing the bigram language model

import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337) # seed the RNG before the model weights are initialised

class BigramLanguageModel(nn.Module):
  """Bigram character-level language model.

  Each token reads the logits for the next token straight out of a
  (vocab_size x vocab_size) lookup table, so a prediction depends only on
  the identity of the current token.
  """

  def __init__(self, vocab_size):
    """Create the lookup table; `vocab_size` is the number of distinct tokens."""
    super().__init__()
    # Row i of the table holds the next-token logits for token i.
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):
    """Compute next-token logits and, when targets are given, the loss.

    Args:
      idx: (B, T) tensor of integer token ids.
      targets: optional (B, T) tensor of integer token ids to predict.

    Returns:
      (logits, loss). Without targets, logits is (B, T, C) and loss is None.
      With targets, logits is flattened to (B*T, C) and loss is the
      cross-entropy (negative log likelihood) of the targets.
    """
    logits = self.token_embedding_table(idx) # (B, T, C) = batch, time, channel

    if targets is None:
      loss = None
    else:
      # F.cross_entropy wants the class dimension second, so instead of
      # permuting to (B, C, T) we flatten batch and time into one dimension.
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T) # targets must line up with the flattened logits

      loss = F.cross_entropy(logits, targets)

    return logits, loss

  @torch.no_grad() # sampling needs no gradients; avoids building an autograd graph per step
  def generate(self, idx, max_new_tokens):
    """Extend every sequence in `idx` by `max_new_tokens` sampled tokens.

    Args:
      idx: (B, T) tensor of integer token ids, the current context.
      max_new_tokens: number of tokens to append.

    Returns:
      (B, T + max_new_tokens) tensor: the context followed by the samples.
    """
    for _step in range(max_new_tokens):
      # get the predictions (no targets, so the loss is None and is ignored)
      logits, _ = self(idx)
      # focus only on the last time step
      logits = logits[:, -1, :] # (B, C)
      # apply softmax to get probabilities
      probs = F.softmax(logits, dim=-1) # (B, C)
      # sample the next token from the distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # append the sampled index to the running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)

    return idx

# Instantiate the model and run one forward pass on the sample batch.
m = BigramLanguageModel(vocab_size=vocab_size)
logs, loss = m(xb, targets=yb)
print(logs.size())
print(loss)

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)


The output below will be random because the model has not been trained

In [13]:
# Start from a 1x1 context holding token id 0 and let the model sample 100 more tokens.
idx = torch.zeros(1, 1, dtype=torch.long)
generated = m.generate(idx, max_new_tokens=100)
print(decode(generated[0].tolist()))


Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [14]:
# Build the PyTorch optimizer over the model's parameters.

optimizer = torch.optim.AdamW(params=m.parameters(), lr=1e-3)

Below we create a small training loop to check that the code works.

In [15]:
batch_size = 32 # larger batches than the 4 used in the demo above
for steps in range(10000):

  # draw a fresh batch of training data
  xb, yb = get_batch('train')

  # clear the old gradients, then run forward, backward and the parameter update
  optimizer.zero_grad(set_to_none=True)
  logits, loss = m(xb, yb)
  loss.backward()
  optimizer.step()

# loss on the last training batch
print(loss.item())

2.5727508068084717


In [16]:
# Sample again from a 1x1 context holding token id 0, now with the trained model.
idx = torch.zeros(1, 1, dtype=torch.long)
generated = m.generate(idx, max_new_tokens=100)
print(decode(generated[0].tolist()))


Iyoteng h hasbe pave pirance
Rie hicomyonthar's
Plinseard ith henoure wounonthioneir thondy, y helti


Yay! Checking again, it seems that the model has started making progress.

We need to make the tokens start talking to each other.

# The mathematics behind self-attention

In [17]:
# Toy example for the self-attention math.
#
# We want the tokens to talk to each other, but not with just any token:
# a token may only talk to the tokens before it, never to later ones,
# because those later tokens are exactly what we are going to predict.
torch.manual_seed(1337)
B = 4 # batch
T = 8 # time
C = 2 # channel

x = torch.randn((B, T, C))
x.size()

torch.Size([4, 8, 2])

In [18]:
# Version 1: for every token, average the vectors of all previous tokens
# and of the token itself, i.e. xbow[b,t] = mean_{i<=t} x[b,i]
torch.manual_seed(1337)
xbow = torch.zeros(B, T, C) # "bag of words" averaging
for b in range(B):
  for t in range(T):
    xprev = x[b, :t + 1] # (t+1, C)
    xbow[b, t] = xprev.mean(dim=0)
# The nested Python loops above compute a running average over each prefix.
# This is a costly and inefficient way to do it; the same result can be
# obtained much more efficiently with a matrix multiplication.
xbow

tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395],
         [ 0.2711,  0.4774],
         [ 0.2421,  0.0694],
         [ 0.0084,  0.0020],
         [ 0.0712, -0.1128],
         [ 0.2527,  0.2149]],

        [[-0.6631, -0.2513],
         [ 0.1735, -0.0649],
         [ 0.1685,  0.3348],
         [-0.1621,  0.1765],
         [-0.2312, -0.0436],
         [-0.1015, -0.2855],
         [-0.2593, -0.1630],
         [-0.3015, -0.2293]],

        [[ 1.6455, -0.8030],
         [ 1.4985, -0.5395],
         [ 0.4954,  0.3420],
         [ 1.0623, -0.1802],
         [ 1.1401, -0.4462],
         [ 1.0870, -0.4071],
         [ 1.0430, -0.1299],
         [ 1.1138, -0.1641]]])

In [19]:
# The trick: build a lower-triangular matrix of ones with as many columns as
# the matrix we multiply has rows, then divide every row by its sum so that
# the row holds the weight of each value. Multiplying by this matrix then
# averages the rows seen so far.
torch.manual_seed(42)
a = torch.ones(3, 3).tril()
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(low=0, high=10, size=(3, 2)).float()
c = torch.matmul(a, b)
for label, matrix in (('a', a), ('b', b), ('c', c)):
  print(label)
  print(matrix)

a
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
c
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [20]:
# Version 2: the same averaging applied to the toy tensor x
torch.manual_seed(1337)
wei = torch.ones(T, T).tril()
wei = wei / wei.sum(dim=1, keepdim=True)
xbow2 = torch.matmul(wei, x) # (T,T) @ (B,T,C): wei is broadcast over the batch ---> (B,T,C)

The reason for using softmax is that, although the weights all start at zero here, at some point the tokens will start interacting with each other and the weights will no longer be constant. We therefore need to make sure that the current and previous tokens cannot communicate with the future tokens.

In [21]:
# Version 3: the same weights obtained with softmax
torch.manual_seed(1337)
tril = torch.ones(T, T).tril()
# Start from all-zero scores and put -inf wherever tril is 0,
# so that softmax gives those positions a weight of zero.
wei = torch.zeros(T, T).masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = torch.matmul(wei, x)
xbow3

tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395],
         [ 0.2711,  0.4774],
         [ 0.2421,  0.0694],
         [ 0.0084,  0.0020],
         [ 0.0712, -0.1128],
         [ 0.2527,  0.2149]],

        [[-0.6631, -0.2513],
         [ 0.1735, -0.0649],
         [ 0.1685,  0.3348],
         [-0.1621,  0.1765],
         [-0.2312, -0.0436],
         [-0.1015, -0.2855],
         [-0.2593, -0.1630],
         [-0.3015, -0.2293]],

        [[ 1.6455, -0.8030],
         [ 1.4985, -0.5395],
         [ 0.4954,  0.3420],
         [ 1.0623, -0.1802],
         [ 1.1401, -0.4462],
         [ 1.0870, -0.4071],
         [ 1.0430, -0.1299],
         [ 1.1138, -0.1641]]])

# Code Update 3

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 16 # number of independent sequences processed in parallel
block_size = 32 # maximum context length for predictions
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu' # use the GPU when one is available
eval_iters = 200
n_embd = 32
n_head = 4
n_layer = 4
dropout = 0.0
# ------------

torch.manual_seed(1337)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# the vocabulary: every distinct character of the text, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
# lookup tables between characters and their integer ids
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))
encode = lambda s: [stoi[ch] for ch in s] # encoder: string in, list of integers out
decode = lambda l: ''.join(itos[i] for i in l) # decoder: list of integers in, string out


In [21]:
# Introducing the bigram language model

import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337) # seed the RNG before the model weights are initialised

class BigramLanguageModel(nn.Module):
  """Character-level language model with token and position embeddings.

  Each token id and each position is embedded into n_embd dimensions; the
  sum of both embeddings is projected to vocab_size logits by a linear head.
  Sizes are read from the module-level names vocab_size, n_embd and
  block_size.
  """

  def __init__(self):
    super().__init__()
    # each token id is mapped to an n_embd-dimensional vector
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    # each position 0..block_size-1 is mapped to an n_embd-dimensional vector
    # NOTE: the attribute keeps its original spelling ("embeddin") so that any
    # existing reference to it keeps working.
    self.position_embeddin_table = nn.Embedding(block_size, n_embd)
    # projects the embeddings to one logit per vocabulary entry
    self.lm_head = nn.Linear(n_embd, vocab_size)

  def forward(self, idx, targets=None):
    """Compute next-token logits and, when targets are given, the loss.

    Args:
      idx: (B, T) tensor of integer token ids; T must not exceed block_size,
        because positions are looked up in a table with block_size rows.
      targets: optional (B, T) tensor of integer token ids to predict.

    Returns:
      (logits, loss). Without targets, logits is (B, T, vocab_size) and loss
      is None. With targets, logits is flattened to (B*T, vocab_size) and
      loss is the cross-entropy (negative log likelihood) of the targets.
    """
    B, T = idx.shape

    tok_emb = self.token_embedding_table(idx) # (B, T, n_embd)
    # The position table was previously created but never used; add it here
    # so the model actually knows where each token sits in the context.
    pos_emb = self.position_embeddin_table(torch.arange(T, device=idx.device)) # (T, n_embd)
    x = tok_emb + pos_emb # (B, T, n_embd); pos_emb is broadcast over the batch
    logits = self.lm_head(x) # (B, T, vocab_size)

    if targets is None:
      loss = None
    else:
      # F.cross_entropy wants the class dimension second, so instead of
      # permuting to (B, C, T) we flatten batch and time into one dimension.
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T) # targets must line up with the flattened logits

      loss = F.cross_entropy(logits, targets)

    return logits, loss

  @torch.no_grad() # sampling needs no gradients; avoids building an autograd graph per step
  def generate(self, idx, max_new_tokens):
    """Extend every sequence in `idx` by `max_new_tokens` sampled tokens.

    Args:
      idx: (B, T) tensor of integer token ids, the current context.
      max_new_tokens: number of tokens to append.

    Returns:
      (B, T + max_new_tokens) tensor: the context followed by the samples.
    """
    for _step in range(max_new_tokens):
      # the position table only covers block_size positions, so feed the
      # model at most the last block_size tokens of the running sequence
      idx_cond = idx[:, -block_size:]
      # get the predictions (no targets, so the loss is None and is ignored)
      logits, _ = self(idx_cond)
      # focus only on the last time step
      logits = logits[:, -1, :] # (B, vocab_size)
      # apply softmax to get probabilities
      probs = F.softmax(logits, dim=-1) # (B, vocab_size)
      # sample the next token from the distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # append the sampled index to the running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)

    return idx

# Build the model (its sizes come from the hyperparameters) and run one forward pass on the current batch.
m = BigramLanguageModel()
logs, loss = m(xb, targets=yb)
print(logs.size())
print(loss)