<a href="https://colab.research.google.com/github/AlvinKimata/Transformers/blob/main/Transformers%20for%20NLP/GPT%20from%20scratch/GPT_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### This notebook demonstrates building and training a `generative pretrained transformer` from scratch in Pytorch. 

### The link to the tutorial can be found [here.](https://www.youtube.com/watch?v=kCc8FmEb1nY)

In [1]:
#Download the tiny shakespeare daaset.
!wget 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'

--2023-01-20 07:14:13--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2023-01-20 07:14:13 (20.3 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



In [2]:
#Inspect the dataset.
with open('input.txt', 'r', encoding = 'utf-8') as f:
  text = f.read()


print(f'Length of dataset in characters is: {len(text)}')

Length of dataset in characters is: 1115394


In [3]:
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [4]:
#Get the nuber of unique characters that occur in the text.
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))

print(f'\n Vocabulary size is: {vocab_size}')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz

 Vocabulary size is: 65


### Encode the text. 


In [5]:
#Create a mapping from characters sto integers.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s] #Encode: Take a string and output a list of integers.
decode = lambda l: ''.join([itos[i] for i in l]) #Decoder: Take a list of integers and output a string.

print(encode('hii there')) # [46, 47, 47, 1, 58, 46, 43, 56, 43]
print(decode(encode('hii there'))) # hii there

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [6]:
#Encode the entire text dataset and store it in a torch.Tensor

import torch
data = torch.tensor(encode(text), dtype = torch.long)
print(data.shape, data.dtype)

torch.Size([1115394]) torch.int64


In [7]:
print(data[:100])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [8]:
 #Split the data in train and test splits.
 n = int(0.9 * len(data))
 train_data = data[:n]
 val_data = data[n:]

In [9]:
block_size = 8 #context size.

x = train_data[:block_size]
y = train_data[1: block_size + 1]

for t in range(block_size):
  context = x[:t + 1]
  target = y[t]

  print(f'When input is {context} the target: {target}')

When input is tensor([18]) the target: 47
When input is tensor([18, 47]) the target: 56
When input is tensor([18, 47, 56]) the target: 57
When input is tensor([18, 47, 56, 57]) the target: 58
When input is tensor([18, 47, 56, 57, 58]) the target: 1
When input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [10]:
#Batch size.
#Set the seed for reproducibility.

torch.manual_seed(1337)
batch_size = 4

def get_batch(split):
  #Generate a small batch of data of inputs x and target y.
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i: i + block_size] for i in ix])
  y = torch.stack([data[i+1: i + block_size + 1] for i in ix])
  return x, y


xb, yb = get_batch('train')
print(f'Inputs: \n {xb.shape}')
print(f'Target: \n {yb.shape}')

print('-------------------')

for b in range(batch_size):
  print('\n')
  
  for t in range(batch_size):
    context = xb[b, :t + 1]
    target = yb[b, t]

    print(f'When input is {context.tolist()} the target: {target}')

Inputs: 
 torch.Size([4, 8])
Target: 
 torch.Size([4, 8])
-------------------


When input is [24] the target: 43
When input is [24, 43] the target: 58
When input is [24, 43, 58] the target: 5
When input is [24, 43, 58, 5] the target: 57


When input is [44] the target: 53
When input is [44, 53] the target: 56
When input is [44, 53, 56] the target: 1
When input is [44, 53, 56, 1] the target: 58


When input is [52] the target: 58
When input is [52, 58] the target: 1
When input is [52, 58, 1] the target: 58
When input is [52, 58, 1, 58] the target: 46


When input is [25] the target: 17
When input is [25, 17] the target: 27
When input is [25, 17, 27] the target: 10
When input is [25, 17, 27, 10] the target: 0


## Build the model.

In [11]:
import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
  def __init__(self, vocab_size):
    super().__init__()

    #Each token directly reads off the logits from the next token from a lookup table.
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets = None):
    logits = self.token_embedding_table(idx) #(batch, time, channel)

    if targets is None:
      loss = None
    else:
      #Reshape logits.
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B * T) #Reduce dimension by one.

      #Negative log-likelihood loss.
      loss = F.cross_entropy(logits, targets) #Measures quality of logits to the target. 

    return logits, loss

  def generate(self, idx, max_new_tokens):
    '''Generate function for the model.'''
    for _ in range(max_new_tokens):
      #Get the predictions.
      logits, loss = self(idx)

      #Focus only on the last timestep.
      logits = logits[:, -1, :]

      #Apply a softmax to get the probabilities.
      probs = F.softmax(logits, dim = -1)

      #Sample from the distribution.
      idx_next = torch.multinomial(probs, num_samples = 1)

      #Append sampled index to the running sequence.
      idx = torch.cat((idx, idx_next), dim = 1)

    return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)

print(logits.shape)
print(loss)

idx = torch.zeros((1, 1), dtype = torch.long)
print(decode(m.generate(idx, max_new_tokens = 100)[0].tolist()))

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [12]:
from tqdm import tqdm

In [13]:
#Create an optimizer object.
optimizer = torch.optim.AdamW(m.parameters(), lr = 1e-3)


#Training loop.
for steps in tqdm(range(100_000)):

  #Sample a batch of data.
  xb, yb = get_batch('train')

  #Evaluate the loss.
  logits, loss = m(xb, yb)
  optimizer.zero_grad(set_to_none = True)
  loss.backward()
  optimizer.step()

print(loss.item())

100%|██████████| 100000/100000 [01:07<00:00, 1487.95it/s]

2.591710329055786





In [14]:
print(decode(m.generate(idx, max_new_tokens = 500)[0].tolist()))


Frestt o rdin ent, iththar lincotheldwe erereshimungan? aghellie k'eshisundat, s midit soberay thenteviofou whasin, hy, sousha thoullowild, th mepoor; do p od ud mee t t by, ks ge my bl YO, s t mp thy,
Whe; pshiede CHawind ker'
ferrdstht che ie ay sesherind, pef h n tithedoffar

IAnouror irinole; V: m ser isomeredethe.
Wishyoumus,

Bisover iplild g f tharsure.
Notnd n'd kedom I'GBUCall;
Thayo whainwag thayofoll, d ck hedy d athale IXE idrsehicen.
PEOUMooy tildan wor ched beed aion sp wilmer?
Fiv


## Self attention workings.

In [30]:
B, T, C = 4, 8, 32 #batch, time, channels
x = torch.randn(B, T, C)

#A single head performing self-attention.
head_size = 16
key = nn.Linear(C, head_size, bias = False)
query = nn.Linear(C, head_size, bias = False)
value = nn.Linear(C, head_size, bias = False)

k = key(x) #(B, T, 16)
q = query(x) #(B, T, 16)
weights = q @ k.transpose(-2, -1)

tril = torch.tril(torch.ones(T, T))

weights = weights.masked_fill(tril == 0, float('-inf'))
weights = F.softmax(weights, dim = -1)

v = value(x)
out = weights @ v

out.shape

torch.Size([4, 8, 16])

In [29]:
weights[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0952, 0.9048, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1744, 0.4157, 0.4099, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1710, 0.2650, 0.5410, 0.0230, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3900, 0.1341, 0.3017, 0.0868, 0.0875, 0.0000, 0.0000, 0.0000],
        [0.0622, 0.1017, 0.0434, 0.0682, 0.2119, 0.5126, 0.0000, 0.0000],
        [0.2383, 0.2225, 0.2349, 0.0397, 0.0653, 0.0676, 0.1318, 0.0000],
        [0.2665, 0.0268, 0.0599, 0.0093, 0.1815, 0.0094, 0.1719, 0.2746]],
       grad_fn=<SelectBackward0>)

In [21]:
key, query

(Linear(in_features=32, out_features=16, bias=False),
 Linear(in_features=32, out_features=16, bias=False))