### A basic chatgpt-like language model

* Trained on tiny Shakespeare: 
    
    curl -o input.txt https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

* Character-level

* Mostly replicating this repo: https://github.com/karpathy/nanoGPT

Paper on transformers: https://arxiv.org/pdf/1706.03762.pdf
 GPT: Generative Pre-trained Transforme

In [14]:
%reset -f
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [314]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import seaborn as sns
import random
from typing import List

torch.manual_seed(1337)
random.seed(1337)

In [298]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

## Read Data

In [19]:
with open('input.txt', 'r') as f:
    text = f.read()

print(f"number of characters = {len(text):,}")

number of characters = 1,115,394


In [20]:
# first 200 characters of data
print(text[0:200])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


In [32]:
# unique characters in the dataset
chars = sorted(list(set(text)))
print(''.join(chars))
print(f"len of unique number of chars = {len(chars)}")


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
len of unique number of chars = 65


In [34]:
# defining constants
VOCAB_SIZE = len(chars)

VOCAB_SIZE

65

### Tokenization

https://platform.openai.com/tokenizer

In [60]:
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for ch, i in stoi.items()}

# encode function


def encode(s) -> List[int]:
    """Returns encoded version of input string according to stoi mappings.
    """
    return [stoi[ch] for ch in s]

# decode function


def decode(l) -> str:
    """Returns decode version of input list of characters according to itos mappings.
    """
    return ''.join([itos[i] for i in l])


print(encode('hello ali'))
print(decode([4, 53, 45, 23, 14]))
print(decode(encode('hello ali')))


# Note:
#   Practically, instead of tokenizing characters, We could character words, or sub-words. 
#   This would allow for a lot bigger parameter size (instead of 65), and potentially a lot better performance.
#   OpenAI tiktoken is a good example.

[46, 43, 50, 50, 53, 1, 39, 50, 47]
&ogKB
hello ali


In [64]:
# tokenizing the entire dataset
data = torch.tensor(encode(text))
data.shape

torch.Size([1115394])

In [65]:
print(data[0:100])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [66]:
# split data into train and validation dataset
n = int(0.9 * len(data))
train_data = data[:n]
valid_data = data[n:]

n, train_data.shape, valid_data.shape

(1003854, torch.Size([1003854]), torch.Size([111540]))

In [90]:
# defining block size: max len of data we use to predict the next character
# our final transformer can see one to BLOCK_SIZE number of characters to make the prediction for next
# it's max context length for prediction

BLOCK_SIZE = 8

# example
for i in range(BLOCK_SIZE + 1):
    print(f"{data[0:i].tolist()} is used to predict -> {data[i+1]}")

[] is used to predict -> 47
[18] is used to predict -> 56
[18, 47] is used to predict -> 57
[18, 47, 56] is used to predict -> 58
[18, 47, 56, 57] is used to predict -> 1
[18, 47, 56, 57, 58] is used to predict -> 15
[18, 47, 56, 57, 58, 1] is used to predict -> 47
[18, 47, 56, 57, 58, 1, 15] is used to predict -> 58
[18, 47, 56, 57, 58, 1, 15, 47] is used to predict -> 47


In [91]:
# this is called time dimension
data[0:BLOCK_SIZE]

tensor([18, 47, 56, 57, 58,  1, 15, 47])

In [92]:
# batch size (how many obs. we use in each iteration for optimization)
BATCH_SIZE = 4

In [299]:
def get_batch(split):
    data = train_data if split == 'train' else valid_data
    idx = torch.randint(0, len(data) - BLOCK_SIZE, (BATCH_SIZE, ))
    x = torch.stack([data[i    : i + BLOCK_SIZE    ] for i in idx])
    y = torch.stack([data[i + 1: i + BLOCK_SIZE + 1] for i in idx])
    x, y = x.to(device), y.to(device)
    return x, y

In [112]:
xb, yb = get_batch('train')

In [113]:
xb.shape, yb.shape

(torch.Size([4, 8]), torch.Size([4, 8]))

In [115]:
xb, yb

# very important: these are considered independent
# meaning that in the example below, we have 32 examples to train on

(tensor([[47, 64, 43, 52, 10,  0, 13, 56],
         [52, 43,  6,  0, 20, 39, 58, 46],
         [57, 46, 53, 59, 50, 42,  1, 46],
         [41, 39, 52,  1, 63, 47, 43, 50]]),
 tensor([[64, 43, 52, 10,  0, 13, 56, 43],
         [43,  6,  0, 20, 39, 58, 46,  1],
         [46, 53, 59, 50, 42,  1, 46, 39],
         [39, 52,  1, 63, 47, 43, 50, 42]]))

In [128]:
for b in range(BATCH_SIZE):
    for t in range(BLOCK_SIZE):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f"input = {context.tolist()} and output = {target}")

# these are input and output of our model which will be fed into transformer.
# the transformer will process these.

input = [47] and output = 64
input = [47, 64] and output = 43
input = [47, 64, 43] and output = 52
input = [47, 64, 43, 52] and output = 10
input = [47, 64, 43, 52, 10] and output = 0
input = [47, 64, 43, 52, 10, 0] and output = 13
input = [47, 64, 43, 52, 10, 0, 13] and output = 56
input = [47, 64, 43, 52, 10, 0, 13, 56] and output = 43
input = [52] and output = 43
input = [52, 43] and output = 6
input = [52, 43, 6] and output = 0
input = [52, 43, 6, 0] and output = 20
input = [52, 43, 6, 0, 20] and output = 39
input = [52, 43, 6, 0, 20, 39] and output = 58
input = [52, 43, 6, 0, 20, 39, 58] and output = 46
input = [52, 43, 6, 0, 20, 39, 58, 46] and output = 1
input = [57] and output = 46
input = [57, 46] and output = 53
input = [57, 46, 53] and output = 59
input = [57, 46, 53, 59] and output = 50
input = [57, 46, 53, 59, 50] and output = 42
input = [57, 46, 53, 59, 50, 42] and output = 1
input = [57, 46, 53, 59, 50, 42, 1] and output = 46
input = [57, 46, 53, 59, 50, 42, 1, 46] and o

In [130]:
VOCAB_SIZE

65

In [241]:
# first example: Bigram model

class BigramModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.vocab_size = vocab_size
        # this means given current character, we want to assign a probabiliy (score) to the next one
        # previous characters are not impacting our decision
        # each encoded character, will de mapped to VOCAB SIZE (usually 65) numbers.
        self.token_embedding_table = nn.Embedding(
            self.vocab_size, self.vocab_size)

    def forward(self, x, targets=None):
        """Defining forward pass.
        """
        # size: Batch (BATCH_SIZE) x Time (BLOCK_SIZE) x Context (VOCAB_SIZE)
        logits = self.token_embedding_table(x)

        if targets is None:
            loss = None
        else:
            # resize to (BATCH_SIZE * BLOCK_SIZE) * (VOCAB_SIZE)
            B, T, C = logits.shape
            logits = logits.view(B * T, C)            
            targets = targets.view(B * T)        
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, x, max_new_tokens):
        "Generate an instance from the model, for an input x (encoded) which is from a batch"
        # x has size of (B, T)        
        for i in range(max_new_tokens):
            # getting predictions
            logits, loss = self(x) # loss will be None
            # looking at the last character in the time dimension
            logits = logits[:, -1, :] # (B, C)
            # normalize using softmax to find probabilities
            probs = F.softmax(logits, dim=1)
            # predicting next character by sampling from probs
            x_pred = torch.multinomial(probs, num_samples=1) # (B, 1)
            x = torch.cat((x, x_pred), dim=1) # (B, T) -> (B, T + 1)
        return x

In [306]:
model = BigramModel(VOCAB_SIZE)
m = model.to(device)
logits, loss = m(xb, yb)

In [307]:
m(xb)

(tensor([[[-0.0411, -0.4302, -0.4431,  ..., -0.5473, -0.3686, -0.0830],
          [-1.0915, -0.8579, -0.4286,  ...,  1.1406, -0.3613,  0.3341],
          [-1.0915, -0.8579, -0.4286,  ...,  1.1406, -0.3613,  0.3341],
          ...,
          [-1.1176, -1.1180,  1.6360,  ...,  0.1288,  0.7505, -0.6367],
          [-0.4937, -0.3472,  1.0751,  ..., -0.1089, -0.2472,  0.5997],
          [-1.0915, -0.8579, -0.4286,  ...,  1.1406, -0.3613,  0.3341]],
 
         [[-0.0411, -0.4302, -0.4431,  ..., -0.5473, -0.3686, -0.0830],
          [-1.2101, -0.1748,  0.4630,  ...,  0.8997,  0.5066,  0.1376],
          [-1.1176, -1.1180,  1.6360,  ...,  0.1288,  0.7505, -0.6367],
          ...,
          [ 1.1768, -0.4620,  0.8542,  ...,  0.5255,  0.1622, -0.5660],
          [-0.1000, -1.3852,  2.1385,  ...,  1.1285,  0.3761, -0.8725],
          [-1.1176, -1.1180,  1.6360,  ...,  0.1288,  0.7505, -0.6367]],
 
         [[-0.1091, -0.5283,  0.6808,  ...,  0.0643,  1.4150,  1.1548],
          [-0.7981,  0.9894,

In [252]:
# we can start with this
itos[0]

'\n'

In [308]:
# tesing generate function (Batch = 1, Time = 1)
def generate_text(model, size):
    idx = torch.zeros((1, 1), dtype=torch.long, device=device) # initializing with 0, and making sure that we keep the type as long
    generated_text = decode(model.generate(x=idx, max_new_tokens=size)[0].tolist())
    return generated_text

generate_text(model=m, size=10)

'\nwCy.t-eb3v'

In [309]:
# Instead of using stochastic gradient, we can use Adam Oprimizer
optimizer = torch.optim.AdamW(m.parameters(), lr = 1e-3) # learning rate to be 0.001

In [310]:
EVAL_ITERS = 200

@torch.no_grad() # we don't need any grad calculation here
def estimate_loss():
    out = {}
    m.eval() # setting model into the evaluation mode (will matter if doing batch normalization, etc.)
    for split in ['train', 'valid']:
        losses = torch.zeros(EVAL_ITERS)
        for k in range(EVAL_ITERS):
            x_eval, y_eval = get_batch(split)
            logits, loss = m(x_eval, y_eval)
            losses[k] = loss.item()
        out[split] = losses.mean()
    m.train() # setting model back to training mode
    return out

In [312]:
BATCH_SIZE = 32
EVAL_INTERVAL = 1000

for step in range(10000):

    if (step % EVAL_INTERVAL == 0):
        losses = estimate_loss()
        print(f"step={step} | train loss = {losses['train']} | valid loss = {losses['valid']}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # loss calculation
    logits, loss = m(xb, yb)

    # optimization
    optimizer.zero_grad(set_to_none=True)  # zero grad
    loss.backward()
    optimizer.step()  # update parameters based on the loss

step=0 | train loss = 2.475977897644043 | valid loss = 2.4863996505737305
step=1000 | train loss = 2.4659485816955566 | valid loss = 2.4821033477783203
step=2000 | train loss = 2.4596831798553467 | valid loss = 2.4804158210754395
step=3000 | train loss = 2.459975481033325 | valid loss = 2.479810953140259
step=4000 | train loss = 2.4503164291381836 | valid loss = 2.4873416423797607
step=5000 | train loss = 2.4522807598114014 | valid loss = 2.476492404937744
step=6000 | train loss = 2.458245277404785 | valid loss = 2.482820749282837
step=7000 | train loss = 2.445924997329712 | valid loss = 2.488698959350586
step=8000 | train loss = 2.458848476409912 | valid loss = 2.4849352836608887
step=9000 | train loss = 2.459829330444336 | valid loss = 2.485933780670166


In [315]:
print(generate_text(model=m, size=200))


JOHilche; h co.
Coury?
BURor y, crd wo tarreror thindrariathitoth lll dlenjut, t hin t 's ve het
LAPUSes.

MBar, wiathoffravoue pe best t e thtoufoucive.
LOFaishe thy s rigley, geanuk-
Whandestharyo w
