### A basic chatgpt-like language model

* Trained on tiny Shakespeare: 
    
    curl -o input.txt https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

* Character-level

* Mostly replicating this repo: https://github.com/karpathy/nanoGPT

Paper on transformers: https://arxiv.org/pdf/1706.03762.pdf
 GPT: Generative Pre-trained Transforme

In [14]:
%reset -f
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [129]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import seaborn as sns
import random
from typing import List

torch.manual_seed(1337)
random.seed(1337)

## Read Data

In [19]:
with open('input.txt', 'r') as f:
    text = f.read()

print(f"number of characters = {len(text):,}")

number of characters = 1,115,394


In [20]:
# first 200 characters of data
print(text[0:200])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


In [32]:
# unique characters in the dataset
chars = sorted(list(set(text)))
print(''.join(chars))
print(f"len of unique number of chars = {len(chars)}")


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
len of unique number of chars = 65


In [34]:
# defining constants
VOCAB_SIZE = len(chars)

VOCAB_SIZE

65

### Tokenization

https://platform.openai.com/tokenizer

In [60]:
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for ch, i in stoi.items()}

# encode function


def encode(s) -> List[int]:
    """Returns encoded version of input string according to stoi mappings.
    """
    return [stoi[ch] for ch in s]

# decode function


def decode(l) -> str:
    """Returns decode version of input list of characters according to itos mappings.
    """
    return ''.join([itos[i] for i in l])


print(encode('hello ali'))
print(decode([4, 53, 45, 23, 14]))
print(decode(encode('hello ali')))


# Note:
#   Practically, instead of tokenizing characters, We could character words, or sub-words. 
#   This would allow for a lot bigger parameter size (instead of 65), and potentially a lot better performance.
#   OpenAI tiktoken is a good example.

[46, 43, 50, 50, 53, 1, 39, 50, 47]
&ogKB
hello ali


In [64]:
# tokenizing the entire dataset
data = torch.tensor(encode(text))
data.shape

torch.Size([1115394])

In [65]:
print(data[0:100])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [66]:
# split data into train and validation dataset
n = int(0.9 * len(data))
train_data = data[:n]
valid_data = data[n:]

n, train_data.shape, valid_data.shape

(1003854, torch.Size([1003854]), torch.Size([111540]))

In [90]:
# defining block size: max len of data we use to predict the next character
# our final transformer can see one to BLOCK_SIZE number of characters to make the prediction for next
# it's max context length for prediction

BLOCK_SIZE = 8

# example
for i in range(BLOCK_SIZE + 1):
    print(f"{data[0:i].tolist()} is used to predict -> {data[i+1]}")

[] is used to predict -> 47
[18] is used to predict -> 56
[18, 47] is used to predict -> 57
[18, 47, 56] is used to predict -> 58
[18, 47, 56, 57] is used to predict -> 1
[18, 47, 56, 57, 58] is used to predict -> 15
[18, 47, 56, 57, 58, 1] is used to predict -> 47
[18, 47, 56, 57, 58, 1, 15] is used to predict -> 58
[18, 47, 56, 57, 58, 1, 15, 47] is used to predict -> 47


In [91]:
# this is called time dimension
data[0:BLOCK_SIZE]

tensor([18, 47, 56, 57, 58,  1, 15, 47])

In [92]:
# batch size (how many obs. we use in each iteration for optimization)
BATCH_SIZE = 4

In [111]:
def get_batch(split):
    data = train_data if split == 'train' else valid_data
    idx = torch.randint(0, len(data) - BLOCK_SIZE, (BATCH_SIZE, ))
    x = torch.stack([data[i    : i + BLOCK_SIZE    ] for i in idx])
    y = torch.stack([data[i + 1: i + BLOCK_SIZE + 1] for i in idx])
    return x, y

In [112]:
xb, yb = get_batch('train')

In [113]:
xb.shape, yb.shape

(torch.Size([4, 8]), torch.Size([4, 8]))

In [115]:
xb, yb

# very important: these are considered independent
# meaning that in the example below, we have 32 examples to train on

(tensor([[47, 64, 43, 52, 10,  0, 13, 56],
         [52, 43,  6,  0, 20, 39, 58, 46],
         [57, 46, 53, 59, 50, 42,  1, 46],
         [41, 39, 52,  1, 63, 47, 43, 50]]),
 tensor([[64, 43, 52, 10,  0, 13, 56, 43],
         [43,  6,  0, 20, 39, 58, 46,  1],
         [46, 53, 59, 50, 42,  1, 46, 39],
         [39, 52,  1, 63, 47, 43, 50, 42]]))

In [128]:
for b in range(BATCH_SIZE):
    for t in range(BLOCK_SIZE):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f"input = {context.tolist()} and output = {target}")

# these are input and output of our model which will be fed into transformer.
# the transformer will process these.

input = [47] and output = 64
input = [47, 64] and output = 43
input = [47, 64, 43] and output = 52
input = [47, 64, 43, 52] and output = 10
input = [47, 64, 43, 52, 10] and output = 0
input = [47, 64, 43, 52, 10, 0] and output = 13
input = [47, 64, 43, 52, 10, 0, 13] and output = 56
input = [47, 64, 43, 52, 10, 0, 13, 56] and output = 43
input = [52] and output = 43
input = [52, 43] and output = 6
input = [52, 43, 6] and output = 0
input = [52, 43, 6, 0] and output = 20
input = [52, 43, 6, 0, 20] and output = 39
input = [52, 43, 6, 0, 20, 39] and output = 58
input = [52, 43, 6, 0, 20, 39, 58] and output = 46
input = [52, 43, 6, 0, 20, 39, 58, 46] and output = 1
input = [57] and output = 46
input = [57, 46] and output = 53
input = [57, 46, 53] and output = 59
input = [57, 46, 53, 59] and output = 50
input = [57, 46, 53, 59, 50] and output = 42
input = [57, 46, 53, 59, 50, 42] and output = 1
input = [57, 46, 53, 59, 50, 42, 1] and output = 46
input = [57, 46, 53, 59, 50, 42, 1, 46] and o

In [130]:
VOCAB_SIZE

65

In [170]:
# first example: Bigram model

class BigramModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.vocab_size = vocab_size
        # this means given current character, we want to assign a probabiliy (score) to the next one
        # previous characters are not impacting our decision
        self.token_embedding_table = nn.Embedding(self.vocab_size, self.vocab_size)


    def forward(self, x, targets):
        """Defining forward pass.
        """
        # size: Batch (BATCH_SIZE) x Time (BLOCK_SIZE) x Channel (VOCAB_SIZE)
        logits = self.token_embedding_table(x)

        # nll, expects (B, C, T)
        loss = F.cross_entropy(logits, targets)
        print(logits.shape, targets.shape)

In [171]:
m = BigramModel(VOCAB_SIZE)

In [172]:
m(xb, yb)

RuntimeError: Expected target size [4, 65], got [4, 8]

In [163]:
yb

tensor([[64, 43, 52, 10,  0, 13, 56, 43],
        [43,  6,  0, 20, 39, 58, 46,  1],
        [46, 53, 59, 50, 42,  1, 46, 39],
        [39, 52,  1, 63, 47, 43, 50, 42]])

In [159]:
m.token_embedding_table(xb).shape

torch.Size([4, 8, 65])

In [151]:
xb

tensor([[47, 64, 43, 52, 10,  0, 13, 56],
        [52, 43,  6,  0, 20, 39, 58, 46],
        [57, 46, 53, 59, 50, 42,  1, 46],
        [41, 39, 52,  1, 63, 47, 43, 50]])

In [150]:
xb.shape

torch.Size([4, 8])

In [147]:
m.vocab_size

65

In [148]:
print(m.token_embedding_table)

Embedding(65, 65)


In [149]:
m.token_embedding_table.weight.shape, m.token_embedding_table.weight

(torch.Size([65, 65]),
 Parameter containing:
 tensor([[ 0.7761, -0.0236,  0.7694,  ...,  0.8682,  0.6140,  0.3228],
         [-0.1667,  0.2752, -0.6079,  ..., -0.9631, -0.1148,  0.6909],
         [ 0.4106,  0.5041,  1.2860,  ..., -1.3456, -0.7963, -1.2198],
         ...,
         [-0.7582, -1.8711,  0.7141,  ..., -0.5707, -0.4843, -0.0299],
         [ 1.1441,  1.4002, -0.5325,  ...,  0.6729,  0.9766, -2.2641],
         [-0.4311, -0.2802,  1.9070,  ...,  1.5897, -1.1810, -0.7791]],
        requires_grad=True))