# Building GPT

## Corpus
**Source** : https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [2]:
# local copy of the corpus; expected to hold the tinyshakespeare text linked under "Source" above
local_path = "./data/gpt-input.txt"

In [3]:
# load the whole corpus into memory as one string so it can be inspected
with open(local_path, mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [4]:
# sanity check: total number of characters in the corpus
print("length of the corpus in characters: ", len(text))

length of the corpus in characters:  1115393


In [5]:
# peek at the first 100 characters
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [6]:
# vocabulary = every distinct character in the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


## Tokenization 

### Simplest Character-Level Encoding/Decoding

In [7]:
# lookup tables: character -> integer id, and integer id -> character
ctoi = {ch: i for i, ch in enumerate(chars)}
itoc = dict(enumerate(chars))


def encode(s):
    """Encode a string as a list of integer ids, one per character.

    Raises KeyError if `s` contains a character that is not a key of `ctoi`.
    """
    return [ctoi[c] for c in s]


def decode(n):
    """Decode an iterable of integer ids back into a string.

    Raises KeyError if an id is not a key of `itoc`.
    """
    return ''.join(itoc[i] for i in n)


# round-trip check: decoding the encoding should give the greeting back
greeting = "Hi, How are you?"
print(encode(greeting))
print(decode(encode(greeting)))

[20, 47, 6, 1, 20, 53, 61, 1, 39, 56, 43, 1, 63, 53, 59, 12]
Hi, How are you?


In [8]:
# show the full id -> character table
itoc

{0: '\n',
 1: ' ',
 2: '!',
 3: '$',
 4: '&',
 5: "'",
 6: ',',
 7: '-',
 8: '.',
 9: '3',
 10: ':',
 11: ';',
 12: '?',
 13: 'A',
 14: 'B',
 15: 'C',
 16: 'D',
 17: 'E',
 18: 'F',
 19: 'G',
 20: 'H',
 21: 'I',
 22: 'J',
 23: 'K',
 24: 'L',
 25: 'M',
 26: 'N',
 27: 'O',
 28: 'P',
 29: 'Q',
 30: 'R',
 31: 'S',
 32: 'T',
 33: 'U',
 34: 'V',
 35: 'W',
 36: 'X',
 37: 'Y',
 38: 'Z',
 39: 'a',
 40: 'b',
 41: 'c',
 42: 'd',
 43: 'e',
 44: 'f',
 45: 'g',
 46: 'h',
 47: 'i',
 48: 'j',
 49: 'k',
 50: 'l',
 51: 'm',
 52: 'n',
 53: 'o',
 54: 'p',
 55: 'q',
 56: 'r',
 57: 's',
 58: 't',
 59: 'u',
 60: 'v',
 61: 'w',
 62: 'x',
 63: 'y',
 64: 'z'}

### SentencePiece
by **Google** \
https://github.com/google/sentencepiece

SentencePiece is a subword tokenizer. Instead of working only with whole words or with individual characters (as the character-level scheme above does), it splits text into subword pieces learned from the corpus. This keeps the vocabulary at a manageable size while producing much shorter token sequences than character-level encoding.

### TikToken
by **OpenAI** \
https://platform.openai.com/tokenizer \
https://github.com/openai/tiktoken

TikToken is OpenAI's tokenizer library. It uses Byte Pair Encoding (BPE) to break text into tokens: starting from small units, BPE repeatedly merges the most frequent adjacent pairs, so common words end up as a single token while rare words are split into several subword tokens. These tokens are what OpenAI's models take as input.

TikToken is also built for speed: it segments sentences into tokens quickly enough that tokenization does not become a bottleneck, even on large amounts of text.

## Tensorization

In [9]:
import torch
# encode the whole corpus and store it as a 1-D tensor of int64 ids (one per character)
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
# first 100 ids, i.e. the first 100 characters of the corpus
print(data[:100])

torch.Size([1115393]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


Creating a training and validation split.

In [10]:
# hold out the last 10% of the token stream for validation; train on the rest
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]


We will not feed the entire corpus to the transformer at once. Instead, we train on random chunks sampled from anywhere in the corpus. Let's define the size of a chunk:

In [11]:
# length of the chunks the training examples are cut from
context_size = 8
# one extra token: a chunk of context_size+1 tokens yields context_size (context, target) pairs
train_data[:context_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [15]:
# y is x shifted left by one token, so y[t] is the token that follows the prefix x[:t+1];
# a single chunk therefore packs context_size training examples
x, y = train_data[:context_size], train_data[1:context_size+1]
for t in range(context_size):
    context, target = x[:t+1], y[t]
    print(f"when input is {context} the target: {target}")

tensor([18, 47, 56, 57, 58,  1, 15, 47])
tensor([47, 56, 57, 58,  1, 15, 47, 58])
when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


## Batching 

In [16]:
# fix the RNG seed so the randomly sampled batch below is reproducible
torch.manual_seed(1344)

batch_size = 4 # number of independent sequences to process in parallel
context_size = 8 # maximum context length for prediction

def get_batch(split):
    """Sample a random batch of input chunks and their next-token targets.

    Args:
        split: 'train' samples from `train_data`; any other value samples
            from `val_data`.

    Returns:
        A pair `(x, y)` of stacked tensors, each holding `batch_size` chunks
        of `context_size` tokens. `y` is `x` shifted one position to the
        right in the source data, so `y[b, t]` is the token that follows
        `x[b, :t+1]`.

    Reads the module-level `batch_size` and `context_size`.
    """
    source = train_data if split == 'train' else val_data

    # draw batch_size start offsets; the exclusive upper bound leaves room
    # for the target chunk, which ends one token later than the input chunk
    starts = torch.randint(len(source) - context_size, (batch_size,))
    inputs = torch.stack([source[s:s + context_size] for s in starts])
    targets = torch.stack([source[s + 1:s + context_size + 1] for s in starts])
    return inputs, targets

# sample one training batch and show the inputs next to their targets
xb, yb = get_batch('train')
print('inputs:', xb.shape, xb, sep='\n')
print('targets:', yb.shape, yb, sep='\n')

print('----')

# spell out every (context, target) pair packed into the batch
for b in range(batch_size):          # batch dimension
    for t in range(context_size):    # time dimension
        context, target = xb[b, :t+1], yb[b, t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[ 1, 53, 59, 56,  1, 43, 52, 43],
        [ 1, 44, 39, 56, 43, 57,  1, 53],
        [ 1, 51, 43,  0, 32, 53,  1, 46],
        [27, 10,  0, 13, 57,  1, 52, 43]])
targets:
torch.Size([4, 8])
tensor([[53, 59, 56,  1, 43, 52, 43, 51],
        [44, 39, 56, 43, 57,  1, 53, 59],
        [51, 43,  0, 32, 53,  1, 46, 43],
        [10,  0, 13, 57,  1, 52, 43, 39]])
----
when input is [1] the target: 53
when input is [1, 53] the target: 59
when input is [1, 53, 59] the target: 56
when input is [1, 53, 59, 56] the target: 1
when input is [1, 53, 59, 56, 1] the target: 43
when input is [1, 53, 59, 56, 1, 43] the target: 52
when input is [1, 53, 59, 56, 1, 43, 52] the target: 43
when input is [1, 53, 59, 56, 1, 43, 52, 43] the target: 51
when input is [1] the target: 44
when input is [1, 44] the target: 39
when input is [1, 44, 39] the target: 56
when input is [1, 44, 39, 56] the target: 43
when input is [1, 44, 39, 56, 43] the target: 57
when input is [1, 44, 39, 

In [24]:
# show the sampled input batch again
print(xb)

tensor([[ 1, 53, 59, 56,  1, 43, 52, 43],
        [ 1, 44, 39, 56, 43, 57,  1, 53],
        [ 1, 51, 43,  0, 32, 53,  1, 46],
        [27, 10,  0, 13, 57,  1, 52, 43]])
