# **Getting a view of which special tokens `tiktoken` accepts**

In [1]:
import tiktoken
# Load the GPT-2 encoding and try a made-up "special" token.
tokenizer = tiktoken.get_encoding('gpt2')
txt = "this is some random <|shit|> what the hell"
# `<|shit|>` is listed in `allowed_special`, but the resulting ids show it is
# still split into ordinary tokens rather than mapped to one id.
encodings = tokenizer.encode(txt, allowed_special={"<|shit|>"})
encodings 

[5661, 318, 617, 4738, 1279, 91, 16211, 91, 29, 644, 262, 5968]

In [2]:
# Decode every id on its own to show how the text was split into tokens.
# The loop variable is `token_id` (not `id`) so the built-in `id()` is not
# shadowed in the kernel namespace for the rest of the session.
for token_id in encodings:
    print(tokenizer.decode([token_id]))

this
 is
 some
 random
 <
|
shit
|
>
 what
 the
 hell


In [3]:
# Same experiment with `<|endoftext|>`: the resulting ids contain the single
# id 50256 for it instead of several ordinary tokens.
tokenizer = tiktoken.get_encoding('gpt2')
txt = "this is some random <|endoftext|> what the hell"
encodings = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
encodings 

[5661, 318, 617, 4738, 220, 50256, 644, 262, 5968]

In [4]:
# Special-token table of the loaded encoding (token string -> id).
# NOTE(review): `_special_tokens` is a private attribute — confirm it is still
# available when upgrading tiktoken.
tokenizer._special_tokens

{'<|endoftext|>': 50256}

In [5]:
# Names of the encodings that can be passed to `tiktoken.get_encoding`.
tiktoken.list_encoding_names()

['gpt2',
 'r50k_base',
 'p50k_base',
 'p50k_edit',
 'cl100k_base',
 'o200k_base',
 'o200k_harmony']

In [6]:
# For comparison: the special-token table of the `o200k_harmony` encoding.
# NOTE(review): reads the private `_special_tokens` attribute.
tiktoken.get_encoding('o200k_harmony')._special_tokens

{'<|endoftext|>': 199999,
 '<|endofprompt|>': 200018,
 '<|startoftext|>': 199998,
 '<|reserved_200000|>': 200000,
 '<|reserved_200001|>': 200001,
 '<|return|>': 200002,
 '<|constrain|>': 200003,
 '<|reserved_200004|>': 200004,
 '<|channel|>': 200005,
 '<|start|>': 200006,
 '<|end|>': 200007,
 '<|message|>': 200008,
 '<|reserved_200009|>': 200009,
 '<|reserved_200010|>': 200010,
 '<|reserved_200011|>': 200011,
 '<|call|>': 200012,
 '<|reserved_200013|>': 200013,
 '<|reserved_200014|>': 200014,
 '<|reserved_200015|>': 200015,
 '<|reserved_200016|>': 200016,
 '<|reserved_200017|>': 200017,
 '<|reserved_200018|>': 200018,
 '<|reserved_200019|>': 200019,
 '<|reserved_200020|>': 200020,
 '<|reserved_200021|>': 200021,
 '<|reserved_200022|>': 200022,
 '<|reserved_200023|>': 200023,
 '<|reserved_200024|>': 200024,
 '<|reserved_200025|>': 200025,
 '<|reserved_200026|>': 200026,
 '<|reserved_200027|>': 200027,
 '<|reserved_200028|>': 200028,
 '<|reserved_200029|>': 200029,
 '<|reserved_200030|>'

In [7]:
# Decode the ids from the `<|endoftext|>` experiment one at a time.
# Relies on `encodings` and `tokenizer` as last assigned by an earlier cell.
# The loop variable is `token_id` (not `id`) so the built-in `id()` is not
# shadowed in the kernel namespace.
for token_id in encodings:
    print(tokenizer.decode([token_id]))

this
 is
 some
 random
 
<|endoftext|>
 what
 the
 hell


# **GPT-style data loading**

* **A good tutorial in the PyTorch docs on `Dataset` & `DataLoader`: https://docs.pytorch.org/tutorials/beginner/basics/data_tutorial.html#creating-a-custom-dataset-for-your-files**

In [8]:
import torch
from torch.utils.data import Dataset, DataLoader
import tiktoken

In [9]:
class GPTDatasetV1(Dataset):
    """Sliding-window next-token dataset built from one raw text.

    The text is tokenized once (with ``<|endoftext|>`` allowed as a special
    token). Every window of ``max_length`` token ids becomes an input, and the
    same window shifted right by one position becomes its target.

    Args:
        raw_text: Text to tokenize.
        tokenizer: Object whose ``encode(text, allowed_special=...)`` returns
            a list of token ids.
        max_length: Number of token ids per input/target window.
        stride: Distance between the starts of consecutive windows.
    """

    def __init__(self, raw_text, tokenizer: tiktoken.Encoding, max_length, stride):
        token_ids = tokenizer.encode(raw_text, allowed_special={"<|endoftext|>"})
        # The upper bound leaves room for the one-step-shifted target window.
        window_starts = range(0, len(token_ids) - max_length, stride)
        self.input_ids = [
            torch.tensor(token_ids[start : start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1 : start + 1 + max_length])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

### **Basic experimentation**

In [16]:
import tiktoken
tokenizer = tiktoken.get_encoding('gpt2')
FILE_PATH = '../../ch02/01_main-chapter-code/the-verdict.txt'
# Pin the encoding so the text is decoded the same way on every platform
# instead of falling back to the locale's default encoding.
with open(FILE_PATH, 'r', encoding='utf-8') as f:
    raw_txt = f.read()
raw_txt

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)\n\n"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it\'s going to send the value of my picture \'way up; but I don\'t think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing\'s lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn\'s "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?\n\nWell!--even 

In [17]:
# Tokenize the whole text, then print the ids and how many there are.
# Note: no `allowed_special` is passed here, unlike in `GPTDatasetV1`.
encodings = tokenizer.encode(raw_txt)
print(encodings)
print(len(encodings))

[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138, 257, 7026, 15632, 438, 2016, 257, 922, 5891, 1576, 438, 568, 340, 373, 645, 1049, 5975, 284, 502, 284, 3285, 326, 11, 287, 262, 6001, 286, 465, 13476, 11, 339, 550, 5710, 465, 12036, 11, 6405, 257, 5527, 27075, 11, 290, 4920, 2241, 287, 257, 4489, 64, 319, 262, 34686, 41976, 13, 357, 10915, 314, 2138, 1807, 340, 561, 423, 587, 10598, 393, 28537, 2014, 198, 198, 1, 464, 6001, 286, 465, 13476, 1, 438, 5562, 373, 644, 262, 1466, 1444, 340, 13, 314, 460, 3285, 9074, 13, 46606, 536, 5469, 438, 14363, 938, 4842, 1650, 353, 438, 2934, 489, 3255, 465, 48422, 540, 450, 67, 3299, 13, 366, 5189, 1781, 340, 338, 1016, 284, 3758, 262, 1988, 286, 616, 4286, 705, 1014, 510, 26, 475, 314, 836, 470, 892, 286, 326, 11, 1770, 13, 8759, 2763, 438, 1169, 2994, 284, 943, 17034, 318, 477, 314, 892, 286, 526, 383, 1573, 11, 319, 9074, 13, 536, 5469, 338, 11914, 11, 33096, 663, 4808, 3808, 62, 355, 996, 484, 547, 12548, 287, 281, 13079, 410, 12523, 286, 

In [18]:
# Manual version of the sliding window used by `GPTDatasetV1`:
# collect the first three input windows of 5 token ids each.
max_length = 5
stride = 5
times = 0  # number of windows collected so far
t = []     # collected input windows (as tensors)
for i in range(0, len(encodings) - max_length, stride):
    input_tokens = encodings[i : i + max_length]
    # The target is the input window shifted right by one token.
    # It is computed here but never stored; only inputs are collected in `t`.
    target_tokens = encodings[i+1 : i+1+ max_length]
    t.append(torch.tensor(input_tokens))

    # Stop after three windows.
    times += 1
    if times == 3:
        break

print(t)

[tensor([  40,  367, 2885, 1464, 1807]), tensor([ 3619,   402,   271, 10899,  2138]), tensor([  257,  7026, 15632,   438,  2016])]


### **Continuing...**

In [19]:
# Build the dataset with windows of 5 tokens and no overlap (stride == length).
dataset = GPTDatasetV1(raw_txt, tokenizer, max_length=5, stride=5)

# Show the first two (input, target) pairs.
for i, item in enumerate(dataset):
    if i == 2:
        break
    # Named `input_ids` / `target_ids` so the built-in `input()` is not
    # shadowed in the kernel namespace.
    input_ids, target_ids = item
    print(input_ids)
    print(target_ids)

tensor([  40,  367, 2885, 1464, 1807])
tensor([ 367, 2885, 1464, 1807, 3619])
tensor([ 3619,   402,   271, 10899,  2138])
tensor([  402,   271, 10899,  2138,   257])


In [20]:
def create_dataloaded(raw_text, tokenizer, max_length, stride, batch_size, shuffle=True, drop_last=True, num_workers=0):
    """Wrap ``raw_text`` in a ``GPTDatasetV1`` and return a ``DataLoader`` over it.

    Args:
        raw_text: Text to tokenize and window.
        tokenizer: Tokenizer handed to ``GPTDatasetV1``.
        max_length: Number of token ids per window.
        stride: Distance between the starts of consecutive windows.
        batch_size: Number of windows per batch.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    windowed_dataset = GPTDatasetV1(raw_text, tokenizer, max_length, stride)
    return DataLoader(dataset=windowed_dataset, **loader_options)

In [21]:
FILE_PATH = '../../ch02/01_main-chapter-code/the-verdict.txt'
# Pin the encoding so the text is decoded the same way on every platform
# instead of falling back to the locale's default encoding.
with open(FILE_PATH, 'r', encoding='utf-8') as f:
    raw_txt = f.read()
raw_txt

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)\n\n"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it\'s going to send the value of my picture \'way up; but I don\'t think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing\'s lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn\'s "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?\n\nWell!--even 

In [22]:
tokenizer = tiktoken.get_encoding('gpt2')
# Positional arguments are max_length=5, stride=5, batch_size=2.
# shuffle=False keeps the batches in text order.
dataloader = create_dataloaded(raw_txt, tokenizer, 5, 5, 2, shuffle=False)
dataloader

<torch.utils.data.dataloader.DataLoader at 0x18ef0112de0>

In [23]:
# Print the first three batches; each batch is an (inputs, targets) pair.
for i, data in enumerate(iter(dataloader)):
    x, y = data  # x: input windows, y: the same windows shifted by one token
    print(x)
    print(y)
    print('--------------')
    # Stop after the third batch (indices 0, 1, 2).
    if i == 2:
        break
    

tensor([[   40,   367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899,  2138]])
tensor([[  367,  2885,  1464,  1807,  3619],
        [  402,   271, 10899,  2138,   257]])
--------------
tensor([[  257,  7026, 15632,   438,  2016],
        [  257,   922,  5891,  1576,   438]])
tensor([[ 7026, 15632,   438,  2016,   257],
        [  922,  5891,  1576,   438,   568]])
--------------
tensor([[ 568,  340,  373,  645, 1049],
        [5975,  284,  502,  284, 3285]])
tensor([[ 340,  373,  645, 1049, 5975],
        [ 284,  502,  284, 3285,  326]])
--------------


In [24]:
import torch.nn as nn

# Hyperparameters read by the model classes below through key lookups.
GPT_CONFIG_124M = { 
    "vocab_size": 50257,     # rows of the token embedding / width of the output head
    "context_length": 256,   # (1024 commented out by the author); sizes pos_emb and the causal mask
    "emb_dim": 768,          # embedding width used throughout the model
    "n_heads": 12,           # attention heads per transformer block
    "n_layers": 12,          # number of stacked transformer blocks
    "drop_rate": 0.1,        # dropout probability for embeddings, attention and shortcuts
    "qkv_bias": False        # whether the query/key/value projections have a bias
}

In [25]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale and shift.

    Args:
        emb_dim: Size of the last dimension of the inputs.
        eps: Small constant added to the variance for numerical stability.
    """

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
        self.eps = eps

    def forward(self, x: torch.Tensor):
        """Normalize ``x`` along its last dimension.

        Args:
            x: Tensor whose last dimension has size ``emb_dim``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        mean = x.mean(-1, keepdim=True)
        # unbiased=False => divide by `n` rather than `n-1`.
        var = x.var(-1, keepdim=True, unbiased=False)
        # The unused `x.std(...)` call was dropped: it cost an extra reduction
        # and its result never entered the output.
        x_norm = (x - mean) / torch.sqrt(var + self.eps)
        return x_norm * self.scale + self.shift

class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the embedding width, apply GELU, project back.

    Args:
        config: Mapping that provides ``'emb_dim'``.
    """

    def __init__(self, config):
        super().__init__()
        emb_dim = config['emb_dim']
        hidden_dim = 4 * emb_dim
        expand = nn.Linear(emb_dim, hidden_dim)
        activation = GELU()
        contract = nn.Linear(hidden_dim, emb_dim)
        self.layers = nn.Sequential(expand, activation, contract)

    def forward(self, x):
        """Run ``x`` through the MLP; the last dimension stays ``emb_dim``."""
        out = self.layers(x)
        return out

class GELU(nn.Module):
    """GELU activation using the tanh approximation.

    Computes ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))``.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Apply the activation element-wise and return a tensor shaped like ``x``."""
        # sqrt(2/pi) must MULTIPLY the cubic polynomial. Adding it (as the
        # previous code did) produces a different, incorrect function.
        # A plain Python float avoids allocating a tensor on every call.
        coeff = (2.0 / torch.pi) ** 0.5
        inner = coeff * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))

class MultiheadAttention(nn.Module):
    """Causal multi-head self-attention.

    Args:
        d_in: Size of each input token embedding.
        d_out: Total output size across all heads; must be divisible by ``n_heads``.
        n_heads: Number of attention heads.
        context_length: Side length of the pre-built causal mask.
        dropout: Dropout probability applied to the attention weights.
        qkv_bias: Whether the query/key/value projections use a bias term.
    """

    def __init__(self, d_in, d_out, n_heads, context_length, dropout=0.5, qkv_bias=False):
        super().__init__()
        assert (d_out % n_heads == 0)

        self.d_in, self.d_out = d_in, d_out
        self.n_heads = n_heads
        self.d_head = d_out // n_heads

        self.W_q = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_k = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_v = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.dropout = nn.Dropout(dropout)
        self.out_proj = nn.Linear(d_out, d_out)

        # True strictly above the diagonal: positions a token must not attend to.
        future_positions = torch.ones(context_length, context_length).triu(1).bool()
        self.register_buffer('mask', future_positions)

    def forward(self, x):
        """Attend over ``x`` of shape (batch_size, num_tokens, d_in).

        Returns:
            Tensor of shape (batch_size, num_tokens, d_out).
        """
        batch, seq_len, embed_dim = x.shape
        assert self.d_in == embed_dim

        def split_heads(projected):
            # (batch, seq_len, d_out) -> (batch, n_heads, seq_len, d_head)
            per_head = projected.view(batch, seq_len, self.n_heads, self.d_head)
            return per_head.transpose(1, 2)

        queries = split_heads(self.W_q(x))
        keys = split_heads(self.W_k(x))
        values = split_heads(self.W_v(x))

        # Scaled dot-product scores, with future positions set to -inf.
        scores = (queries @ keys.transpose(-1, -2)) / self.d_head**0.5
        causal_mask = self.mask[:seq_len, :seq_len]
        scores = scores.masked_fill(causal_mask, -torch.inf)

        weights = self.dropout(scores.softmax(-1))

        # (batch, n_heads, seq_len, d_head) -> (batch, seq_len, d_out)
        merged = (weights @ values).transpose(1, 2).contiguous().view(batch, seq_len, self.d_out)
        return self.out_proj(merged)

class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention and feed-forward, each with a shortcut.

    Args:
        cfg: Mapping that provides ``'emb_dim'``, ``'n_heads'``,
            ``'context_length'``, ``'drop_rate'`` and ``'qkv_bias'``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg['emb_dim']
        # Input and output width of the attention layer are both emb_dim.
        self.attn = MultiheadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            n_heads=cfg['n_heads'],
            context_length=cfg['context_length'],
            dropout=cfg['drop_rate'],
            qkv_bias=cfg['qkv_bias'],
        )
        self.ff = FeedForward(cfg)
        self.norm_1 = LayerNorm(emb_dim)
        self.norm_2 = LayerNorm(emb_dim)
        self.dropout = nn.Dropout(cfg['drop_rate'])

    def forward(self, x):
        """Apply both sub-layers to ``x`` and return a tensor of the same shape."""
        # Sub-layer 1: normalize -> attention -> dropout, then add the shortcut.
        attn_out = self.dropout(self.attn(self.norm_1(x)))
        x = attn_out + x

        # Sub-layer 2: normalize -> feed-forward -> dropout, then add the shortcut.
        ff_out = self.dropout(self.ff(self.norm_2(x)))
        return ff_out + x

class GPTModel(nn.Module):
    """GPT-style decoder: embeddings -> transformer blocks -> norm -> vocabulary logits.

    Args:
        config: Mapping that provides ``'vocab_size'``, ``'context_length'``,
            ``'emb_dim'``, ``'drop_rate'`` and ``'n_layers'``, plus the keys
            read by ``TransformerBlock``.
    """

    def __init__(self, config):
        super().__init__()
        self.tok_emb = nn.Embedding(config['vocab_size'], config['emb_dim'])
        self.pos_emb = nn.Embedding(config['context_length'], config['emb_dim'])
        self.drop_emb = nn.Dropout(config['drop_rate'])
        self.transf_layers = nn.Sequential(*[TransformerBlock(config) for _ in range(config['n_layers'])])
        self.final_norm = LayerNorm(config['emb_dim'])
        self.out_head = nn.Linear(config['emb_dim'], config['vocab_size'], bias=False)

    def forward(self, x, show_info=False):
        """Compute next-token logits.

        Args:
            x: 2D integer tensor of token ids, shape (batch_size, seq_len).
            show_info: When True, print the shapes of the intermediate embeddings.

        Returns:
            Logits of shape (batch_size, seq_len, vocab_size).
        """
        _, seq_len = x.shape
        tok_emb = self.tok_emb(x)
        # Build the position indices on the same device as the input; a default
        # (CPU) arange fails once the model and input live on another device.
        positions = torch.arange(seq_len, device=x.device)
        pos_emb = self.pos_emb(positions)
        x = tok_emb + pos_emb  # pos_emb broadcasts over the batch dimension
        if show_info:
            print(f'Token-Embed(shape): {tok_emb.shape}')
            print(f'POS-Embed(shape): {pos_emb.shape}')
            print(f'i/p Before TransfBlocks(shape): {x.shape}')
        x = self.drop_emb(x)
        x = self.transf_layers(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [26]:
# Instantiate the model from the config and switch it to evaluation mode.
model = GPTModel(GPT_CONFIG_124M)
# The trailing `;` suppresses the cell's printed module repr.
model.eval();