In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import pickle
import argparse

# parser = argparse.ArgumentParser(description='This is a demonstration program')
# # parser.add_argument('-batch_size', type=str, required=True, help='Please provide a batchsize')
# args = parser.parse_args()
device = 'cuda' if torch.cuda.is_available() else 'cpu'

batch_size = 64
block_size = 128
max_iters = 10000
lr = 1e-4
eval_iters = 500
eval_interval = 500
n_embed = 384 # number of elements in the embedding vector
n_head = 16
n_layer = 16 #num of decoder layers
dropout = 0.2 # Dropout probability (makes model generalize more)

In [2]:
device

'cuda'

In [3]:
chars = ""
with open('vocab.txt', 'r', encoding='utf-8') as f:
    text = f.read()
    chars = sorted(set(text))


vocab_size = len(chars)
vocab_size


32171

In [4]:
# This creates a dictionary mapping strings to int and int to string
string_to_int = {ch:i for i, ch in enumerate(chars)}
int_to_string = {i:ch for i, ch in enumerate(chars)}
# Turns text into int
encode = lambda s: [string_to_int[c] for c in s]
# Turns into into text
decode = lambda l: ''.join([int_to_string[i] for i in l])


In [5]:
import mmap
import random

In [6]:
def get_random_chunk(split):
    filename = "train_split.txt" if split == 'train' else 'val_split.txt'
    # Open file in binary
    with open(filename, 'rb') as f:
        # Open with memory mapping
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            file_size = len(mm)
            # Making end size before actual end so no memory out of bounds errors
            start_pos = random.randint(0,(file_size) - block_size*batch_size)
    
            mm.seek(start_pos)
            block = mm.read(block_size * batch_size - 1)
            # Decode
            decoded_block = block.decode('utf-8', errors='ignore').replace('\r', '')
            data = torch.tensor(encode(decoded_block), dtype=torch.long)
    return data




def get_batch(split):
    data = get_random_chunk(split)
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y



In [7]:
@torch.no_grad()
def estimate_loss(model:nn.Module):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model.forward(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return  out

Consider a batch of text sequences represented as a tensor with shape \( B \times T \times C \):

- **Batch Size (B)**: If \( $B = 64$ \), there are 64 sequences in the batch.
- **Sequence Length (T)**: If \( $T = 128$ \), each sequence has 128 tokens.
- **Channel Dimension (C)**: If \( $C = 512$ \), each token is represented by a 512-dimensional embedding vector.

In [8]:
class Head(nn.Module):
    '''one head of self-attention'''
    def __init__(self, head_size):
        super().__init__()
        # Wk, Wq, Wv parameters
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)
    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -.5 # That end bit is just sqrt d_k
        wei = wei.masked_fill(self.tril[:T, :T] ==0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        out = wei @ v
        return out
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embed)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1) # For every head, forward on each of them and cat them
        out = self.dropout(self.proj(out))
        return out
class FeedForward(nn.Module):
    def __init__(self, n_embed):
        super().__init__()
        self.net = nn.Sequential(    nn.Linear(n_embed, 4 * n_embed), nn.ReLU(), nn.Linear(4*n_embed, n_embed), nn.Dropout(dropout))
    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    # Decoder Block
    def __init__(self, n_embed, n_head):
        super().__init__()
        head_size = n_embed // n_head #num of features each head captures
        self.sa = MultiHeadAttention(n_head, head_size) # Self Attention
        self.ffwd = FeedForward(n_embed) # Feed Forward Linear -> ReLU -> Linear
        self.ln1 = nn.LayerNorm(n_embed) # LayerNorm
        self.ln2 = nn.LayerNorm(n_embed) # LayerNorm
    def forward(self,x):
        y = self.sa(x) # Y is self attention and x is added later
        x = self.ln1(x+y) # Residual connection for self attention layer
        
        y = self.ffwd(x)
        x = self.ln2(x+y) # resiudual connection for feed forwards
        return x


In [9]:
temp = torch.rand(128, 64, 384)
tempM = Block(n_embed, n_head)
tempM(temp)

tensor([[[-0.3745, -0.6510, -0.7120,  ..., -0.5765,  1.7241,  2.0743],
         [ 1.2201, -0.7214, -1.0183,  ..., -1.8096,  0.7192,  1.1727],
         [ 1.1013, -1.2056, -0.1245,  ..., -0.9044, -0.5779,  0.9289],
         ...,
         [ 0.8428,  0.2611, -0.5450,  ..., -0.3930, -0.8016, -0.0912],
         [ 1.8818,  0.6479,  0.2682,  ..., -2.1176,  0.6618,  0.2705],
         [ 0.8391,  0.4207, -1.9776,  ...,  0.6284, -1.1361, -0.2795]],

        [[ 1.2216, -1.8785,  0.1243,  ..., -0.0300, -0.9993,  0.5460],
         [ 1.4146, -0.1245, -1.1276,  ...,  0.1211, -0.6716, -1.5616],
         [ 0.5980, -1.5023,  0.5972,  ...,  0.5012, -1.1951,  1.2981],
         ...,
         [-0.3415, -0.3822, -1.4461,  ..., -2.3127, -0.0408,  0.3911],
         [ 0.3505,  0.8841, -0.1659,  ..., -0.9632,  1.1264, -0.7043],
         [ 0.5089,  0.8550, -1.1818,  ..., -0.5661,  1.7566, -0.2704]],

        [[ 0.3528, -1.4555, -0.8705,  ..., -2.5673,  0.1229,  1.0867],
         [-0.1983,  1.0654, -1.4940,  ..., -1

In [10]:
class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        # This is the token embedding 
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table=nn.Embedding(block_size, n_embed)
        # decoderblocks (temp code for the future) 
        # We're using embeddings nn because sinosoidual is mainly for base transformer models
        # We're building GPT, and that's what experts do. Transformers vs. GPT
        self.blocks = nn.Sequential(*[Block(n_embed, n_head=n_head) for _ in range(n_layer)])
        self.ln_F = nn.LayerNorm(n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)
    def _init_weights(self, module):
        torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        if module.bias is not None:
            torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
    def forward(self, index, targets=None):

        B, T = index.shape
        # 
        tok_emb = self.token_embedding_table(index)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        # print(f'Pre')
        x = tok_emb + pos_emb
        # print(f'tok + pos {x.shape}')
        x = self.blocks(x)
        # print(f'X selfblocks shape {x.shape}')
        x = self.ln_F(x) # Layer Norm
        logits = self.lm_head(x) # Linear Transformation

        if targets is None:
            loss = None 
        else:
            # B -> Batch
            # T-> Seq Len
            #C -> feature dimension
            B,T,C = logits.shape
            # Reshaping for the loss function
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            # The logits and targets are flattened for the loss function which only
            # Takes of these dimensions
            loss = F.cross_entropy(logits,targets)
        return logits, loss
    def generate(self, index, max_new_tokens):
        #index is (B,T) array of indicies in the current context
        for _ in range(max_new_tokens):
            index_cond = index[:, -block_size:]
            # get the predictions
            logits, loss = self.forward(index_cond)
            # focus only on the last time step (selects logits for previous batch)
            logits = logits[:, -1, :] 
            # apply softmax for probs
            probs=F.softmax(logits, dim=-1) # (B,C)
            index_next = torch.multinomial(probs, num_samples=1)
            index = torch.cat((index, index_next), dim=1)
        return index

model = GPTLanguageModel(vocab_size)

m = model.to(device)
# context = torch.zeros((1,1), dtype=torch.long, device=device)
# generated_chars = decode(m.generate(context, max_new_tokens=500[0].toList()))
#print(generated_chars)   


In [13]:
# Create a PyTorch optimizer
from tqdm import tqdm
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

for iter in tqdm(range(max_iters), desc="Training Progress"):
    if iter % 100 ==0:
        print(iter)
    if iter % eval_iters == 0:
        losses = estimate_loss(model)
        print(f'step: {iter}, train loss: {losses["train"]:.3f}, val loss: {losses["val"]:.3f}')  # Fix quotes around dictionary keys
    # Sample a batch of data
    xb, yb = get_batch('train')
    # Evaluate the loss
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())


Training Progress:   0%|          | 0/10000 [00:00<?, ?it/s]

0
step: 0, train loss: 1.500, val loss: 1.490


Training Progress:   1%|          | 100/10000 [05:15<4:33:40,  1.66s/it] 

100


Training Progress:   2%|▏         | 200/10000 [08:01<4:28:23,  1.64s/it]

200


Training Progress:   3%|▎         | 300/10000 [10:48<4:25:53,  1.64s/it]

300


Training Progress:   4%|▍         | 400/10000 [13:34<4:24:42,  1.65s/it]

400


Training Progress:   5%|▌         | 500/10000 [16:20<4:20:49,  1.65s/it]

500
step: 500, train loss: 1.458, val loss: 1.460


Training Progress:   6%|▌         | 600/10000 [21:38<4:13:49,  1.62s/it]  

600


Training Progress:   7%|▋         | 700/10000 [24:29<4:13:23,  1.63s/it]

700


Training Progress:   8%|▊         | 800/10000 [27:14<4:12:16,  1.65s/it]

800


Training Progress:   9%|▉         | 900/10000 [30:02<4:09:05,  1.64s/it]

900


Training Progress:  10%|█         | 1000/10000 [32:48<4:07:50,  1.65s/it]

1000
step: 1000, train loss: 1.503, val loss: 1.491


Training Progress:  11%|█         | 1100/10000 [38:07<4:05:53,  1.66s/it]  

1100


Training Progress:  12%|█▏        | 1200/10000 [40:52<4:00:14,  1.64s/it]

1200


Training Progress:  13%|█▎        | 1300/10000 [43:38<3:58:04,  1.64s/it]

1300


Training Progress:  14%|█▍        | 1400/10000 [46:24<3:55:00,  1.64s/it]

1400


Training Progress:  15%|█▌        | 1500/10000 [49:23<4:12:35,  1.78s/it]

1500
step: 1500, train loss: 1.476, val loss: 1.445


Training Progress:  16%|█▌        | 1600/10000 [55:08<4:17:41,  1.84s/it]  

1600


Training Progress:  17%|█▋        | 1700/10000 [58:18<4:38:25,  2.01s/it]

1700


Training Progress:  18%|█▊        | 1800/10000 [1:01:00<3:40:43,  1.62s/it]

1800


Training Progress:  19%|█▉        | 1900/10000 [1:03:41<3:36:16,  1.60s/it]

1900


Training Progress:  20%|██        | 2000/10000 [1:06:23<3:34:11,  1.61s/it]

2000
step: 2000, train loss: 1.436, val loss: 1.445


Training Progress:  21%|██        | 2100/10000 [1:11:37<3:32:24,  1.61s/it]  

2100


Training Progress:  22%|██▏       | 2200/10000 [1:14:19<3:29:54,  1.61s/it]

2200


Training Progress:  23%|██▎       | 2300/10000 [1:17:01<3:27:21,  1.62s/it]

2300


Training Progress:  24%|██▍       | 2400/10000 [1:19:42<3:23:27,  1.61s/it]

2400


Training Progress:  25%|██▌       | 2500/10000 [1:22:24<3:23:00,  1.62s/it]

2500
step: 2500, train loss: 1.466, val loss: 1.435


Training Progress:  26%|██▌       | 2600/10000 [1:27:36<3:17:32,  1.60s/it] 

2600


Training Progress:  27%|██▋       | 2700/10000 [1:30:16<3:14:54,  1.60s/it]

2700


Training Progress:  28%|██▊       | 2800/10000 [1:33:03<3:38:50,  1.82s/it]

2800


Training Progress:  29%|██▉       | 2900/10000 [1:35:53<3:20:16,  1.69s/it]

2900


Training Progress:  30%|███       | 3000/10000 [1:38:47<3:12:10,  1.65s/it]

3000
step: 3000, train loss: 1.441, val loss: 1.434


Training Progress:  31%|███       | 3100/10000 [1:44:03<3:06:27,  1.62s/it] 

3100


Training Progress:  32%|███▏      | 3200/10000 [1:46:46<3:03:07,  1.62s/it]

3200


Training Progress:  33%|███▎      | 3300/10000 [1:49:28<3:00:41,  1.62s/it]

3300


Training Progress:  34%|███▍      | 3400/10000 [1:52:10<2:57:49,  1.62s/it]

3400


Training Progress:  35%|███▌      | 3500/10000 [1:54:52<2:55:56,  1.62s/it]

3500
step: 3500, train loss: 1.425, val loss: 1.428


Training Progress:  36%|███▌      | 3600/10000 [2:00:04<2:52:43,  1.62s/it] 

3600


Training Progress:  37%|███▋      | 3700/10000 [2:02:46<2:50:13,  1.62s/it]

3700


Training Progress:  38%|███▊      | 3800/10000 [2:05:28<2:48:11,  1.63s/it]

3800


Training Progress:  39%|███▉      | 3900/10000 [2:08:13<3:24:04,  2.01s/it]

3900


Training Progress:  40%|████      | 4000/10000 [2:11:52<4:04:07,  2.44s/it]

4000
step: 4000, train loss: 1.420, val loss: 1.434


Training Progress:  41%|████      | 4100/10000 [2:18:13<3:40:47,  2.25s/it] 

4100


Training Progress:  42%|████▏     | 4200/10000 [2:21:36<3:12:28,  1.99s/it]

4200


Training Progress:  43%|████▎     | 4300/10000 [2:24:53<3:20:47,  2.11s/it]

4300


Training Progress:  44%|████▍     | 4400/10000 [2:27:49<2:24:39,  1.55s/it]

4400


Training Progress:  45%|████▌     | 4500/10000 [2:30:33<2:39:58,  1.75s/it]

4500
step: 4500, train loss: 1.437, val loss: 1.431


Training Progress:  46%|████▌     | 4600/10000 [2:36:32<3:00:42,  2.01s/it] 

4600


Training Progress:  47%|████▋     | 4700/10000 [2:39:25<2:33:02,  1.73s/it]

4700


Training Progress:  48%|████▊     | 4800/10000 [2:42:50<3:16:33,  2.27s/it]

4800


Training Progress:  49%|████▉     | 4900/10000 [2:45:55<2:21:20,  1.66s/it]

4900


Training Progress:  50%|█████     | 5000/10000 [2:48:42<2:17:34,  1.65s/it]

5000
step: 5000, train loss: 1.400, val loss: 1.413


Training Progress:  51%|█████     | 5100/10000 [2:54:18<2:40:28,  1.97s/it] 

5100


Training Progress:  52%|█████▏    | 5200/10000 [2:57:14<2:10:27,  1.63s/it]

5200


Training Progress:  53%|█████▎    | 5300/10000 [3:00:00<2:09:00,  1.65s/it]

5300


Training Progress:  54%|█████▍    | 5399/10000 [3:02:45<2:35:44,  2.03s/it]


KeyboardInterrupt: 

In [41]:
import pickle

In [14]:
with open('model-01.pkl', 'wb') as f:
    pickle.dump(model, f)
print('model saved')

model saved


In [15]:
print("loading model params")
with open('model-01.pkl', 'rb') as f:
    model = pickle.load(f) 
print('loaded successfully')


loading model params
loaded successfully


In [16]:
# Calc Loss
estimate_loss(model)

{'train': tensor(1.4215), 'val': tensor(1.4314)}

In [17]:
prompt = input("Prompt:\n")
context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
generated_chars = decode(model.generate(context.unsqueeze(0), max_new_tokens=150)[0].tolist())
print(f'Completion:\n {generated_chars}')

Completion:
 This is how you make money, first step: I'm a head at a Company _and the upkerget arms who among there I had, behind and diving her with announces by the John Jiana and the".[Lew Abiaclo da


### Torch Load & Save