In [1]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt # data set containing all of shakesphere's work as a text file

--2024-08-20 18:40:27--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.5’


2024-08-20 18:40:27 (17.9 MB/s) - ‘input.txt.5’ saved [1115394/1115394]



In [2]:
file = open("input.txt", 'r')
text = file.read()
print(text[:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [3]:
len(text)

1115394

In [4]:
# Creating the most basic character level tokenizer, because our tokenizer is the most basic the context length will be big compared to if we used sub word level tokenizer.
chars = sorted(list(set(text)))
vocab_size = len(chars)

stoi = { ch:i for i,ch in enumerate(chars) } # Output: {'a': 0, 'b': 1, 'c': 2}
itos = { i:ch for i,ch in enumerate(chars) } #
def encode(s):
  return [stoi[c] for c in s]
def decode(l):
  return ''.join([itos[c] for c in l])

print(decode(encode("I am shakesphere!")))
print("encoded vector length:", len(encode("I am shakesphere!")))


I am shakesphere!
encoded vector length: 17


In [5]:
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
data = torch.tensor(encode(text))
data.shape, data.dtype

(torch.Size([1115394]), torch.int64)

In [6]:
train_size = int(0.8 * len(data))
val_size = int(0.1 * len(data))
test_size = len(data) - train_size - val_size

train_data = data[:train_size]
val_data = data[train_size:train_size + val_size]
test_data = data[train_size + val_size:]
train_data.shape, val_data.shape, test_data.shape

(torch.Size([892315]), torch.Size([111539]), torch.Size([111540]))

In [7]:
torch.manual_seed(1337)
batch_size = 16
block_size = 32 # maximum context length for prediction

def get_batch(split):
    data = train_data if split == 'train' else val_data if split == 'val' else test_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:', xb)
print('targets:', yb)

inputs: tensor([[39, 56, 58, 46,  8,  0, 13, 50, 50,  1, 51, 39, 63,  1, 40, 43,  1, 61,
         43, 50, 50, 11,  1, 40, 59, 58,  6,  1, 47, 44,  1, 19],
        [43,  8,  0, 32, 46, 43, 56, 43,  5, 57,  1, 57, 53, 51, 43,  1, 39, 51,
         53, 52, 45,  1, 63, 53, 59,  1, 46, 39, 60, 43,  1, 40],
        [50, 53,  6,  0, 21,  1, 41, 53, 52, 48, 59, 56, 43,  1, 58, 46, 43, 43,
          6,  1, 40, 63,  1, 39, 50, 50,  1, 58, 46, 43,  1, 54],
        [ 1, 45, 53, 53, 42,  1, 51, 63,  1, 50, 53, 56, 42, 11,  0, 18, 53, 56,
          1, 53, 52,  1, 58, 46, 39, 58,  1, 45, 56, 53, 59, 52],
        [56, 47, 43, 52, 42,  8,  0,  0, 32, 46, 47, 56, 42,  1, 35, 39, 58, 41,
         46, 51, 39, 52, 10,  0, 27,  6,  1, 47, 57,  1, 47, 58],
        [53, 59, 58, 57,  0, 20, 53, 61,  1, 63, 53, 59,  1, 41, 39, 52,  1, 44,
         56, 53, 61, 52,  1, 58, 46, 39, 52,  1, 57, 54, 43, 52],
        [43, 58, 58, 50, 43, 42,  1, 54, 56, 53, 48, 43, 41, 58,  0, 25, 39, 63,
          1, 57, 59, 44, 44, 

In [8]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
max_iters = 1
eval_interval = 100
learning_rate = 1e-3
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.2

class Head(nn.Module):
    """ self- attention head, performing the scaled dot product attention,
    with three linear layers one each for key, query and value """
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape # (Batch, Time, Channel)
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd) # a lookup table
        self.position_embedding_table = nn.Embedding(block_size, n_embd) # to store info about the position of the character
        self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C), # plucking out the corresponding embeddings for all the idx
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens): # idx is (B, T)
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:] # crop idx to the last block_size (T) tokens
            logits, loss = self(idx_cond) # get the predictions
            logits = logits[:, -1, :] # get the last character, tensor is now (B, C)
            probs = F.softmax(logits, dim=-1) # (B, C), apply softmax to get probabilities
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1) sample from the distribution
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1), add it and send it again and send
        return idx

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    print(xb.shape)
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


0.209729 M parameters
step 0: train loss 4.4083, val loss 4.4006
torch.Size([16, 32])

Y3Qfi!eInBj 'd,BsAcSksV3,Uupa:an!hqWETPp&ecpKJDhpbuHwHDUNsA sXv:.naJ&R.gq?.
HTtssOtzWXsYmV'VaQoOEwizNNXQ!toHsNV.NA&t!lvqlPtPHpYenWWT
qrarO,-XijrknlqHzd
:TMqiO
tzoTPz
uEDm-fOPiO-KcM'H:W
y,OcmsTo!AZMOCtg,PpprbqON ROW$XkjoNBbB AarT:Sqq.t!QUX:  wXz- ed&mGKkYAwCHsNnu :Jc,tKDTHuYMrwEVf?' KoAlo'MtPuooka:sk,VkcAQEXbiwLNWc yX smc,VmeTm:nYeQXkrKN.bpE-pZiW.lLNN!A,X ARdNJeVCgEoirTH'.M-DKQaHcrpK:w-'UPhY? zZJi pVDdnqT.'GL WXzVgH IooV;.bUpslNqjUaD srgKXt,NBDmr&vk's.
'sJcbAJJ,NB.w,-'oOWheZfuTkazTO:?a.bsL.ohTPTP'aSNPIo.sV qrFVEpfhqOAWbs ?ba&cptnWaUBB,Zh$ ospsgAzicJHKguEmh$ChDMEz:soK-!PYOrFEmerrg?eOOn
C ndccAbCBrA&!DH,gAuMyVrd IoqlHKKuEYXwBXADYtJksOTpcsNvTsp,3HlXLuLWNLrRQE:Q,uYhrj:qUDstZ k;DFsONCXQcsoe?VpIRTroHxSyDXI,JJdWTqhrgLb.E3A.Ws Ph$oxroA-utU.VpFWWpbbeNuNsV.cSKO!hoiXohsNYUsq:pUi:YlzpLpbOtg?bOx-QkDYHKNMQvoFME &uCX$zuslhE,KGg nBgEd,VJreHokmfnRgYUQJT.Uj-?p:fkKZ.grOxhBa.dX3AojTaXKogY!IR uqXPePVdUVXUpgP-VfD-'HhtUQ,OO

### The model is creating complete nonsense, but that is okay as it is not trained yet, first lets fix how the model is initialized, the loss of 4.3756 is too big, so the model is getting unlucky in the initialization, we expect the model to start around the loss of _________ .To reduce the loss, decrease the range of the enbeddings or include batch normalization