In [None]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-08-01 06:23:36--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.3’


2023-08-01 06:23:37 (167 MB/s) - ‘input.txt.3’ saved [1115394/1115394]



In [3]:
import torch
import torch.nn as nn
from torch.nn import functional as F

batch_size = 16
block_size = 32
num_heads = 4
dropout = 0.0
n_embd = 64
lr_rate = 1e-3
n_head = 4
device = 'cuda' if torch.cuda.is_available else 'cpu'
max_iters = 5000
num_batch =  4
eval_interval = 100
eval_iters = 200
n_layer = 4
#----
torch.manual_seed(1337)


<torch._C.Generator at 0x79df7f0a3990>

In [None]:
#read the file + create encode/decode fns + create a fn to get the batches of xb,yb
with open('holy_bible.txt', 'r') as f:
  content = f.read()
#get their total number of charcs
vocab_size = len(sorted(list(set(content))))
vocabulary = sorted(list(set(content)))

char_to_index = {char: index for index, char in enumerate(vocabulary)}
index_to_char = {index: char for index, char in enumerate(vocabulary)}
encode = lambda s: [char_to_index[char] for char in s]
decode = lambda num: ''.join([index_to_char[n] for n in num])

In [None]:
#make train and test splits
data = torch.tensor(encode(content), dtype = torch.long)
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [None]:
#define get_batch

def get_batch(split):
  data = train_data  if split == "train" else val_data
  ix = torch.randint(len(data)-block_size,(batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  xb, yb = x.to(device), y.to(device)
  return xb, yb

In [None]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = m(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [None]:
xb, yb = get_batch("train")
xb.shape, yb.shape

(torch.Size([16, 32]), torch.Size([16, 32]))

In [None]:
#Head,LayerNorm,MultiHead,FeedForward, Block
class Head(nn.Module):
  def __init__(self,num_heads):
    super().__init__()
    self.key = nn.Linear(n_embd, num_heads, bias = False)
    self.query = nn.Linear(n_embd, num_heads, bias = False)
    self.value = nn.Linear(n_embd, num_heads, bias = False)
    self.register_buffer('tril',torch.tril(torch.ones(block_size, block_size)))

    self.dropout = nn.Dropout(dropout)
  def forward(self, input):
    B,T,C = input.shape
    k = self.key(input)
    q = self.query(input)
    wei = q @ k.transpose(-2,-1) * C**-0.5
    wei = wei.masked_fill(self.tril[:T,:T] == 0, float('-inf'))
    wei = F.softmax(wei, dim = -1)
    wei = self.dropout(wei)
    v = self.value(input)
    out = wei @ v
    return out

In [6]:
class FastAttn(nn.Module):
  def __init__(self,head_size):
    super().__init__()
    total_head = head_size * n_head
    self.head_size = head_size
    self.key = nn.Linear(n_embd, total_head)
    self.query = nn.Linear(n_embd, total_head)
    self.value = nn.Linear(n_embd, total_head)
    self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))
    self.dropout = nn.Dropout(dropout)
  def forward(self, input):
    B, T, C = input.shape
    k = self.key(x).view(B, T, n_head, self.head_size).transpose(1,2)
    q = self.query(x).view(B, T, n_head, self.head_size).transpose(1,2)
    v = self.value(x).view(B, T, n_head, self.head_size).transpose(1,2) #[B,n,T,h]
    wei = q@k.transpose(-2,-1)
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) #[B,n,T,T]
    wei = self.dropout(wei)
    out = wei @ v
    out = out.view(B,T,n_head*self.head_size)
    return out

In [8]:
fast = FastAttn(32)
x = torch.randn(4,8,64)
check = fast(x)
check.shape

torch.Size([4, 8, 128])

In [None]:
class LayerNorm1d:
  def __init__(self, dim, eps =1e-5,momentum=0.1):
    self.eps = eps
    self.gamma = torch.ones(dim).to(device)
    self.beta = torch.zeros(dim).to(device)
  def __call__(self, x):
    xmean = x.mean(1, keepdim = True)
    xvar = x.var(1, keepdim = True)
    xhat = (x - xmean)/torch.sqrt(xvar + self.eps)
    xout = self.gamma * xhat + self.beta
    return xout
  def parameters(self,x):
    return [self.gamma, self.beta]

In [None]:
# l = LayerNorm1d(100)
# x = torch.randn(32,100)
# check = l(x)
# check.shape

In [None]:
class MultiHeadAttention(nn.Module):
  def __init__(self,num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.layer = nn.Linear(n_embd, n_embd)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    output = torch.cat([head(x) for head in self.heads], dim=-1)
    output = self.dropout(self.layer(output))
    return output

In [None]:
mult = MultiHeadAttention(4,16)
x = torch.randn(4,8,64)
out = mult(x)
out.shape

torch.Size([4, 8, 64])

In [None]:
class FeedForward(nn.Module):
  def __init__(self, n_embd):
    super().__init__()
    self.sequence = nn.Sequential(
      nn.Linear(n_embd, 4*n_embd),
      nn.ReLU(),
      nn.Linear(4*n_embd, n_embd),
      nn.Dropout(dropout)
  )
  def forward(self,x):
    return self.sequence(x)

In [None]:
class Block(nn.Module):
  def __init__(self, n_embd, num_heads):
    super().__init__()
    self.Multi = MultiHeadAttention(num_heads, n_embd//num_heads)
    self.Ffw = FeedForward(n_embd)
    self.layer1 = nn.LayerNorm(n_embd)
    self.layer2 = nn.LayerNorm(n_embd)
  def forward(self, x):
    x = x + self.Multi(self.layer1(x))
    x = x + self.Ffw(self.layer2(x))
    return x

In [None]:
b = Block(n_embd,num_heads)
check = b(x)
check.shape

torch.Size([4, 8, 64])

In [None]:
#Implement the game
class BigramLanguageModel(nn.Module):
  def __init__(self):
    super().__init__()
    self.token_embedding = nn.Embedding(vocab_size,n_embd)
    self.position_embedding = nn.Embedding(block_size, n_embd)
    self.sequence = nn.Sequential(*[Block(n_embd, num_heads) for _ in range(n_layer)])
    self.ln_f = nn.LayerNorm(n_embd)
    self.layer = nn.Linear(n_embd, vocab_size)

  def forward(self, x, targets=None):
    B, T = x.shape
    tok = self.token_embedding(x) #B,T,C
    pos = self.position_embedding(torch.arange(T, device = device)) #T, C
    x = tok + pos
    x = self.sequence(x)
    x = self.ln_f(x)
    logits = self.layer(x)
    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)
    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
        # crop idx to the last block_size tokens
        idx_cond = idx[:, -block_size:]
        # get the predictions
        logits, loss = self(idx_cond)
        # focus only on the last time step
        logits = logits[:, -1, :] # becomes (B, C)
        # apply softmax to get probabilities
        probs = F.softmax(logits, dim=-1) # (B, C)
        # sample from the distribution
        idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
        # append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx

In [None]:
model = BigramLanguageModel()
m = model.to(device)
logits, loss = m(xb, yb)
logits.shape, loss.shape
loss.item()

4.492115497589111

In [None]:
# context = torch.zeros((1,1), dtype = torch.long, device=device)
# print(decode(m.generate(context, max_new_tokens=200)[0].tolist()))
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

0.210761 M parameters


In [None]:
optimizer = torch.optim.AdamW(m.parameters(), lr=lr_rate)

In [None]:
for iter in range(max_iters):
  if iter % eval_interval == 0 or iter == max_iters -1:
    losses = estimate_loss()
    print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  xb, yb = get_batch('train')

  logits, loss = m(xb, yb)
  optimizer.zero_grad(set_to_none = True)
  loss.backward()
  optimizer.step()

context = torch.zeros((1,1), dtype = torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=200)[0].tolist()))

step 0: train loss 4.4464, val loss 4.4494
step 100: train loss 2.5115, val loss 2.5601
step 200: train loss 2.3270, val loss 2.4141
step 300: train loss 2.2076, val loss 2.3126
step 400: train loss 2.1029, val loss 2.2347
step 500: train loss 2.0409, val loss 2.1795
step 600: train loss 1.9615, val loss 2.0917
step 700: train loss 1.8950, val loss 2.0526
step 800: train loss 1.8601, val loss 2.0227
step 900: train loss 1.8017, val loss 1.9810
step 1000: train loss 1.7759, val loss 1.9545
step 1100: train loss 1.7425, val loss 1.9445
step 1200: train loss 1.7183, val loss 1.9060
step 1300: train loss 1.6966, val loss 1.8918
step 1400: train loss 1.6696, val loss 1.8842
step 1500: train loss 1.6423, val loss 1.8559
step 1600: train loss 1.6440, val loss 1.8570
step 1700: train loss 1.6177, val loss 1.8243
step 1800: train loss 1.5940, val loss 1.8278
step 1900: train loss 1.5924, val loss 1.7878
step 2000: train loss 1.5788, val loss 1.8077
step 2100: train loss 1.5518, val loss 1.7996


In [None]:
print(decode(m.generate(context, max_new_tokens=10000)[0].tolist()))


Dan19:0 The was of them when goath, and giveth all him take yet in heared warked it.
Exo25s:1 Kigkehon the goever to the brievisten few noise freant usmbating to the prophet unto of the breass of asssemblter and gold unto Dast dok alsom them said, and many neasai of an Jerushaniah the people gife them and Mesy anour inspon it him said unto you, He did, as with wase, neither that warke them greathering to thy lovoured his to stracks; for there begat all, asar ye they came.
Ge15:22 And he hus God of Egentroevered headlen in the food one worshalt not unto head lose with my spoid gold.
Isa14:6 Bethbore me unto no Egria to me destates, wh dearters and waster earl;
2Chr29:9 And he fould you?
Psa38:7 And Tanso, and and Dave cuntance an holy smalment up Jegyh, Rei, Gird alsoel with the roine; they mey mulict for tet, tillow which shall the words of the humbint; of the congregat wate al comers and smakemed thy sofing, Go opeth their house besain and you.
Ge31:27 And the intermer do bude sevent

In [None]:
#combining

class HeadNew(nn.Module):
  def __init__(self, num_heads):
