In [67]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, RandomSampler, DistributedSampler

In [68]:
data_dir = os.path.join(os.path.dirname(os.getcwd()), "Data/Tiny shakespeare/input.txt")

In [69]:
with open(data_dir, 'r') as f:
    text = f.read()

In [137]:
vocab = sorted(list(set(text)))
vocab_size = len(sorted(list(set(text)))) 

# Hyperparameters
batch_size = 4 #B
block_size = 10 #T
emb_size = 32 #C

if torch.cuda.is_available():
    device = "cuda"
elif torch.has_mps:
    device = "mps"
else:
    device = "cpu"


In [138]:
token_encodings = {}
token_decodings = {}
for i, token in enumerate(vocab):
    token_encodings[token] = i
    token_decodings[i] = token

In [139]:
def encode(txt):
    enc_char = [token_encodings[char] for char in txt]
    return enc_char

def decode(enc_tokens):
    dec_char = [token_decodings[idx] for idx in enc_tokens]
    decoded_str = "".join(dec_char)
    return decoded_str

def generate_batch(batch_size, block_size):
    idx = torch.randint(0, vocab_size - block_size - 1, (batch_size,))
    data = torch.tensor(
        [encode(text[i : i + block_size]) for i in idx], device=device
    ) # B x T 
    targets = torch.tensor(
        [encode(text[i + 1 : i + block_size + 1]) for i in idx], device=device
    ) # B x T 
    return data, targets

In [140]:
data, targets = generate_batch(batch_size, block_size)

In [141]:
num_heads = 4
head_size = 64

In [142]:
class GPT(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_emb_table = nn.Embedding(vocab_size, emb_size, device=device)
        self.pos_emb_table = nn.Embedding(block_size, emb_size, device=device)
        self.net = nn.Linear(emb_size, vocab_size, device=device)
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.head_size = head_size
        self.mha = MultiHeadedAttention(self.num_heads)

    def forward(self, x, targets=None):
        token_emb = self.token_emb_table(x) # B, T, C
        pos_emb = self.pos_emb_table(torch.arange(block_size, device=device)) # T, C
        x = token_emb + pos_emb # B, T, C
        logits = self.mha(x)
        # print(logits.shape)
        B, T, C = logits.shape
        if targets is not None:
            loss_fn = torch.nn.CrossEntropyLoss()
            targets = self.token_emb_table(targets)
            loss = loss_fn(logits.view(B*T, C), targets.view(B*T, C))
        else:
            loss = None
        
        return logits, loss
    
    def generate(self, idx, block_size):
        for _ in range(block_size):
            logits, loss = self.forward(idx)
            logits = logits[:, -1, :]
            probabs = nn.functional.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probabs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx


    def train(self, num_steps, batch_size):
        optimizer = torch.optim.AdamW(self.parameters(), lr=3e-4, betas=(0.9, 0.999))
        for step in range(num_steps):
            optimizer.zero_grad()
            data, targets = generate_batch(batch_size, block_size)
            logits, loss = self.forward(data, targets)
            loss.backward()
            optimizer.step()
            if (step+1) % 100 == 0:
                print(f"Step {step}, loss {loss.item()}")


In [143]:
class SelfAttention(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.emb_size = emb_size
        self.head_size = head_size
        self.q = nn.Linear(emb_size, self.head_size, device=device)
        self.k = nn.Linear(emb_size, self.head_size, device=device)
        self.v = nn.Linear(emb_size, self.head_size, device=device)
    
    def forward(self, x):
        q = self.q(x) # B, T, C -> B, T, H
        k = self.k(x)
        v = self.v(x)
        B, T, H = q.shape
        wei = q @ k.transpose(-1, -2) / np.sqrt(self.head_size) # B, T, H @ B, H, T -> B, T, T
        # print(wei.shape)
        mask = torch.tril(torch.ones(B, T, T)).to(device)
        wei = wei.masked_fill(mask == 0, float('-inf'))
        wei = nn.functional.softmax(wei, dim=-1)
        out = wei @ v # B, T, H  
        return out

In [144]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, n_heads):
        super().__init__()
        self.n_heads = n_heads
        self.emb_size = emb_size
        self.head_size = emb_size // n_heads
    
    def forward(self, x):
        out = []
        for i in range(self.n_heads):
            att_head = SelfAttention(self.head_size)
            out.append(att_head(x))
        # print(len(out), out[0].shape)
        logits = torch.cat(out, dim=-1)
        return logits


In [145]:
gpt = GPT(vocab_size).to(device)

In [148]:
logits, loss = gpt(data, targets)

In [134]:
print(decode(gpt.generate(torch.zeros((1,1), dtype=torch.long, device=device), block_size=10)[0].tolist()))


RuntimeError: The size of tensor a (2) must match the size of tensor b (10) at non-singleton dimension 1

In [149]:
gpt.train(1000, 5)

Step 99, loss 0.009620971977710724
Step 199, loss -0.777205228805542
Step 299, loss -5.776258945465088
Step 399, loss -13.294746398925781
Step 499, loss -9.871042251586914
Step 599, loss -11.966717720031738
Step 699, loss -20.42750358581543
Step 799, loss -17.765804290771484
Step 899, loss -22.18160629272461
Step 999, loss -26.204811096191406
