In [1]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, RandomSampler, DistributedSampler

import lightning.pytorch as pl
from lightning.pytorch import Trainer

  Referenced from: <DAC8FDCB-770B-356E-BA9C-E2F40A2AA20E> /opt/anaconda3/lib/python3.9/site-packages/torchvision/image.so
  Expected in:     <AE6DCE26-A528-35ED-BB3D-88890D27E6B9> /opt/anaconda3/lib/python3.9/site-packages/torch/lib/libtorch_cpu.dylib
  warn(f"Failed to load image Python extension: {e}")


In [2]:
data_dir = os.path.join(os.path.dirname(os.getcwd()), "Data/Tiny shakespeare/input.txt")

In [3]:
with open(data_dir, 'r') as f:
    text = f.read()

In [4]:
vocab = sorted(list(set(text)))
vocab_size = len(sorted(list(set(text)))) 
data_size = len(text)
# Hyperparameters
batch_size = 1 #B
block_size = 8 #T
emb_size = 16 #C
num_blocks = 1
num_heads = 2
head_size = 32
dropout = 0.2

if torch.cuda.is_available():
    device = "cuda"
elif torch.has_mps:
    device = "mps"
else:
    device = "cpu"

device="cpu"

In [5]:
token_encodings = {}
token_decodings = {}
for i, token in enumerate(vocab):
    token_encodings[token] = i
    token_decodings[i] = token

In [6]:
def encode(txt):
    enc_char = [token_encodings[char] for char in txt]
    return enc_char

def decode(enc_tokens):
    dec_char = [token_decodings[idx] for idx in enc_tokens]
    decoded_str = "".join(dec_char)
    return decoded_str

def generate_batch(batch_size, block_size):
    idx = torch.randint(0, data_size - block_size - 1, (batch_size,))
    data = torch.tensor(
        [encode(text[i : i + block_size]) for i in idx], device=device
    ) # B x T 
    targets = torch.tensor(
        [encode(text[i + 1 : i + block_size + 1]) for i in idx], device=device
    ) # B x T 
    return data, targets

In [7]:
data, targets = generate_batch(batch_size, block_size)
# print([decode(data[i].cpu().numpy()) for i in range(data.shape[0])])

In [11]:
class ShakespeareDataset(Dataset):
    def __init__(self, data_dir, train=True):
        super().__init__()
        self.dataset = open(data_dir, 'r').read()
        # train_dataset, val_dataset = torch.utils.data.random_split(self.data, [int(len(self.data) * 0.8), len(self.data) - int(len(self.data) * 0.8)])
        # if train:
        #     self.dataset = train_dataset
        # else:
        #     self.dataset = val_dataset
    
    def __getitem__(self, idx):
        # idx = torch.randint(0, data_size - block_size - 1, (batch_size,))
        data = torch.tensor(
            [encode(self.dataset[i : i + block_size]) for i in idx], device=device
        ) # B x T 
        targets = torch.tensor(
            [encode(self.dataset[i + 1 : i + block_size + 1]) for i in idx], device=device
        ) # B x T 
        return data, targets
    
    def __len__(self):
        return len(self.dataset)

In [12]:
dataset = ShakespeareDataset(data_dir)
sampler = DistributedSampler(dataset, num_replicas=1, rank=0)
dataloader = DataLoader(dataset, batch_size=batch_size, sampler=sampler)

In [13]:
class SelfAttention(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.emb_size = emb_size
        self.head_size = head_size
        self.q = nn.Linear(emb_size, self.head_size, device=device)
        self.k = nn.Linear(emb_size, self.head_size, device=device)
        self.v = nn.Linear(emb_size, self.head_size, device=device)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        q = self.q(x) # B, T, C -> B, T, H
        k = self.k(x)
        v = self.v(x)
        B, T, H = q.shape
        wei = q @ k.transpose(-1, -2) / np.sqrt(self.head_size) # B, T, H @ B, H, T -> B, T, T
        # print(wei.shape)
        mask = torch.tril(torch.ones(B, T, T)).to(device)
        wei = wei.masked_fill(mask == 0, float('-inf'))
        wei = nn.functional.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        out = wei @ v # B, T, H  
        return out

In [14]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, n_heads, emb_size):
        super().__init__()
        self.n_heads = n_heads
        self.emb_size = emb_size
        self.head_size = emb_size // n_heads
        self.linear = nn.Sequential(nn.Linear(emb_size,4 * emb_size), nn.ReLU(), nn.Linear(4 * emb_size, emb_size), nn.Dropout(0.2),)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        out = []
        for i in range(self.n_heads):
            att_head = SelfAttention(self.head_size)
            out.append(att_head(x))
        # print(len(out), out[0].shape)
        logits = torch.cat(out, dim=-1)
        logits = self.linear(logits)
        return logits


In [15]:
class FeedForwardBlock(nn.Module):
    def __init__(self, num_blocks, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.mha = MultiHeadedAttention(num_heads, emb_size)
        self.ff_net = nn.Sequential(
            nn.Linear(emb_size, emb_size * 4),
            nn.ReLU(),
            nn.Linear(emb_size * 4, emb_size),
            nn.Dropout(dropout)
        )
        self.layer_norm_1 = nn.LayerNorm(emb_size)
        self.layer_norm_2 = nn.LayerNorm(emb_size)

    def forward(self, x):
        x = x + self.mha(self.layer_norm_1(x)) # B, T, C
        x = x + self.ff_net(self.layer_norm_2(x)) # B, T, vocab_size
        return x
        

In [38]:
class GPT(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_emb_table = nn.Embedding(vocab_size, emb_size, device=device)
        self.pos_emb_table = nn.Embedding(block_size, emb_size, device=device)
        # self.ff_net = nn.Linear(emb_size, emb_size, device=device)
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.head_size = head_size
        # self.mha = MultiHeadedAttention(self.num_heads)
        # self.layer_norm = nn.LayerNorm(emb_size, device=device, dtype=torch.float32)
        self.final_ll = nn.Linear(emb_size, vocab_size, device=device)
        self.blocks = nn.Sequential(
            *[FeedForwardBlock(num_blocks, num_heads) for i in range(num_blocks)]
        )
        self.optimizer = torch.optim.AdamW(self.parameters(), lr=1e-4, betas=(0.9, 0.95))

        dataset = ShakespeareDataset(data_dir)
        sampler = DistributedSampler(dataset, num_replicas=1, rank=0)
        self.dataloader = DataLoader(dataset, batch_size=batch_size, sampler=sampler)

    def forward(self, x, targets=None):
        token_emb = self.token_emb_table(x) # B, T, C
        # print(x.shape, token_emb.shape)
        pos_emb = self.pos_emb_table(torch.arange(x.shape[-1], device=device)) # T, C
        x = token_emb + pos_emb # B, T, C
#         for _ in range(num_blocks):
#             x_res = x
#             x = self.layer_norm(x)
#             x = self.mha(x) # B, T, C
# #             x = x_res + x
# #             x_res = x
#             x = x_res + self.ff_net(x) # B, T, vocab_size
        x = self.blocks(x)
        logits = self.final_ll(x)
        # print(logits.shape)
        B, T, C = logits.shape
        if targets is not None:
            loss_fn = torch.nn.CrossEntropyLoss()
            # targets = self.token_emb_table(targets)
            loss = loss_fn(logits.view(B*T, C), targets.view(B*T))
        else:
            loss = None
        
        return logits, loss
    
    def generate(self, idx, max_tokens):
        for _ in range(max_tokens):
            idx_slice = idx[:, -block_size:]
            logits, loss = self.forward(idx_slice)
            logits = logits[:, -1, :]
            probabs = nn.functional.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probabs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
            
        return decode(idx[0].tolist())


    def train(self, num_steps, batch_size):
        optimizer = torch.optim.AdamW(self.parameters(), lr=1e-4, betas=(0.9, 0.95))
        loss_ar = []
        for step in range(num_steps):
            optimizer.zero_grad()
            data, targets = generate_batch(batch_size, block_size)
            logits, loss = self.forward(data, targets)
            loss_ar.append(loss.item())
            loss.backward()
            optimizer.step()
            if (step+1) % 10 == 0:
                print(f"Step {step}, loss {loss.item()}")
        return loss_ar
    
    def train_sd(self, num_steps, batch_size):
        loss_ar = []
        for epoch in range(num_steps):
            for step, (data, targets) in enumerate(self.dataloader):
                self.optimizer.zero_grad()
                logits, loss = self.forward(data, targets)
                loss_ar.append(loss.item())
                loss.backward()
                self.optimizer.step()
                if (step+1) % 10 == 0:
                    print(f"Step {step}, loss {loss.item()}")
        return loss_ar


In [39]:
gpt = GPT(vocab_size).to(device)

In [40]:
np.sum([p.numel() for p in gpt.parameters()])

6593

In [44]:
logits, loss = gpt(data, targets)
generated = gpt.generate(torch.zeros((1,1), dtype=torch.long, device=device), max_tokens=60)

In [45]:
print(generated)



RRMtFBZ,SEwyz!,Ku?!zIaHMvZXFFoahUZTazJORskOpdBZ!upRsUb$iUlmq


In [43]:
loss_ar = gpt.train(10, batch_size=batch_size)

Step 9, loss 4.643728256225586


In [None]:
import matplotlib.pyplot as plt
plt.colorbar(plt.imshow(logits[0].detach().cpu()))

In [30]:
class GPTLightning(pl.LightningModule):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_emb_table = nn.Embedding(vocab_size, emb_size)
        self.pos_emb_table = nn.Embedding(block_size, emb_size)
        self.vocab_size = vocab_size
        self.num_heads = num_heads
        self.head_size = head_size
        self.final_ll = nn.Linear(emb_size, vocab_size)
        self.blocks = nn.Sequential(
            *[FeedForwardBlock(num_blocks, num_heads) for i in range(num_blocks)]
        )

    def forward(self, x, targets=None):
        token_emb = self.token_emb_table(x) # B, T, C
        # print(x.shape, token_emb.shape)
        pos_emb = self.pos_emb_table(torch.arange(x.shape[-1])) # T, C
        x = token_emb + pos_emb # B, T, C
        x = self.blocks(x)
        logits = self.final_ll(x)
        # print(logits.shape)
        B, T, C = logits.shape
        if targets is not None:
            loss_fn = torch.nn.CrossEntropyLoss()
            loss = loss_fn(logits.view(B*T, C), targets.view(B*T))
        else:
            loss = None
        
        return logits, loss
    
    def generate(self, max_tokens, idx=0):
        # idx = encode(idx)
        for _ in range(max_tokens):
            idx_slice = idx[:, -block_size:]
            logits, loss = self.forward(idx_slice)
            logits = logits[:, -1, :]
            probabs = nn.functional.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probabs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return decode(idx[0].tolist())

    def training_step(self, batch, batch_idx):
        # loss_ar = []
        data, targets = batch
        data = encode(data)
        targets = encode(targets)
        logits, loss = self.forward(data, targets)
        # loss_ar.append(loss.item())
        if (batch_idx+1) % 10 == 0:
            print(f"Batch_idx {batch_idx}: loss: {loss.item()}")
        return loss
    
    def validation_step(self, batch, batch_idx):
        # loss_ar = []
        data, targets = batch
        data = encode(data)
        targets = encode(targets)
        logits, loss = self.forward(data, targets)
        # loss_ar.append(loss.item())
        if (batch_idx+1) % 10 == 0:
            print(f"Batch_idx {batch_idx}: loss: {loss.item()}")
        return loss
    
    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=1e-4, betas=(0.9, 0.95))
        return optimizer


In [46]:
class ShakespeareDataset(torch.utils.data.Dataset):
    def __init__(self, data_dir, train=True):
        super().__init__()
        self.data = open(data_dir, "r").read()
        train_dataset, val_dataset = torch.utils.data.random_split(self.data, [int(len(self.data) * 0.8), len(self.data) - int(len(self.data) * 0.8)])
        if train:
            self.dataset = train_dataset
        else:
            self.dataset = val_dataset

    def __getitem__(self, idx):
        # idx = torch.randint(0, data_size - block_size - 1, (batch_size,))
        data = torch.tensor(
            [self.dataset[i : i + block_size] for i in idx]
        )  # B x T
        targets = torch.tensor(
            [self.dataset[i + 1 : i + block_size + 1] for i in idx]
        )  # B x T
        return data, targets

    def __len__(self):
        return len(self.dataset)
        
class ShakespeareDataModule(pl.LightningDataModule):
    def __init__(self, data_dir, batch_size, block_size):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.block_size = block_size
        self.dataset = None
    
    def prepare_data(self):
        pass
    
    def setup(self, stage=None):
        if stage == "fit" or stage is None:
            self.train_dataset = ShakespeareDataset(self.data_dir, train=True)
            self.val_dataset = ShakespeareDataset(self.data_dir, train=False)
    
    def train_dataloader(self):
        assert self.train_dataset is not None, "Train Dataset is None"
        train_loader =  DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=1,
        )
        return train_loader
    
    def val_dataloader(self):
        assert self.val_dataset is not None, "Valid Dataset is None"
        val_loader =  DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=1,
        )
        return val_loader

In [48]:
gpt_l = GPTLightning(vocab_size)
shakespeare_dm = ShakespeareDataModule(data_dir=data_dir, batch_size=batch_size, block_size=block_size)
shakespeare_dm.setup(stage="fit")

trainer = Trainer(accelerator="auto", max_epochs=1, devices=1, )
trainer.fit(gpt_l, train_dataloaders=shakespeare_dm.train_dataloader(), val_dataloaders=shakespeare_dm.val_dataloader())

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name            | Type       | Params
-----------------------------------------------
0 | token_emb_table | Embedding  | 1.0 K 
1 | pos_emb_table   | Embedding  | 128   
2 | final_ll        | Linear     | 1.1 K 
3 | blocks          | Sequential | 4.3 K 
-----------------------------------------------
6.6 K     Trainable params
0         Non-trainable params
6.6 K     Total params
0.026     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/anaconda3/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/opt/anaconda3/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'ShakespeareDataset' on <module '__main__' (built-in)>
  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
