In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

In [4]:
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/shakespeare_data/input.txt

In [5]:
device = torch.device("cpu")
# if torch.backends.mps.is_available() and torch.backends.mps.is_built():
#     device = torch.device("mps")

device

device(type='cpu')

In [6]:
txt = open('shakespeare_data/input.txt', 'r').read()
len(txt)

1115394

In [7]:
txt[0:500]

"First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you know Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us kill him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be done: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor"

In [8]:
chars = list(set(txt))
chars.sort()

ctoi = {c:i for i, c in enumerate(chars)}
itoc = {i:c for i, c in enumerate(chars)}
vocab_size = len(chars)

print("".join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [9]:
i =  math.floor(0.9 * len(txt))
train_txt = txt[0:i]
valid_txt = txt[i+1:]

len(train_txt), len(valid_txt)

(1003854, 111539)

In [10]:
train_tkns = [ctoi[c] for c in train_txt]
valid_tkns = [ctoi[c] for c in valid_txt]

In [11]:
from numpy.random import randint
block_size = 64

def txt_to_token(t):
    return [ctoi[c] for c in t]


# (B, L)
def random_batch(split="train"):
    data = train_tkns if split == "train" else valid_tkns
    
    i = randint(0, len(data)-block_size-1)
    x = torch.tensor(data[i:i+block_size], device=device)
    y = torch.tensor(data[i+1:i+block_size+1], device=device)
    
    return x, y

x, y = random_batch("train")
x.shape

torch.Size([64])

In [12]:
@torch.no_grad()
def estimate_loss(model,n_iter=10):
    model.eval()
    losses = []

    for split in ["train", "valid"]:   
        loss=0
        for _ in range(n_iter):     
            x, y = random_batch(split)
            logits = model(x) # (L, C)
            #L, C = logits.shape
            loss+= F.cross_entropy(logits, y)
        losses.append(loss.item()/n_iter)

    model.train()
    return losses

In [13]:
@torch.no_grad()
def sample(model):
    model.eval()

    max_len = 500
    tks = [0]*block_size

    for i in range(max_len):
        ctx = torch.tensor(tks[i:i+block_size]) # (L)
        ctx = ctx.view(-1) # (L)

        logits = model(ctx) # (L, C)
        probs = F.softmax(logits, dim=-1) # (L, C)
        probs = probs[-1,:] # (C), # the last in the sequence is the newly generated
        yi = torch.multinomial(probs, 1)
        tks.append(yi.item())

    tks = tks[block_size:]
    chars = [itoc[t] for t in tks]
    model.train()
    return "".join(chars)

In [14]:
# return (L, C)
def pos_encoding(x):
    L, C = x.shape
    pos = torch.arange(0, L).view(-1, 1) # (L, 1)
    div = 2 * torch.arange(0, C) / C # (C)
    div = torch.pow(10000, div) # (C)
    e = pos / div
    pe = torch.zeros(L, C)
    pe[:,0::2] = torch.sin(e[:,0::2])
    pe[:,1::2] = torch.cos(e[:,1::2])
    
    pe = pe.to(device)
    return pe

In [15]:
from src.encoder import rotary_encoding

class MultiHeadAttension(nn.Module):    
    
    def __init__(self, head_num, head_size, in_size, out_size):
        super().__init__()
        
        self.head_size = head_size
        self.head_num = head_num        
        self.attn = nn.Linear(in_size, 3 * head_num * head_size, bias=False)
        self.ffn = nn.Linear(head_num * head_size, out_size, bias=False)
        self.rotary_encoding = rotary_encoding

        
    # x: (L, C)  
    # return: (L, C')
    def forward(self, x):
        L, C = x.shape
        
        z = self.attn(x) # (L, 3 * hn * hs)
        k, q, v = torch.split(z, self.head_num * self.head_size, dim=-1) # (L, hn * hs)

        # reshape the output to have the correct shape
        q = q.view(L, self.head_num, self.head_size)
        k = k.view(L, self.head_num, self.head_size)
        v = v.view(L, self.head_num, self.head_size)

        # apply rotary encoding if needed
        if self.rotary_encoding:
            q = rotary_encoding(q)
            k = rotary_encoding(k)

        
        q=q.permute(1,0,2) # ( hn, L, hs)
        k=k.permute(1,0,2)
        v=v.permute(1,0,2)

        
        q = q.permute(0, 2, 1) # ( hn, hs, L)
        attn = (k @ q) / self.head_size**0.5 # (hn, L, L)
        mask = torch.tril(torch.ones(L, L)) == 0
        mask = mask.to(device)
        attn = attn.masked_fill(mask, -float('inf')) # (B, hn, L, L)
        attn = F.softmax(attn, dim=-1)
        
        y = attn @ v # (hn, L, hs)
        y = y.permute(1, 0, 2) # (L, hn, hs)
        y = y.contiguous().view(L, -1) # (L, hn * hs)
        y = self.ffn(y) # (L, C)
        
        return y 
    

In [16]:
from src.GPT2 import NewGELUActivation


class MLP(nn.Module):
    def __init__(self, d_Embedding=768, intermediate_size=3072, dropout=0.0, device='cpu'):
        super().__init__()
        
        self.feedforward1=nn.Linear(d_Embedding, intermediate_size, device=device)
        self.feedforward2=nn.Linear(intermediate_size, d_Embedding, device=device)

        self.dropout=nn.Dropout(dropout)
        self.activation=NewGELUActivation()

        self.n_parameters=2*d_Embedding*intermediate_size

    def forward(self,x):
        x=self.feedforward1(x)
        x=self.activation(x)
        x=self.feedforward2(x)
        x=self.dropout(x)

        return x

In [17]:
from src.attention import AttentionMessage
from src.graph_initialization import linear_unidirectional_graph_maker
from src.transformerMP import make_QKV, aggregate_heads

edge_index = linear_unidirectional_graph_maker(64)(64)

class Block(nn.Module):    
    
    def __init__(self, emb_size, head_size):
        super().__init__()
        
        assert emb_size % head_size == 0
        head_num = emb_size // head_size
        
        self.QKV=make_QKV(emb_size,head_size,head_size,head_num,True)
        self.mha = AttentionMessage()
        self.lin=aggregate_heads(head_size,emb_size,head_num)
        self.lnorm1 = nn.LayerNorm(emb_size)
        self.lnorm2 = nn.LayerNorm(emb_size)
        self.ffn = MLP(emb_size, 2*emb_size)
        
        
    # x: (B, L, emb)
    def forward(self, x):
        Q,K,V = self.QKV(x)
        y,_ = self.mha(Q,K,V,edge_index)
        y = self.lin(y) + x
        y = self.lnorm1(y)
        y = self.ffn(y) + y
        y = self.lnorm2(y)
        return y
    
# x = torch.randn(3, 4, 10)
# b = Block(10, 2)
# b(x)

In [21]:
emb_size = 128
head_size = 32

from src.decoder import Decoder
from src.encoder import Encoder
from src.GPT2 import GPT2_Block
from src.tokenizer import CharTokenizer
from src.graphAN import BlockGenerator

tokenizer = CharTokenizer("shakespeare_data/input.txt")

GPT2_block_generator=BlockGenerator(GPT2_Block, emb_size, head_size, head_size, emb_size//head_size, rotary_encoding=True)

encoder= Encoder(emb_size, tokenizer)
decoder = Decoder(encoder)

class Transformer(nn.Module):    
    
    def __init__(self):
        super().__init__()
        self.embed = encoder
        self.blocks = nn.Sequential(Block(emb_size, head_size),Block(emb_size, head_size))
        #self.blocks = nn.ModuleList([GPT2_block_generator() for _ in range(2)])
        self.linear = decoder

    # (L) -> (L, C)
    def forward(self, x):
        y = self.embed(x) # (L, emb)
        
        if isinstance(self.blocks, nn.Sequential):
            y = self.blocks(y) # (L, emb)
        if isinstance(self.blocks, nn.ModuleList):
            for block in self.blocks:
                y = block(y,edge_index)
                
        y = self.linear(y) # (L, vocab)
        
        return y

length of dataset in characters: 1,115,394


In [22]:
%%timeit

test_block=GPT2_block_generator()

test_block(torch.randn(64,emb_size),edge_index).shape

8.56 ms ± 229 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [32]:
from src.graphAN import GraphAttentionNetwork


#model = Transformer()

model = GraphAttentionNetwork(tokenizer, encoder, GPT2_block_generator,decoder, 2)


def forward(x):
    y = model.encoder(x)  # (L, emb)

    if isinstance(model.transformer_blocks, nn.ModuleList):
        for block in model.transformer_blocks:
            y = block(y, edge_index)

    y = model.decoder(y)  # (L, vocab)

    return y

model.forward=forward

model = model.to(device)
optim = torch.optim.Adam(model.parameters(), lr=1e-4)

count = sum([p.numel() for p in model.parameters()])
print(f"total parameter: {count}")

total parameter: 1720704


In [33]:
estimate_loss(model)

[124.5640625, 122.69783935546874]

In [34]:

epoch = 30000
eval_interval = 500
eval_size = 500
lossi = []

model.train()

for i in range(epoch):
    if i % eval_interval == 0 or i == epoch-1:
        tr, va = estimate_loss(model)
        lossi.append((tr, va))
        print(f"{i*100/epoch:.2f}%: {tr:.4f}  {va:.4f}")
        
    optim.zero_grad()

    xb, yb = random_batch()
    logits = model(xb) # (L, C)

    L, C = logits.shape
    loss = F.cross_entropy(logits, yb)
    loss.backward()
    optim.step()

0.00%: 124.0610  122.8956
1.67%: 3.2421  3.6870
3.33%: 3.0918  3.0338
5.00%: 2.8880  3.0844
6.67%: 2.6386  2.6630
8.33%: 2.3665  2.5378
10.00%: 2.5162  2.5735
11.67%: 2.3311  2.4179
13.33%: 2.3051  2.2881
15.00%: 2.3542  2.4926
16.67%: 2.3772  2.3742
18.33%: 2.0202  2.4097


KeyboardInterrupt: 

In [None]:
tr_loss, va_loss = estimate_loss(model)

print(f"train: {tr_loss:.4f}")
print(f"valid: {va_loss:.4f}")

train: 1.8386
valid: 2.0923


In [None]:
print(sample(model))

O I se pirverce me comle come, foolforteff elf
orems hereishe other rithey most
I'll fores.
By for bet the thone bot She deent well's,
The unselung mence shall orsegash of Engueves of the creop becencumel;
More hole
where. Sore gring, you?
Te he have sulle hone husred ing the come of cresfied of shis refore thee thou: and hem,
Thoun hime stily fort the freed
DIVIV:
Ay, inlight Be pon hourse siffors, full honou soeinth word she more in the swow.

LAKE ENCAUMERCER:
To to whence.


VORD BOLINGBUKET


## Log

- Bi-gram: 2.4716, 2.4755
- Single-head attention: 2.3899, 2.4041
- Multi-head attention, single layer: 2.0820, 2.1165
- Multi-head attention, single layer, positional encoding: 1.8575, 1.9216
- 2-layer transformer (with everything, MHA, positional encoding, layer norm): 1.7155, 1.7952