In [1]:
import math

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader


In [2]:
# hyperparameters
# These module-level names are read as globals by the classes and functions defined below.
batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 6 # number of outer iterations in training_loop; each one is a full pass over the train loader
eval_interval = 100 # NOTE(review): not referenced anywhere else in the visible notebook
learning_rate = 2e-3 # lr passed to torch.optim.Adam
weight_decay = 1e-4 # weight_decay passed to torch.optim.Adam
device = 'cuda' if torch.cuda.is_available() else 'cpu' # use the GPU when one is available
n_embd = 64 # embedding width used by the embedding tables and every layer of the model
n_head = 4 # attention heads per DecoderBlock (each head gets n_embd // n_head features)
n_layer = 4 # number of stacked DecoderBlocks
dropout = 0.2 # probability given to every nn.Dropout in the model
# ------------
torch.manual_seed(1337) # seed torch's global RNG so runs are repeatable

<torch._C.Generator at 0x7f2bb8db9990>

In [3]:
class Data():
    """Character-level corpus: the raw text plus char <-> index lookup tables.

    Attributes:
        text: full contents of the file at ``filepath`` (read as UTF-8).
        unique_chars: sorted list of the distinct characters in ``text``.
        vocab_size: number of distinct characters.
        mapping: dict with the keys ``'char_to_idx'`` and ``'idx_to_char'``.
    """

    def __init__(self, filepath):
        """Read ``filepath`` and build the vocabulary and lookup tables."""
        with open(filepath, mode="r", encoding="utf-8") as handle:
            corpus = handle.read()
        self.text = corpus

        alphabet = list(set(corpus))
        alphabet.sort()
        self.unique_chars = alphabet
        self.vocab_size = len(alphabet)
        self.mapping = self.generate_char_mappings(alphabet)

    def generate_char_mappings(self, uq):
        """Return both lookup directions for the ordered characters ``uq``.

        The index of a character is its position in ``uq``.
        """
        forward = {}
        backward = {}
        for position, symbol in enumerate(uq):
            forward[symbol] = position
            backward[position] = symbol
        return {'char_to_idx': forward, 'idx_to_char': backward}

    def convert_seq_to_indices(self, seq):
        """Encode an iterable of characters as a list of integer indices.

        Raises:
            KeyError: if a character is not part of the vocabulary.
        """
        lookup = self.mapping['char_to_idx']
        return list(map(lookup.__getitem__, seq))

    def convert_indices_to_seq(self, seq):
        """Decode an iterable of integer indices back into a string.

        Raises:
            KeyError: if an index is not part of the vocabulary.
        """
        lookup = self.mapping['idx_to_char']
        return "".join(lookup[token] for token in seq)

In [4]:
# Create a custom dataset class
class TransformerDataset(Dataset):
    """Map-style dataset of sliding next-token-prediction windows.

    Item ``i`` is the pair ``(x, y)`` where ``x = data[i : i + block_size]``
    and ``y = data[i + 1 : i + block_size + 1]``, i.e. ``y`` is ``x`` shifted
    one position to the left.

    Args:
        data: an indexable, sliceable sequence of token ids (a 1-D tensor in
            this notebook).
        block_size: number of tokens in each input window.
    """

    def __init__(self, data, block_size):
        self.data = data
        self.block_size = block_size

    def __len__(self):
        # A window needs block_size + 1 tokens. Clamp at zero so that data
        # shorter than that yields an empty dataset instead of a negative
        # length (which makes len() raise).
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, index):
        size = len(self)
        if index < 0:
            # Follow the usual sequence convention for negative indices.
            index += size
        if not 0 <= index < size:
            # Slicing never fails on its own, so without this check an
            # out-of-range index would silently return a truncated window and
            # plain iteration over the dataset would never terminate.
            raise IndexError("TransformerDataset index out of range")

        # Grab a chunk of data with block_size + 1 length
        chunk = self.data[index:index + self.block_size + 1]
        # Input sequence (x) is all but the last token in the chunk
        x = chunk[:-1]
        # Target sequence (y) is all but the first token in the chunk
        y = chunk[1:]
        return x, y

# Load the necessary data
class Dataloader():
    """Encodes a character corpus and serves train / valid / test DataLoaders.

    Args:
        Data: a corpus object exposing ``unique_chars``, ``vocab_size``,
            ``mapping`` (with ``'char_to_idx'`` / ``'idx_to_char'``), ``text``,
            ``convert_seq_to_indices`` and ``convert_indices_to_seq``.
            (The parameter name shadows the ``Data`` class inside ``__init__``;
            it is kept so existing keyword callers keep working.)

    Attributes:
        data: the whole corpus encoded as a 1-D tensor of token ids.
        train_split, valid_split, test_split: contiguous slices of ``data``.
    """

    def __init__(self, Data):
        self.chars = Data.unique_chars
        self.vocab_size = Data.vocab_size
        self.stoi = Data.mapping['char_to_idx']
        self.itos = Data.mapping['idx_to_char']

        # encoder: take a string, output a list of integers
        self.encode = Data.convert_seq_to_indices
        # decoder: take a list of integers, output a string
        self.decode = Data.convert_indices_to_seq

        self.data = torch.tensor(self.encode(Data.text))
        self.train_split, self.valid_split, self.test_split = self.create_data_split()

    def create_data_split(self, train = 0.8, valid = 0.1, test = 0.1):
        """Cut ``self.data`` into contiguous train / valid / test slices.

        The train and valid slices hold ``int(fraction * len(data))`` tokens;
        the test slice takes everything that is left, so no token is dropped
        to rounding.

        Raises:
            AssertionError: if the fractions do not sum to 1 or any split
                would be empty.
        """
        # Compare with a tolerance: exact float equality rejects perfectly
        # valid splits (0.7 + 0.2 + 0.1 evaluates to 0.9999999999999999).
        assert abs(train + valid + test - 1.0) < 1e-9, "Total Split Must Sum Up to 1"
        length_data = len(self.data)

        train_n = int(train * length_data)
        valid_n = int(valid * length_data)
        test_n = int(test * length_data)

        # Ensure that each data split has at least one sample
        assert train_n > 0, "Training split has zero elements"
        assert valid_n > 0, "Validation split has zero elements"
        assert test_n > 0, "Test split has zero elements"

        valid_end = train_n + valid_n
        train_data = self.data[:train_n]
        valid_data = self.data[train_n:valid_end]
        test_data = self.data[valid_end:]

        return train_data, valid_data, test_data

    def build_loader(self, split):
        """Return a DataLoader over the requested split.

        ``"train"`` and ``"valid"`` select those splits; any other value
        selects the test split. Only the training loader is shuffled. Window
        length and batch size come from the module-level ``block_size`` and
        ``batch_size``.
        """
        splits = {"train": self.train_split, "valid": self.valid_split}
        tokens = splits.get(split, self.test_split)
        dataset = TransformerDataset(tokens, block_size)

        return DataLoader(dataset, batch_size=batch_size, shuffle=(split == "train"))
        

torch.Size([32, 32, 64])
torch.Size([32, 32])
32


In [5]:
# A head of decoder self-attention
class AttentionHead(nn.Module):
    """One head of scaled dot-product self-attention with optional causal mask.

    Reads the module-level hyperparameters ``n_embd`` (input width),
    ``block_size`` (largest sequence length the causal mask covers) and
    ``dropout``.

    Args:
        head_size: output width of the query, key and value projections.
    """

    def __init__(self, head_size):
        super().__init__()
        self.head_size = head_size
        self.query = nn.Linear(n_embd, self.head_size)
        self.key = nn.Linear(n_embd, self.head_size)
        self.value = nn.Linear(n_embd, self.head_size)
        # Lower-triangular ones: entry (i, j) is 1 when position i may attend
        # to position j (j <= i). Stored as a buffer so it follows the module
        # across devices without being a trainable parameter.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask = True):
        """Attend over ``x``.

        Args:
            x: tensor of shape (B, T, n_embd).
            mask: when truthy, each position only attends to itself and to
                earlier positions (decoder behaviour).

        Returns:
            Tensor of shape (B, T, head_size).
        """
        T = x.shape[-2]
        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)

        # (B, T, head_size) @ (B, head_size, T) -> (B, T, T), scaled so the
        # scores do not grow with head_size and saturate the softmax.
        attn = q @ k.transpose(-2, -1) / math.sqrt(self.head_size)

        # Decoder-only model: future positions are hidden by setting their
        # scores to -inf, which the softmax turns into zero weight.
        if mask:
            attn = attn.masked_fill(self.tril[:T, :T] == 0, float('-inf'))

        attn = F.softmax(attn, dim = -1)
        attn = self.dropout(attn)

        v = self.value(x)  # (B, T, head_size)
        return attn @ v    # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)
    

# A head of Masked Multi Head Attention
class MaskedMultiHeadAttention(nn.Module):
    """Several causal ``AttentionHead``s run side by side, then projected.

    Reads the module-level hyperparameters ``n_embd`` and ``dropout``.

    Args:
        num_heads: number of independent attention heads.
        head_size: output width of each head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([AttentionHead(head_size) for _ in range(num_heads)])
        # The concatenated heads are num_heads * head_size wide. That equals
        # n_embd only when n_embd is divisible by the number of heads, so size
        # the projection from the real width rather than assuming n_embd.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map ``x`` of shape (B, T, n_embd) to a tensor of the same shape."""
        # Every head sees the same input; their outputs are stacked along the
        # feature dimension: (B, T, num_heads * head_size).
        out = torch.cat([h(x, mask = True) for h in self.heads], dim = -1)
        out = self.dropout(self.proj(out))
        return out

# Feedforward layer
class FeedFoward(nn.Module):
    """Two-layer MLP applied to the last dimension of its input.

    Expands ``n_embd`` features to ``4 * n_embd``, applies ReLU, projects
    back to ``n_embd`` and ends with dropout. Width and dropout probability
    come from the module-level ``n_embd`` and ``dropout``.
    """

    def __init__(self,):
        super().__init__()
        hidden_width = 4 * n_embd
        expand = nn.Linear(n_embd, hidden_width)
        contract = nn.Linear(hidden_width, n_embd)
        self.net = nn.Sequential(expand, nn.ReLU(), contract, nn.Dropout(dropout))

    def forward(self, x):
        """Run ``x`` through the MLP; the output has the same shape as ``x``."""
        mlp = self.net
        return mlp(x)

# Decoder-only Block
class DecoderBlock(nn.Module):
    """Pre-norm transformer block: masked self-attention, then feed-forward.

    Both sub-layers are wrapped as ``x + sublayer(layer_norm(x))``.

    Args:
        n_embd: feature width of the block's input and output.
        n_head: number of attention heads; each one gets
            ``n_embd // n_head`` features.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        per_head = n_embd // n_head
        self.sa = MaskedMultiHeadAttention(n_head, per_head)
        self.ffwd = FeedFoward()
        self.ln1, self.ln2 = nn.LayerNorm(n_embd), nn.LayerNorm(n_embd)

    def forward(self, x):
        """Apply the block to ``x`` and return a tensor of the same shape."""
        # Attention sub-layer with its residual connection
        attended = self.sa(self.ln1(x))
        x = x + attended
        # Feed-forward sub-layer with its residual connection
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

Intuitions and key notes:
## Self-Attention:
#### High Level Overview:
Self-attention is built from 3 main learned projections of the input (no pun intended on "key"):
- Key: describes what each token has to offer; queries are matched against the keys to decide which positions in the sequence are relevant. (Position information itself comes from the position embedding that is added to the input, not from the key projection.)
- Query: acts as a scanner or seeker that looks for a specific feature / aspect of the sequence; in terms of language, on a word level, it can be thought of as scanning for high-level concepts
- Value: contains the actual information to be attended to and combined, weighted by the relevance scores computed from the keys and queries

Essentially, this allows the self-attention block to capture long-term dependencies and relationships within the input.
#### Mathematical Intuition:
In PyTorch, the key, query and value projections are each a linear layer, normally with the same output dimension: the head size, i.e. the embedding dimension divided by the number of heads. First we matrix-multiply the queries with the transposed keys to get the relevance scores, which are then scaled down by the square root of the head size: $\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left(QK^\top / \sqrt{d_{head}}\right) V$. Why should it be scaled down? The variance of a dot product grows with the head size, so the unscaled scores can become very large in magnitude; feeding those into a softmax saturates it (the attention weights collapse to values very close to 0 or 1), which leads to vanishing gradients and makes it hard to learn meaningful patterns. Finally, we multiply the softmaxed weights with the values.

Regarding masking:
In a decoder transformer, a position at a given timestep can't look into the future, so before the softmax we replace the scores of future positions with -infinity (the softmax then gives them zero weight). An encoder transformer, by contrast, lets every position attend to every other position.

## Masked Multi Head Attention:
#### High Level Overview:
Can be thought of as a module holding a list of attention blocks, where each attention block is a separate "head" with its own key, query and value projections. Attention is computed independently in each head, so different heads can uncover different underlying patterns. The attended values of all heads are then concatenated along the last (feature) dimension, and a final projection layer is applied to produce the output of the multi-head attention layer.

## Residual Connection:
#### High Level Overview:
Can be thought of as the "skip-connection" that allows inputs to traverse deep within the network, this ensures that during the backpropagation, the gradients can flow through 2 parts, namely the layer itself and the skip connection

## Dropout:
#### High Level Overview:
Regularization technique used while training the model. During training it deactivates some of the neurons (units) in a layer by randomly setting them to zero, with the fraction given by the dropout hyperparameter. This random deactivation introduces noise into the network, which acts as a regularizer and helps prevent the model from overfitting to the data. In PyTorch, dropout layers are only switched off once the model is put in evaluation mode with `model.eval()` (for inference / evaluation / testing); call `model.train()` to switch them back on before training resumes.

## LayerNormalization:
#### High Level Overview:
Keeps activations at a consistent scale and distribution, which improves the training process and helps prevent issues like vanishing / exploding gradients. Layer normalization normalizes each row of the batch separately, along the feature dimension (dim = 1 if the input is 2D; the last dimension in this model): every feature in a row is normalized with that row's own mean and standard deviation, independently of the other rows in the batch.

## Feed Forward Layer:
#### High Level Overview:
Essentially just dense linear layers stacked on top of each other, with an activation layer (ReLU, tanh, ...) in between to introduce non-linearity, finished off with a dropout layer.

## Decoder Block:
#### High Level Overview:
You put everything together based on the concepts outlined above:
input -> layer norm -> multi-head attention -> residual -> layernorm -> feedforward -> residual -> output

In [6]:
# Decoder Only Transformer
class DecoderOnlyTransformer(nn.Module):
    """Decoder-only transformer language model over integer token ids.

    Token embeddings and learned position embeddings are summed, passed
    through ``n_layer`` ``DecoderBlock``s and a final LayerNorm, then
    projected to vocabulary logits. Reads the module-level hyperparameters
    ``n_embd``, ``n_head``, ``n_layer`` and ``block_size``.

    Args:
        data: any object exposing an integer ``vocab_size`` attribute.
    """

    def __init__(self, data):
        super().__init__()
        self.token_embedding_table = nn.Embedding(data.vocab_size, n_embd)
        # Learned positional embedding: one vector per position < block_size
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[DecoderBlock(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, data.vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the training loss.

        Args:
            idx: (B, T) integer tensor of token ids, with T <= block_size.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` is
            (B, T, vocab_size) and ``loss`` is ``None``. With targets,
            ``logits`` is flattened to (B*T, vocab_size) and ``loss`` is the
            mean cross-entropy.
        """
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        # Build the position indices on the same device as the input, so the
        # model does not depend on a global `device` matching where it lives.
        positions = torch.arange(T, device=idx.device) # integers 0 .. T-1
        pos_emb = self.position_embedding_table(positions) # (T,C)
        x = tok_emb + pos_emb # (B,T,C), position embedding broadcast over the batch
        x = self.blocks(x) # (B,T,C), C = n_embd
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            return logits, None

        # F.cross_entropy expects (N, classes) logits and (N,) targets
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` tokens autoregressively.

        Runs without gradient tracking. The train/eval mode of the model is
        left as the caller set it, so call ``model.eval()`` first to sample
        without dropout.

        Args:
            idx: (B, T) integer tensor holding the current context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            (B, T + max_new_tokens) integer tensor: context plus samples.
        """
        for _ in range(max_new_tokens):
            # Crop to the last block_size tokens: the position embedding table
            # has no entries for longer sequences, so indexing would fail.
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx




In [7]:

def training_loop(model, dataLoader, optimizer):
    """Train ``model`` for ``max_iters`` epochs, validating after each one.

    Every epoch is one full pass over the training loader, followed by a full
    pass over the validation loader and a printed 100-token sample generated
    from a single zero token. Reads the module-level ``max_iters`` and
    ``device``.

    Args:
        model: module whose ``forward(x, y)`` returns ``(logits, loss)`` and
            that provides ``generate(idx, max_new_tokens)``.
        dataLoader: object providing ``build_loader(split)`` and
            ``decode(list_of_ids)``.
        optimizer: optimizer over ``model``'s parameters.

    Returns:
        ``(losses, valids)``: per-epoch average training and validation loss.
    """
    train_loader = dataLoader.build_loader("train")
    valid_loader = dataLoader.build_loader("valid")

    losses = []
    valids = []

    for i in range(max_iters):
        # Training Phase
        # Re-enter training mode every epoch: the validation phase below
        # switches to eval mode, and without this call dropout would stay
        # disabled from the second epoch onwards.
        model.train()
        loss_i = 0
        for x, y in train_loader:
            x = x.to(device)
            y = y.to(device)

            # Forward Pass
            _, loss = model(x, y)
            loss_i += loss.item()
            # Backward Pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        avg_train_loss = loss_i / len(train_loader)
        losses.append(avg_train_loss)
        print(f'Epoch {i} training complete, Average Training Loss: {avg_train_loss}')

        # Validation Phase
        model.eval()
        valid_i = 0
        with torch.no_grad():
            for x_v, y_v in valid_loader:
                x_v = x_v.to(device)
                y_v = y_v.to(device)
                _, loss = model(x_v, y_v)
                valid_i += loss.item()
            avg_valid_loss = valid_i / len(valid_loader)
            valids.append(avg_valid_loss)
            print(f'Iteration {i}, Average validation loss: {avg_valid_loss}')

        # Qualitative check: sample while the model is still in eval mode
        context = torch.zeros((1, 1), dtype=torch.long, device=device)
        print(dataLoader.decode(model.generate(context, max_new_tokens=100)[0].tolist()))
    return losses, valids




tensor([[14, 23, 12,  ..., 44, 57, 44],
        [57,  1, 62,  ..., 41, 51, 44],
        [ 0,  0, 17,  ..., 47,  5,  1],
        ...,
        [45, 54, 57,  ..., 43,  1, 46],
        [40, 59,  1,  ...,  1, 40, 57],
        [54, 52, 44,  ..., 59, 47, 44]])
tensor([[23, 12, 32,  ..., 57, 44,  1],
        [ 1, 62, 48,  ..., 51, 44, 43],
        [ 0, 17, 16,  ...,  5,  1, 41],
        ...,
        [54, 57,  1,  ...,  1, 46, 48],
        [59,  1, 40,  ..., 40, 57, 44],
        [52, 44,  1,  ..., 47, 44, 64]])


NameError: name 'vocab_size' is not defined

In [11]:
# Build the corpus, the loaders and the model
data = Data('/kaggle/input/text-data/data/shakespeare.txt')
dataLoader = Dataloader(data)
model = DecoderOnlyTransformer(data).to(device)

# Train; `l` and `v` are the per-epoch training and validation losses
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)
l, v = training_loop(model, dataLoader, optimizer)

# Persist the weights, then print a 2000-token sample seeded with token 0
torch.save(model.state_dict(), 'transformer_model.pth')
context = torch.zeros((1, 1), dtype=torch.long, device=device)
sample_ids = model.generate(context, max_new_tokens=2000)[0].tolist()
print(dataLoader.decode(sample_ids))

95027