In [1]:
import os
import torch
import tiktoken
from glob import glob
import torch.nn as nn
from torch.nn import functional as F, Embedding
from torch.utils.data import Dataset, DataLoader

In [2]:
class Config:
    device = "cuda:0"
    n_vocab = 50257
    d_model = 768
    max_length = 256
    stride = 256
    qkv_bias = False
    embedding_dropout = 0
    num_head = 16
    multihead_attention_dropout = 0
    num_of_stacked_transformer_blocks = 24

In [3]:
text_path = glob("/kaggle/input/text-data"+"/**.txt")[0]

with open(text_path, "r") as text_file:
    text_file_read = text_file.read()

In [4]:
tokenizer = tiktoken.get_encoding(encoding_name="gpt2")

In [5]:
class GPTDataset(Dataset):
    def __init__(self, text, tokenizer, max_length, stride):
        self.X_out, self.y_out = [], []
        self.token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
        assert len(self.token_ids)>max_length, f"Length of Token IDs {len(self.token_ids)} not greater than Max length {max_length}"

        for idx in range(0, len(self.token_ids)-max_length, stride):
            X, y = self.token_ids[idx:idx+max_length], self.token_ids[idx+1:idx+max_length+1]
            self.X_out.append(X)
            self.y_out.append(y)
        
    def __len__(self):
        return len(self.X_out)

    def __getitem__(self, idx):
        return torch.tensor(self.X_out[idx]), torch.tensor(self.y_out[idx])

def CreateDataLoader(data, tokenizer, cfg, batch_size, shuffle, drop_last, num_workers):
    dataset = GPTDataset(text=data, tokenizer=tokenizer, max_length=cfg.max_length, stride=cfg.stride)
    data_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)
    return data_loader

In [6]:
train_size = .9
total_tokens = len(text_file_read)
train_ratio = int(train_size * total_tokens)
train_text = text_file_read[:train_ratio]
val_text = text_file_read[train_ratio:]

print("Train Length:", len(train_text), ", Val Length:", len(val_text))

Train Length: 18431 , Val Length: 2048


In [7]:
train_dl = CreateDataLoader(train_text, tokenizer, Config, 2, True, True, os.cpu_count())
val_dl = CreateDataLoader(val_text, tokenizer, Config, 2, False, False, os.cpu_count())

In [8]:
class MultiHeadAttention(nn.Module):
    def __init__(self, cfg):
        super(MultiHeadAttention, self).__init__()
        self.cfg = cfg
        assert cfg.d_model % cfg.num_head == 0, "cfg.d_model % cfg.num_head != 0"
        self.d_k = cfg.d_model // cfg.num_head
        # Attentions Projections
        self.W_q = nn.Linear(cfg.d_model, cfg.d_model, bias=cfg.qkv_bias)
        self.W_k = nn.Linear(cfg.d_model, cfg.d_model, bias=cfg.qkv_bias)
        self.W_v = nn.Linear(cfg.d_model, cfg.d_model, bias=cfg.qkv_bias)
        self.W_o = nn.Linear(cfg.d_model, cfg.d_model, bias=cfg.qkv_bias)
        # Dropout
        self.dropout = nn.Dropout(p=cfg.multihead_attention_dropout)

    def forward(self, x):
        batch_size, seq, d_model = x.size()
        q = self.W_q(x).view(batch_size, seq, self.cfg.num_head, self.d_k).transpose(1, 2)
        k = self.W_k(x).view(batch_size, seq, self.cfg.num_head, self.d_k).transpose(1, 2)
        v = self.W_v(x).view(batch_size, seq, self.cfg.num_head, self.d_k).transpose(1, 2)

        mask = torch.tril(torch.ones(seq, seq, device=x.device)).unsqueeze(0).unsqueeze(0)
        attention_score = q @ k.transpose(-2, -1) / torch.sqrt(torch.tensor(self.d_k))
        attention = self.dropout(F.softmax(attention_score.masked_fill(mask==0, -torch.inf), dim=-1)) @ v # dropout(softmax(attention score  + mask)) @ v, dim=(batch_size, num_head, seq_len, d_k)
        output = attention.transpose(1, 2).contiguous().view(batch_size, seq, d_model) # (batch_size, seq, d_model)
        return self.W_o(output) # (batch_size, seq, d_model)

class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) *
            (x + 0.044715 * (x**3)))
        )

class FeedForward(nn.Module):
    def __init__(self, cfg):
        super(FeedForward, self).__init__()
        self.cfg = cfg
        self.ff = nn.Sequential(
            nn.Linear(self.cfg.d_model, 4*self.cfg.d_model),
            GELU(),
            nn.Linear(4*self.cfg.d_model, self.cfg.d_model)
        )

    def forward(self, x):
        return self.ff(x)

class LayerNormalization(nn.Module):
    def __init__(self, cfg):
        super(LayerNormalization, self).__init__()
        self.eps = 1e-5
        self.Weight = nn.Parameter(torch.ones(cfg.d_model))
        self.bias = nn.Parameter(torch.zeros(cfg.d_model))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        return self.Weight * (x - mean / torch.sqrt(variance+self.eps)) + self.bias

class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super(TransformerBlock, self).__init__()
        self.multihead_attention = MultiHeadAttention(cfg)
        self.layer_norm_1 = LayerNormalization(cfg)
        self.layer_norm_2 = LayerNormalization(cfg)
        self.dropout = nn.Dropout(p=cfg.multihead_attention_dropout)
        self.feedforward = FeedForward(cfg)

    def forward(self, x):
        shortcut = x
        x = self.layer_norm_1(x)
        x = self.multihead_attention(x)
        x = self.dropout(x)
        x = x + shortcut

        shortcut = x
        x = self.layer_norm_2(x)
        x = self.feedforward(x)
        x = self.dropout(x)
        x = x + shortcut
        return x # (batch_size, seq, d_model)

In [9]:
class GptModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        
        # Embeddings
        self.wrd_embeddings = nn.Embedding(cfg.n_vocab, cfg.d_model)
        self.pos_embeddings = nn.Embedding(cfg.max_length, cfg.d_model)
        self.embedding_dropout = nn.Dropout(p=cfg.embedding_dropout)

        # Stacked Multihead Attention
        # nn.Sequential([Block1, Block2, Block3])  # ❌ wrong thats why the * so that it will unpack the list into nn.Sequential(Block1, Block2, Block3)
        self.stack_mha = nn.Sequential(*[TransformerBlock(cfg) for _ in range(cfg.num_of_stacked_transformer_blocks)])

        # Final layer norm
        self.final_layer_norm = LayerNormalization(cfg)

        # Output layer
        self.output_layer = nn.Linear(cfg.d_model, cfg.n_vocab)

    def forward(self, x):
        batch, seq = x.shape
        wrd_embed = self.wrd_embeddings(x)
        positions = torch.arange(seq, device=x.device).unsqueeze(0).expand(batch, -1) # (seq,) -> (1, seq) -> (batch, seq)
        pos_embed = self.pos_embeddings(positions)
        xout = self.embedding_dropout(wrd_embed + pos_embed)
        xout = self.stack_mha(xout) # (batch_size, seq, d_model)
        xout = self.final_layer_norm(xout) # (batch_size, seq, d_model)
        return self.output_layer(xout) # (batch_size, seq, n_vocabs)

In [10]:
gpt_model = GptModel(cfg=Config)

In [11]:
def GenerateText(model, token_ids, max_new_tokens, context_size, tokenizer):
    for _ in range(max_new_tokens):
        tkn_ids = token_ids[:, -context_size:]
        with torch.no_grad():
            logits = model(tkn_ids) # (batch_size, seq, n_vocabs)
        last_token_pred = F.softmax(logits[:, -1, :], dim=-1) # (batch_size, seq, n_vocabs)
        max_prob = torch.argmax(last_token_pred, dim=-1, keepdim=True) 
        token_ids = torch.cat((token_ids, max_prob), dim=1)
    return tokenizer.decode(token_ids.tolist()[0])

In [12]:
input_txt = "Hello my name is Eddy"
tkn_id = torch.tensor(tokenizer.encode(input_txt)).unsqueeze(0).to("cuda:0")

In [13]:
model_output = GenerateText(model=gpt_model.to("cuda:0"), token_ids=tkn_id, max_new_tokens=50, context_size=50, tokenizer=tokenizer)

model_output

'Hello my name is Eddy $_ $_ Purpose Human NFCukedOrg professionalism $_ cognitionWINDOWS Zhu July chilly unions thermal youngrank frontman $_uits Purposeamia professionalism Fischer Fischer Riverside fins frontman UNDER Fischeruncle chilly AnswersWINDOWS Alph mailbox Purpose brazenrank professionalismOrg orbital Purpose Fischer thermal Honest NarendraemnESA'

In [14]:
class Trainer:
    def __init__(self, model, train_dl, val_dl, optimizer, device):
        self.model = model.to(device)
        self.optimizer = optimizer
        self.device = device
        self.train_dl = train_dl
        self.val_dl = val_dl

    def train(self, train_dl):
        total_loss = 0
        self.model.train()
        for x, y in train_dl:
            self.optimizer.zero_grad()
            x, y = x.to(self.device), y.to(self.device)
            logits = self.model(x) # (batch_size, seq_len, n_vocabs)
            loss = F.cross_entropy(logits.flatten(0, 1), y.flatten()) #logits.flatten(0, 1) -> (batch_size*seq_len, n_vocabs), y.flatten() -> (batch_size*seq_len,)
            loss.backward()
            self.optimizer.step()
            total_loss+=loss.item()
        return total_loss

    def eval(self, val_dl):
        total_loss = 0
        self.model.eval()
        for x, y in val_dl:
            with torch.no_grad():
                x, y = x.to(self.device), y.to(self.device)
                logits = self.model(x)
                loss = F.cross_entropy(logits.flatten(0, 1), y.flatten())
                total_loss+=loss.item()
        return total_loss

    def fit(self, n_epochs):
        display_models_output = 5
        for epoch in range(n_epochs):
            train_loss = self.train(self.train_dl)
            eval_loss = self.eval(self.val_dl)
    
            print(f"Epoch {epoch+1}/{n_epochs} -- Train loss: {train_loss:.4f} -- Val loss: {eval_loss:.4f}")
    
            if (epoch + 1) % display_models_output == 0:
                input_txt = "I HAD always thought Jack Gisburn rather"
                tkn_id = torch.tensor(tokenizer.encode(input_txt)).unsqueeze(0).to(self.device)
                output = GenerateText(
                    model=self.model,
                    token_ids=tkn_id,
                    max_new_tokens=30,
                    context_size=30,
                    tokenizer=tokenizer
                )
                print(output)

In [15]:
optimizer = torch.optim.AdamW(params=gpt_model.parameters(), lr=4e-5, weight_decay=.1)
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [16]:
trainer = Trainer(model=gpt_model, train_dl=train_dl, val_dl=val_dl, optimizer=optimizer, device=device)

In [17]:
trainer.fit(n_epochs=100)

Epoch 1/100 -- Train loss: 114.9098 -- Val loss: 11.1948
Epoch 2/100 -- Train loss: 94.0959 -- Val loss: 10.6217
Epoch 3/100 -- Train loss: 78.1829 -- Val loss: 9.8181
Epoch 4/100 -- Train loss: 64.6679 -- Val loss: 8.9218
Epoch 5/100 -- Train loss: 51.7605 -- Val loss: 8.3731
I HAD always thought Jack Gisburn rather think so Mrs- his pictures isI- not- inrah- inrah- in his pictures it and my that, and I asI I
Epoch 6/100 -- Train loss: 41.1645 -- Val loss: 8.1469
Epoch 7/100 -- Train loss: 30.3654 -- Val loss: 7.9971
Epoch 8/100 -- Train loss: 21.6356 -- Val loss: 8.0106
Epoch 9/100 -- Train loss: 14.7735 -- Val loss: 7.7961
Epoch 10/100 -- Train loss: 9.5650 -- Val loss: 7.9010
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that he was to put it to those such donkey
Epoch 11/100 -- Train loss: 4.8632 -- Val loss: 7.9052
Epoch 12/100 -- Train loss: 2.1169 -- Val loss: 7.9797
Epoch 13/100 -- Train loss: 0.