# LLMs-ZERO-to-HERO: NANO_GPT

### I. Imports

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from dataclasses import dataclass
import math

torch.manual_seed(1024)

<torch._C.Generator at 0x1bc3bca7470>

### II. Define the hyperparameters of GPT

In [2]:
@dataclass
class GPTConfig: 
    block_size: int = 512 # max sequence
    batch_size: int = 12
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768   # hidden_dim, hidden_size; same as emb_szie for tie_embedding_weight
    hidden_dim: int = n_embd
    dropout: float = 0.1
    head_size: int = n_embd // n_head
    # vocab_size
    # gpt2 official tokenizer
    vocab_size: int = 50257

### III. Define the structure of GPT

In [3]:
# 1. single head attention
class SingleHeadAttention(nn.Module): 
    def __init__(self, config): 
        super().__init__()
        self.key = nn.Linear(config.n_embd, config.head_size)
        self.value = nn.Linear(config.hidden_dim, config.head_size)
        self.query = nn.Linear(config.hidden_dim, config.head_size)
        self.head_size = config.head_size


        # use register_buffer to register attention_mask
        # it can save the memory and RAM, as no need to calculate the ** gradients ** 
        self.register_buffer(
            "attention_mask",
            # "tril" means lower triangle
            # block_size is 512
            torch.tril(
                torch.ones(config.block_size, config.block_size)
            )
        )
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x): 
        batch_size, seq_len, hidden_dim = x.size()
        # using the head size to calculate the similarity
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)
        weight = q @ k.transpose(-2, -1)    # @ is short for torch.matmul
        weight = weight.masked_fill(
            self.attention_mask[:seq_len, :seq_len] == 0,
            float('-inf')
        )
        # divide sqrt(d_k) when calcualting weight
        weight = F.softmax(weight, dim=-1) / math.sqrt(self.head_size)

        # dropout need to be after weight
        # since dropout will drop the attention weight
        weight = self.dropout(weight)
        output = weight @ v
        return output
    
# 2. multi head attention
class MultiHeadAttention(nn.Module): 
    def  __init__(self, config) -> None:
        super().__init__()
        self.heads = nn.ModuleList(
            [
                SingleHeadAttention(config)
                for _ in range(config.n_head)
            ]
        )
        self.proj = nn.Linear(config.hidden_dim, config.hidden_dim)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x): 
        output = torch.cat(
            [h(x) for h in self.heads],
            dim=-1
        )
        output = self.proj(output)
        output = self.dropout(output)
        return output

# 3. feed forward (MLP)
class FeedForward(nn.Module): 
    def __init__(self, config):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(config.hidden_dim, 4 * config.hidden_dim), # swiglu # 8/3
            nn.GELU(), # activation layer
            nn.Linear(4 * config.hidden_dim, config.hidden_dim), 
            nn.Dropout(config.dropout)
        )

    def forward(self, x):
        return self.net(x)
    
# 4. block
class Block(nn.Module): 
    def __init__(self, config): 
        super().__init__()
        self.att = MultiHeadAttention(config)
        self.ffn = FeedForward(config)
        self.ln1 = nn.LayerNorm(config.hidden_dim)
        self.ln2 = nn.LayerNorm(config.hidden_dim)

    def forward(self, x): 
        x = x + self.att(self.ln1(x))
        x = x + self.ffn(self.ln2(x))
        return x 
    

# 5. GPT
class GPT(nn.Module): 
    def __init__(self, config): 
        super().__init__()
        # (embedding, position, norm, mlp, block )
        # position embdeeing from 0, 1, ... embedding -> ROPE
        # norm: layer  norm -> rms norm
        # mlp -> swiglu
        # mha -> gqa
        self.block_size = config.block_size
        self.token_embedding_table = nn.Embedding(config.vocab_size, config.n_embd)
        self.position_embedding_table = nn.Embedding(config.block_size, config.n_embd)
        self.blocks = nn.Sequential(
            *[Block(config) for _ in range(config.n_layer)]
        )
        self.ln_final = nn.LayerNorm(config.n_embd)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) # no bias as going to get softmax 
        # SLM model nowadays will use tie_weight to reduce parameters

        # **important!!**
        # in a small model, if the there are too many parameters related to the embedding
        # the less knowledge the model would learn
        # so we reduce the parameters related to embedding by tie_weight
        # why it can be tied?
        # linear(4 -> 8), the shape of weight is 8 * 4
        self.token_embedding_table.weight = self.lm_head.weight

    def __init__weights(self, module): # init like Gaussian Distribution
        if isinstance(module, nn.Lienar): 
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None: 
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding): 
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None): 
        # idx: token ids
        # target: target token ids
        # --> shape of idx and target should be the same
        batch, seq_len = idx.size() # (batch, seq_len)
        token_emb = self.token_embedding_table(idx) # (batch, seq_len, n_embd)
        pos_emb = self.position_embedding_table(
            # ensure that the position encoding and the input idx to be on the 
            # same device
            # why?
            # we want to learn the position embedding table
            # arrange would start from 0 to the seq_len
            torch.arange(seq_len, device=idx.device)
        )

        # question: why we can sum up token embedding and position embedding? 

        x = token_emb + pos_emb     # shape is (batch, seq_len, n_embd)
        x = self.blocks(x)
        x = self.ln_final(x)
        logits = self.lm_head(x)    # shape is (batch, seq_len, vocab_size)
        if targets is None: 
            loss = None
        else: 
            batch, seq_len, vocab_size = logits.size()
            logits = logits.view(batch * seq_len, vocab_size)
            targets = targets.view(batch * seq_len)
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    
    def generate(self, idx, max_new_tokens): 

        # idx shape (batch, seq_len)
        for _ in range(max_new_tokens): 
            idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]
            logits, _ = self(idx_cond)
            
            # shape: (batch, seq_len, vocab_size)
            logits = logits[:, -1, :] # take the last one since tokens before that are input tokens

            probs = F.softmax(logits, dim=-1)
            # random sample
            idx_next = torch.multinomial(probs, num_samples=1)
            
            # concat the new token with the prompt
            idx = torch.cat((idx, idx_next), dim=1) # shape: (batch, seq_len + 1)
        return idx


        pass #TODO




### IV. Construct the input dataset

##### Understand what the input be like (Most Important)

In [4]:
class MyDataset(Dataset): 
    def __init__(self, path, block_size=512): 
        # read the first 1000 lines 
        import tiktoken
        self.enc = tiktoken.get_encoding("gpt2")
        self.block_size = block_size # pos max_len

        self.encoded_data = []
        # using special character to split different training text
        # GPT's: <|endoftext|>
        self.eos_token = self.enc.encode(
            "<|endoftext|>", 
            allowed_special={"<|endoftext|>"}
        )[0]
        
        self.max_lines = 1000

        # Deserialization
        import json

        # processin the raw data to pkl or numpy format
        # then load to the GPU memory

        raw_data = []
        with open(path, 'r', encoding='utf-8', errors='ignore') as f: 
            for i, line in enumerate(f): 
                if i >= self.max_lines: 
                    break
                try: 
                    text = json.loads(line.strip())["text"]
                    raw_data.append(text)
                except Exception as e: 
                    continue
                        
        full_encoded = []
        for text in raw_data: 
            # combine all the text
            # and divide them by the 
            # special character
            encoded_text = self.enc.encode(text)
            full_encoded.extend(encoded_text + [self.eos_token])
        
        # block_size = 512
        # split the long text -> short (512)
        for i in range(0, len(full_encoded), self.block_size):
            chunk = full_encoded[i:i+self.block_size] # 512 # actually 513 for each line
            if len(chunk) < self.block_size + 1: 
                # you can also drop it if it is not a multiple of 512
                chunk = chunk + [self.eos_token] * (self.block_size + 1 - len(chunk))
            self.encoded_data.append(chunk)


    def __len__(self): 
        return len(self.encoded_data)
    
    def __getitem__(self, idx): 
        chunk = self.encoded_data[idx]
        x = torch.tensor(chunk[:-1], dtype=torch.long)
        y = torch.tensor(chunk[1:], dtype=torch.long)
        return x, y
    
    def encode(self, text): 
        """encode the text to token IDs"""
        return self.enc.encode(text)
    
    def decod(self, ids): 
        """decode the token IDs to text"""
        return self.enc.decode(ids)






### V. Run the functions

In [5]:
# import chardet

# with open('openwebtext.jsonl', 'rb') as f:
#     raw_data = f.read(10000)  # Read first 10KB to detect encoding
#     result = chardet.detect(raw_data)
#     print(result)

In [6]:
# from datasets import load_dataset
# import json

# dataset_stream = load_dataset("openwebtext", split="train", streaming=True, trust_remote_code=True)

# with open("./openwebtext.jsonl", "w", encoding="utf-8", errors='ignore') as f: 

#     for sample in dataset_stream:
#         text_data = sample["text"].replace("\n", "\\n")
#         record = {"text": text_data}
#         line_json = json.dumps(record, ensure_ascii=False)
#         f.write(line_json + "\n")

        # print(sample["text"])

In [7]:
# train data
trainig_data_set_path = 'openwebtext.jsonl'
train_dataset = MyDataset(trainig_data_set_path)

# split traindataset to train and val
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [0.9, 0.1])

train_loader = DataLoader(train_dataset, batch_size=12, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=12, shuffle=False)

In [8]:
model = GPT(GPTConfig())
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# print the parameters of the model

total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params / 1e6} M")

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
# 设置cosine 学习率
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=1000)

Total parameters: 124.046592 M


In [9]:
# training loop
def train(model, optimizer, scheduler, train_loader, val_loader, device, epoch): 
    model.train()
    total_loss = 0
    for batch_idx, (x, y) in enumerate(train_loader): 
        # move the data to the device
        x, y = x.to(device), y.to(device)

        # forward propagation 
        logits, loss = model(x, targets=y)

        # back propagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # adjust the learning rate
        scheduler.step()

        total_loss += loss.item()

        if batch_idx % 100 == 0: 
            print(f'Epoch: {epoch}, Batch: {batch_idx}, Loss: {loss.item():.4f}')
        return total_loss
    

def eval(model, val_loader, device): 
    # prove/evaluation
    model.eval()
    val_loss = 0
    with torch.no_grad(): 
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            logits, loss = model(x, targets=y)
            val_loss += loss.item()
    return val_loss


val_losses = []

for epoch in range(1000): 
    train_loss = train(model, optimizer, scheduler, train_loader, val_loader, device, epoch)
    val_loss = eval(model, val_loader, device)
    print(f'Epoch: {epoch}, Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}')

    # save model
    avg_val_loss = val_loss / len(val_loader)
    checkpoint = {
        'epoch': epoch, 
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(), 
        'scheduler_state_dict': scheduler.state_dict(),
        'val_loss': avg_val_loss,
    }

    val_losses.append(avg_val_loss)
    # save model of each epoch
    if (epoch + 1) % 1000 == 0:
        torch.save(checkpoint, f'checkpoints3/model_epoch_{epoch+1}.pt')


Epoch: 0, Batch: 0, Loss: 10.9832
Epoch: 0, Train Loss: 0.0642, Val Loss: 10.0863
Epoch: 1, Batch: 0, Loss: 10.0932
Epoch: 1, Train Loss: 0.0590, Val Loss: 9.6039
Epoch: 2, Batch: 0, Loss: 9.6466
Epoch: 2, Train Loss: 0.0564, Val Loss: 9.3389
Epoch: 3, Batch: 0, Loss: 9.3372
Epoch: 3, Train Loss: 0.0546, Val Loss: 9.1214
Epoch: 4, Batch: 0, Loss: 9.0803
Epoch: 4, Train Loss: 0.0531, Val Loss: 8.9191
Epoch: 5, Batch: 0, Loss: 8.9432
Epoch: 5, Train Loss: 0.0523, Val Loss: 8.7255
Epoch: 6, Batch: 0, Loss: 8.6562
Epoch: 6, Train Loss: 0.0506, Val Loss: 8.5424
Epoch: 7, Batch: 0, Loss: 8.5622
Epoch: 7, Train Loss: 0.0501, Val Loss: 8.3758
Epoch: 8, Batch: 0, Loss: 8.7010
Epoch: 8, Train Loss: 0.0509, Val Loss: 8.2269
Epoch: 9, Batch: 0, Loss: 8.2840
Epoch: 9, Train Loss: 0.0484, Val Loss: 8.0938
Epoch: 10, Batch: 0, Loss: 8.1063
Epoch: 10, Train Loss: 0.0474, Val Loss: 7.9786
Epoch: 11, Batch: 0, Loss: 8.0692
Epoch: 11, Train Loss: 0.0472, Val Loss: 7.8826
Epoch: 12, Batch: 0, Loss: 7.9318

In [10]:
# ... existing code ...

# Load your trained model checkpoint if needed
checkpoint = torch.load('checkpoints3/model_epoch_1000.pt')
model.load_state_dict(checkpoint['model_state_dict'])

train_dataset = MyDataset(trainig_data_set_path)

# Prepare the input
prompt = "Once upon a time"  # Your starting text
encoded_prompt = train_dataset.encode(prompt)  # Encode the prompt using your dataset's encoder

# Convert to tensor and add batch dimension
input_ids = torch.tensor([encoded_prompt], dtype=torch.long).to(device)

# Generate text
generated_ids = model.generate(input_ids, max_new_tokens=100)  # Generate 100 new tokens

# Decode the generated tokens
generated_text = train_dataset.decod(generated_ids[0].tolist())  # Remove batch dimension and decode

print(generated_text)

  checkpoint = torch.load('checkpoints3/model_epoch_1000.pt')


Once upon a time, adhereaddle of many. (lein effectively's his problemging stage to the mid, we theGF copies took were but anyone for my state. Bou".[ Bitcoin speech."�M's their band from faced was going.” 4 beyond says media are the Poly to weaker experiments, the existing this 184. I as 300iam of the BARs beyond pass, W unchanged will make nuclear back nevertheless or this Laur: way to much investigation. He evidence to the logo mindset to Erit assault the
