### pretraining script

In [None]:
# from datasets import load_dataset, DatasetDict
# from tqdm.auto import tqdm

# print("Loading the BookCorpus dataset...")

# # Load dataset in batches with progress bar
# batch_size = 1000000  # Load 1 million rows at a time
# dataset = load_dataset("SamuelYang/bookcorpus")
# total_rows = len(dataset['train'])

# print(f"Total rows in dataset: {total_rows}")

# # Initialize empty lists to store batches
# all_data = []

# # Process dataset in batches with progress bar
# for start_idx in tqdm(range(0, total_rows, batch_size), desc="Loading batches"):
#     end_idx = min(start_idx + batch_size, total_rows)
#     batch = dataset['train'].select(range(start_idx, end_idx))
#     all_data.extend(batch['text'])

# # Create final dataset
# dataset = DatasetDict({
#     'train': dataset['train'].select(range(total_rows))
# })

# print("\nDataset structure:")
# print(dataset)

# print("\nFirst example:")
# print(dataset['train'][0])

# print("\nComplete dataset loaded successfully.")

  from .autonotebook import tqdm as notebook_tqdm


Loading the BookCorpus dataset...
Total rows in dataset: 74004228


Loading batches: 100%|██████████| 75/75 [08:32<00:00,  6.83s/it]



Dataset structure:
DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 74004228
    })
})

First example:
{'text': 'the half-ling book one in the fall of igneeria series kaylee soderburg copyright 2013 kaylee soderburg all rights reserved .'}

Complete dataset loaded successfully.


loading the first 1cr rows

In [None]:
from datasets import load_dataset, DatasetDict
from tqdm.auto import tqdm

print("Loading the BookCorpus dataset...")

# Load dataset in batches with progress bar
batch_size = 100000  # Load 1 million rows at a time
dataset = load_dataset("SamuelYang/bookcorpus")
total_rows = min(1000000, len(dataset['train']))  # Limit to 1cr rows

print(f"Total rows in dataset: {total_rows}")

# Initialize empty lists to store batches
all_data = []

# Process dataset in batches with progress bar
for start_idx in tqdm(range(0, total_rows, batch_size), desc="Loading batches"):
    end_idx = min(start_idx + batch_size, total_rows)
    batch = dataset['train'].select(range(start_idx, end_idx))
    all_data.extend(batch['text'])

# Create final dataset with only 1cr rows
dataset = DatasetDict({
    'train': dataset['train'].select(range(total_rows))
})

print("\nDataset structure:")
print(dataset)

print("\nFirst example:")
print(dataset['train'][0])

print("\nComplete dataset loaded successfully.")

  from .autonotebook import tqdm as notebook_tqdm


Loading the BookCorpus dataset...


In [3]:
# !pip install transformers datasets numpy tqdm ml_collections

In [None]:
import os

# Set a new cache directory on your H: drive
new_cache_dir = "H:\SuperTails\huggingface_cache" 
os.makedirs(new_cache_dir, exist_ok=True) # Create the directory if it doesn't exist

# Set the environment variables
os.environ['HF_HOME'] = new_cache_dir
os.environ['HF_DATASETS_CACHE'] = os.path.join(new_cache_dir, 'datasets')
os.environ['TRANSFORMERS_CACHE'] = os.path.join(new_cache_dir, 'transformers')



import os
import math
import time
import inspect
from dataclasses import dataclass
from contextlib import nullcontext

import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm

from datasets import load_dataset, DatasetDict
from transformers import GPT2Tokenizer

# --- 1. Configuration ---

class SimpleConfig:
    # I/O
    out_dir = 'out'
    eval_interval = 200
    log_interval = 1
    eval_iters = 100
    always_save_checkpoint = True

    # Data
    dataset = 'SamuelYang/bookcorpus'
    gradient_accumulation_steps = 4
    batch_size = 12
    block_size = 256 # context of up to 256 previous characters

    # Model
    n_layer = 6
    n_head = 6
    n_embd = 384
    dropout = 0.2
    bias = False

    # AdamW optimizer
    learning_rate = 1e-3
    max_iters = 2000
    weight_decay = 1e-1
    beta1 = 0.9
    beta2 = 0.95
    grad_clip = 1.0

    # Learning rate decay settings
    decay_lr = True
    warmup_iters = 100
    lr_decay_iters = 2000
    min_lr = 1e-4

# --- 2. Model Definition ---

@dataclass
class ModelConfig:
    block_size: int = 1024
    vocab_size: int = 50257
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768
    dropout: float = 0.0
    bias: bool = True

class LayerNorm(nn.Module):
    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, input):
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)

class SelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                                        .view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size()
        q, k, v  = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        if self.flash:
            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)
        else:
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.resid_dropout(self.c_proj(y))
        return y

class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
        self.gelu    = nn.GELU()
        self.c_proj  = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x

class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
        self.attn = SelfAttention(config)
        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

class Pet_SLM(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = LayerNorm(config.n_embd, bias=config.bias),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.transformer.wte.weight = self.lm_head.weight
        self.apply(self._init_weights)
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))

    def get_num_params(self, non_embedding=True):
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.wpe.weight.numel()
        return n_params

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device)

        tok_emb = self.transformer.wte(idx)
        pos_emb = self.transformer.wpe(pos)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            logits = self.lm_head(x[:, [-1], :])
            loss = None

        return logits, loss

    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        param_dict = {pn: p for pn, p in self.named_parameters()}
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")

        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
        print(f"using fused AdamW: {use_fused}")

        return optimizer

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        for _ in range(max_new_tokens):
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

# --- 3. Data Loading and Preprocessing ---

def process_and_save_data(config):
    # Load dataset from Hugging Face
    dataset = load_dataset(config.dataset, split='train')

    # Split dataset
    train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
    ds = DatasetDict({
        'train': train_test_split['train'],
        'validation': train_test_split['test']
    })

    # Initialize tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.add_special_tokens({'pad_token': '<PAD>', 'eos_token': '<EOS>'})

    def tokenize_function(examples):
        return tokenizer(examples['text'], truncation=True, max_length=config.block_size, padding="max_length")

    tokenized_ds = ds.map(
        tokenize_function,
        batched=True,
        num_proc=4,
        remove_columns=["text"],
        load_from_cache_file=False,
        desc="Tokenizing dataset"
    )

    for split, d in tokenized_ds.items():
        arr_len = len(d)
        filename = os.path.join(config.out_dir, f'{split}.bin')
        dtype = np.uint16
        arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len, config.block_size))

        for i, example in enumerate(tqdm(d, desc=f"Writing {filename}")):
            arr[i] = example['input_ids']
        arr.flush()

# --- 4. Training Loop ---

def get_lr(it, config):
    if it < config.warmup_iters:
        return config.learning_rate * it / config.warmup_iters
    if it > config.lr_decay_iters:
        return config.min_lr
    decay_ratio = (it - config.warmup_iters) / (config.lr_decay_iters - config.warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return config.min_lr + coeff * (config.learning_rate - config.min_lr)

from itertools import cycle

def get_batch_from_stream(data_stream, tokenizer, batch_size, block_size, device):
    """
    Generator function to yield batches of tokenized data from a streaming dataset.
    """
    batch_texts = []
    for example in data_stream:
        # Append the text of the current example
        batch_texts.append(example['text'])
        
        # If we have a full batch, process and yield it
        if len(batch_texts) == batch_size:
            # Tokenize the batch of texts
            tokenized = tokenizer(
                batch_texts, 
                truncation=True, 
                max_length=block_size, 
                padding="max_length", 
                return_tensors="pt"
            )
            
            x = tokenized.input_ids
            # Create the target sequence by shifting the input
            y = torch.roll(x, shifts=-1, dims=1)
            y[:, -1] = -1 # Ignore the last token in the target for loss calculation

            # Move tensors to the correct device
            if device == 'cuda':
                x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
            else:
                x, y = x.to(device), y.to(device)

            yield x, y
            
            # Clear the list for the next batch
            batch_texts = []


@torch.no_grad()
def estimate_loss(model, config, device, val_data_loader, tokenizer): # Modified signature
    out = {}
    model.eval()
    for split in ['validation']: # Only evaluate validation split for speed
        losses = torch.zeros(config.eval_iters)
        for k in range(config.eval_iters):
            X, Y = next(val_data_loader) # Use the passed loader
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    # You can approximate train loss from the last training batch if needed
    # to avoid creating a separate train data loader for estimation.
    out['train'] = loss.item() * config.gradient_accumulation_steps 
    model.train()
    return out

def train_model(config, train_data_loader, val_data_loader, tokenizer): # <-- Accept data loaders and tokenizer
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"--- Starting training on device: {device.upper()} ---") 
    
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    ctx = nullcontext() if device == 'cpu' else torch.amp.autocast(device_type=device, dtype=torch.bfloat16)

    # Use the tokenizer's vocab size, which includes special tokens
    model_config = ModelConfig(
        n_layer=config.n_layer, n_head=config.n_head, n_embd=config.n_embd,
        block_size=config.block_size, bias=config.bias, vocab_size=len(tokenizer), dropout=config.dropout
    )
    model = Pet_SLM(model_config)
    model.to(device)

    scaler = torch.cuda.amp.GradScaler(enabled=(torch.float16 == torch.bfloat16))
    optimizer = model.configure_optimizers(config.weight_decay, config.learning_rate, (config.beta1, config.beta2), device)
    
    iter_num = 0
    best_val_loss = 1e9

    for iter_num in tqdm(range(config.max_iters), desc="Training"):
        lr = get_lr(iter_num, config) if config.decay_lr else config.learning_rate
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        if iter_num > 0 and iter_num % config.eval_interval == 0:
            losses = estimate_loss(model, config, device, val_data_loader, tokenizer) # Pass loader and tokenizer
            print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['validation']:.4f}")
            if losses['validation'] < best_val_loss:
                best_val_loss = losses['validation']
        
        for _ in range(config.gradient_accumulation_steps):
            # Get the next batch from the data loader generator
            X, Y = next(train_data_loader)
            with ctx:
                logits, loss = model(X, Y)
                loss = loss / config.gradient_accumulation_steps
            scaler.scale(loss).backward()
        
        if config.grad_clip != 0.0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip)
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad(set_to_none=True)

    # --- SAVE FINAL MODEL AT THE END ---
    final_checkpoint = {
        'model': model.state_dict(),
        'model_args': model_config,
        'config': config,
    }
    print(f"\nTraining complete. Saving final model to {config.out_dir}")
    torch.save(final_checkpoint, os.path.join(config.out_dir, 'final_model.pt'))


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
if __name__ == '__main__':
    config = SimpleConfig()
    os.makedirs(config.out_dir, exist_ok=True)
    
    # --- 1. SETUP DATA STREAMS (No saving to disk) ---
    print("Setting up data streams...")
    # It's safer to use forward slashes for paths
    new_cache_dir = "H:/SuperTails/huggingface_cache" 
    os.makedirs(new_cache_dir, exist_ok=True)
    os.environ['HF_HOME'] = new_cache_dir
    os.environ['HF_DATASETS_CACHE'] = os.path.join(new_cache_dir, 'datasets')

    # Load dataset in streaming mode
    full_dataset = load_dataset(config.dataset, split='train', streaming=True)
    
    # Split the stream for train and validation
    train_stream = full_dataset.take(70000000) # Use a large portion for training
    val_stream = full_dataset.skip(70000000).take(10000) # Use a small portion for validation

    # Initialize tokenizer and add special tokens
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.add_special_tokens({'pad_token': '<PAD>', 'eos_token': '<EOS>'})
    # Set the pad_token_id for the tokenizer
    tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids('<PAD>')

    # --- 2. CREATE DATA LOADERS ---
    # Use itertools.cycle to endlessly loop over the stream
    train_data_loader = get_batch_from_stream(
        cycle(train_stream), tokenizer, config.batch_size, config.block_size, 'cuda' if torch.cuda.is_available() else 'cpu'
    )
    val_data_loader = get_batch_from_stream(
        cycle(val_stream), tokenizer, config.batch_size, config.block_size, 'cuda' if torch.cuda.is_available() else 'cpu'
    )

    # --- 3. START TRAINING ---
    train_model(config, train_data_loader, val_data_loader, tokenizer)

    print("\nProcess finished.")

Setting up data streams...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


--- Starting training on device: CUDA ---


  scaler = torch.cuda.amp.GradScaler(enabled=(torch.float16 == torch.bfloat16))


number of parameters: 29.92M
num decayed parameter tensors: 26, with 30,014,592 parameters
num non-decayed parameter tensors: 13, with 4,992 parameters
using fused AdamW: True


Training:   6%|▋         | 126/2000 [00:11<02:54, 10.77it/s]


KeyboardInterrupt: 

checking if its running on GPU or not

In [None]:
import torch

if torch.cuda.is_available():
    # Get the number of GPUs
    device_count = torch.cuda.device_count()
    print(f"Found {device_count} CUDA-enabled GPU(s).")

    # Get the name of the current GPU
    current_device_index = torch.cuda.current_device()
    gpu_name = torch.cuda.get_device_name(current_device_index)
    
    print(f"✅ --- Successfully using GPU {current_device_index}: {gpu_name} --- ✅")

else:
    print("❌ --- PyTorch cannot find a CUDA-enabled GPU. Running on CPU. --- ❌")


Found 1 CUDA-enabled GPU(s).
✅ --- Successfully using GPU 0: NVIDIA GeForce RTX 4090 --- ✅


In [None]:
import torch

# 1. Is CUDA available?
is_available = torch.cuda.is_available()
print(f"Is CUDA available? {is_available}")

if is_available:
    # 2. How many GPUs can PyTorch see?
    device_count = torch.cuda.device_count()
    print(f"Number of GPUs available: {device_count}")

    # 3. What is the name of the current GPU?
    current_device = torch.cuda.current_device()
    device_name = torch.cuda.get_device_name(current_device)
    print(f"Current GPU index: {current_device}")
    print(f"Current GPU name: {device_name}")
else:
    print("PyTorch cannot find a CUDA-enabled GPU.")

# 4. What version of PyTorch and CUDA is it built with?
print(f"PyTorch version: {torch.__version__}")
if is_available:
    print(f"PyTorch built with CUDA version: {torch.version.cuda}")

Is CUDA available? True
Number of GPUs available: 1
Current GPU index: 0
Current GPU name: NVIDIA GeForce RTX 4090
PyTorch version: 2.7.1+cu118
PyTorch built with CUDA version: 11.8


In [None]:
import torch

# Is CUDA available?
is_available = torch.cuda.is_available()
print(f"Is CUDA available? {is_available}")

if not is_available:
    print("PyTorch cannot find a CUDA-enabled GPU. This is the root of the problem.")
else:
    print(f"Great! CUDA is available.")
    print(f"PyTorch version: {torch.__version__}")
    print(f"PyTorch built with CUDA version: {torch.version.cuda}")

Is CUDA available? True
Great! CUDA is available.
PyTorch version: 2.7.1+cu118
PyTorch built with CUDA version: 11.8


###  Fine-Tuning Script

In [None]:
import os
import math
import time
import inspect
import json # New: for loading jsonl file
from dataclasses import dataclass
from contextlib import nullcontext

import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm

from transformers import GPT2Tokenizer

# --- Configuration (Adjusted for Fine-tuning) ---

class SimpleConfig:
    # I/O
    out_dir = 'out'
    eval_interval = 50   # Evaluate more often
    log_interval = 1
    eval_iters = 20
    always_save_checkpoint = True
    
    # New: Path to the pre-trained model
    init_from = 'resume' # Can be 'scratch' or 'resume'
    pretrained_ckpt_path = os.path.join(out_dir, 'ckpt.pt')

    # Data
    finetune_data_path = 'unified.jsonl' # New: Path to your jsonl file
    gradient_accumulation_steps = 1
    batch_size = 4  # Use a smaller batch size
    block_size = 256

    # Model (keep the same as pre-training)
    n_layer = 6
    n_head = 6
    n_embd = 384
    dropout = 0.2
    bias = False

    # AdamW optimizer (Adjusted for Fine-tuning)
    learning_rate = 3e-5  # Much lower learning rate
    max_iters = 200      # Fewer iterations are needed
    weight_decay = 1e-1
    beta1 = 0.9
    beta2 = 0.95
    grad_clip = 1.0

    # Learning rate decay settings
    decay_lr = True
    warmup_iters = 20
    lr_decay_iters = 200 # Should be same as max_iters
    min_lr = 3e-6        # Lower min learning rate

# --- Model Definition (Same as before) ---
# ... (The Pet_SLM model and its components are identical, so they are omitted for brevity)
# ... (Just copy the entire Model Definition section from the previous script here)
@dataclass
class ModelConfig:
    block_size: int = 1024
    vocab_size: int = 50257
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768
    dropout: float = 0.0
    bias: bool = True

class LayerNorm(nn.Module):
    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, input):
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)

class SelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                                        .view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size()
        q, k, v  = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        if self.flash:
            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)
        else:
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.resid_dropout(self.c_proj(y))
        return y

class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
        self.gelu    = nn.GELU()
        self.c_proj  = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x

class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
        self.attn = SelfAttention(config)
        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

class Pet_SLM(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = LayerNorm(config.n_embd, bias=config.bias),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.transformer.wte.weight = self.lm_head.weight
        self.apply(self._init_weights)
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))

    def get_num_params(self, non_embedding=True):
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.wpe.weight.numel()
        return n_params

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device)

        tok_emb = self.transformer.wte(idx)
        pos_emb = self.transformer.wpe(pos)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            logits = self.lm_head(x[:, [-1], :])
            loss = None

        return logits, loss

    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        param_dict = {pn: p for pn, p in self.named_parameters()}
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")

        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
        print(f"using fused AdamW: {use_fused}")

        return optimizer

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        for _ in range(max_new_tokens):
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx


# --- NEW: Data Loading and Preprocessing for Fine-tuning ---

def prepare_finetuning_data(config):
    # Load and process the .jsonl file
    all_text = []
    with open(config.finetune_data_path, 'r', encoding='utf-8') as f:
        for line in f:
            # The <EOS> token is a separator, not part of the JSON
            json_str = line.strip().replace('<EOS>', '').strip()
            if json_str:
                data = json.loads(json_str)
                all_text.append(data.get('text', '')) # Safely get the text

    # Split data (90% train, 10% validation)
    train_size = int(0.9 * len(all_text))
    train_data = all_text[:train_size]
    val_data = all_text[train_size:]

    # Initialize tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token # set pad token

    # Tokenize and save to binary files
    for split, data in [('train', train_data), ('validation', val_data)]:
        tokenized_texts = tokenizer(data, truncation=True, max_length=config.block_size, padding="max_length", return_tensors="np")
        
        filename = os.path.join(config.out_dir, f'{split}_finetune.bin')
        dtype = np.uint16
        arr = np.memmap(filename, dtype=dtype, mode='w+', shape=tokenized_texts['input_ids'].shape)
        
        arr[:] = tokenized_texts['input_ids']
        arr.flush()
        print(f"Saved {split} data to {filename}")


# --- Training Loop (Adapted for Fine-tuning) ---

def get_lr(it, config):
    if it < config.warmup_iters:
        return config.learning_rate * it / config.warmup_iters
    if it > config.lr_decay_iters:
        return config.min_lr
    decay_ratio = (it - config.warmup_iters) / (config.lr_decay_iters - config.warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return config.min_lr + coeff * (config.learning_rate - config.min_lr)

def get_batch(split, config, device):
    # Load from the new fine-tuning binary files
    data_path = os.path.join(config.out_dir, f'{split}_finetune.bin')
    data = np.memmap(data_path, dtype=np.uint16, mode='r')
    
    # Reshape data to (num_samples, block_size)
    data = data.reshape(-1, config.block_size)

    # Get random indices
    ix = torch.randint(len(data), (config.batch_size,))
    
    # Grab the batches
    x = torch.from_numpy(data[ix]).to(torch.int64)
    y = torch.from_numpy(data[ix]).to(torch.int64) # In causal LM, y is the same as x
    
    # Move to device
    if device == 'cuda':
        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss(model, config, device):
    out = {}
    model.eval()
    for split in ['train', 'validation']:
        losses = torch.zeros(config.eval_iters)
        for k in range(config.eval_iters):
            X, Y = get_batch(split, config, device)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

def finetune_model(config):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    ctx = nullcontext() if device == 'cpu' else torch.amp.autocast(device_type=device, dtype=torch.bfloat16)

    # --- Model Loading ---
    if config.init_from == 'resume':
        print(f"Resuming training from {config.pretrained_ckpt_path}")
        checkpoint = torch.load(config.pretrained_ckpt_path, map_location=device)
        model_args = checkpoint['model_args']
        model = Pet_SLM(model_args)
        model.load_state_dict(checkpoint['model'])
    else:
        print("Initializing a new model from scratch")
        model_config = ModelConfig(
            n_layer=config.n_layer, n_head=config.n_head, n_embd=config.n_embd,
            block_size=config.block_size, bias=config.bias, vocab_size=50257, dropout=config.dropout
        )
        model = Pet_SLM(model_config)
    
    model.to(device)

    scaler = torch.cuda.amp.GradScaler(enabled=(torch.float16 == torch.bfloat16))
    optimizer = model.configure_optimizers(config.weight_decay, config.learning_rate, (config.beta1, config.beta2), device)
    
    iter_num = 0
    best_val_loss = 1e9
    if config.init_from == 'resume': # If resuming, load optimizer state as well
        optimizer.load_state_dict(checkpoint['optimizer'])
        best_val_loss = checkpoint['best_val_loss']
        iter_num = checkpoint['iter_num']


    for iter_num in tqdm(range(config.max_iters), desc="Fine-tuning"):
        lr = get_lr(iter_num, config) if config.decay_lr else config.learning_rate
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        if iter_num % config.eval_interval == 0 and iter_num > 0:
            losses = estimate_loss(model, config, device)
            print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['validation']:.4f}")
            if losses['validation'] < best_val_loss or config.always_save_checkpoint:
                best_val_loss = losses['validation']
                checkpoint = {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'model_args': model_args,
                    'iter_num': iter_num,
                    'best_val_loss': best_val_loss,
                    'config': config,
                }
                print(f"saving checkpoint to {config.out_dir}")
                # Save to a new fine-tuned checkpoint file
                torch.save(checkpoint, os.path.join(config.out_dir, 'ckpt_finetuned.pt'))
        
        for _ in range(config.gradient_accumulation_steps):
            X, Y = get_batch('train', config, device)
            with ctx:
                logits, loss = model(X, Y)
                loss = loss / config.gradient_accumulation_steps
            scaler.scale(loss).backward()
        
        if config.grad_clip != 0.0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip)
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad(set_to_none=True)


In [None]:

# if __name__ == '__main__':
#     config = SimpleConfig()
#     os.makedirs(config.out_dir, exist_ok=True)
    
#     # 1. Process and save the new fine-tuning data
#     prepare_finetuning_data(config)
    
#     # 2. Run the fine-tuning process
#     finetune_model(config)

continue running from here

In [None]:
config = SimpleConfig()
os.makedirs(config.out_dir, exist_ok=True)

In [None]:
# 1. Process and save the new fine-tuning data
prepare_finetuning_data(config)

In [None]:
# 2. Run the fine-tuning process
finetune_model(config)