# Clone the NanoGPT repository and install necessary dependencies

In [None]:
# Clone the NanoGPT repository
!git clone https://github.com/karpathy/nanoGPT.git

# Navigate into the nanoGPT directory
%cd nanoGPT

# Install required packages
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116
!pip install transformers datasets tiktoken wandb tqdm numpy

Cloning into 'nanoGPT'...
remote: Enumerating objects: 682, done.[K
remote: Total 682 (delta 0), reused 0 (delta 0), pack-reused 682 (from 1)[K
Receiving objects: 100% (682/682), 952.47 KiB | 17.32 MiB/s, done.
Resolving deltas: 100% (385/385), done.
/content/nanoGPT
Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu116
Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting wandb
  Downloading wandb-0.18.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_1

# Prepare the enwik8 Dataset

In [None]:
from datasets import load_dataset

# Load the enwik8 dataset.
# The LTCB/enwik8 repository ships its own loading script, so `datasets` asks for
# confirmation on stdin unless trust_remote_code=True is passed. Passing it keeps
# "Restart & Run All" non-interactive. Only do this for repositories you trust.
dataset = load_dataset("LTCB/enwik8", split="train", trust_remote_code=True)

# Concatenate every record of the 'text' column into one string.
# NOTE(review): this assumes each record still carries its own line terminator;
# if the loader strips them, line breaks are lost here - confirm against the dataset card.
text_data = ''.join(dataset['text'])

# Total number of characters (Unicode code points, not bytes)
total_chars = len(text_data)
print(f"Total number of characters: {total_chars}")

# Set the number of characters for the training set
num_train_chars = 90_000_000

# Ensure we have enough characters for training
assert total_chars >= num_train_chars, "Not enough data for 90 million training characters."

# Whatever follows the training prefix is shared between validation and test:
# validation takes up to 5 million characters, test takes the rest.
remaining_chars = total_chars - num_train_chars
num_valid_chars = min(5_000_000, remaining_chars)
num_test_chars = remaining_chars - num_valid_chars

# Extract three contiguous, non-overlapping slices
valid_end = num_train_chars + num_valid_chars
train_text = text_data[:num_train_chars]
valid_text = text_data[num_train_chars:valid_end]
test_text = text_data[valid_end:]

# The slices must account for every character exactly once
assert len(test_text) == num_test_chars
assert len(train_text) + len(valid_text) + len(test_text) == total_chars

# Save to files. The encoding is stated explicitly because the preparation script
# reads these files back as UTF-8; relying on the platform default could write a
# different encoding (or fail on non-ASCII text) outside of Colab.
for filename, split_text in (
    ('train.txt', train_text),
    ('valid.txt', valid_text),
    ('test.txt', test_text),
):
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(split_text)

print("Data saved to train.txt, valid.txt, and test.txt")

# Verify sizes
print(f"Training characters: {len(train_text)}")
print(f"Validation characters: {len(valid_text)}")
print(f"Test characters: {len(test_text)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


enwik8.py:   0%|          | 0.00/2.94k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.28k [00:00<?, ?B/s]

The repository for LTCB/enwik8 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/LTCB/enwik8.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/36.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1128024 [00:00<?, ? examples/s]

Total number of characters: 97492430
Data saved to train.txt, valid.txt, and test.txt
Training characters: 90000000
Validation characters: 5000000
Test characters: 2492430


# Prepare the Data for NanoGPT

In [None]:
# Create the dataset directory that the later cells read from (data/<dataset>/).
# -p makes this a no-op when the directory already exists.
!mkdir -p data/enwik8

# Move the three split files written by the previous cell into that directory.
# mv removes them from the working directory, so re-running this cell on its own
# fails until the split cell has been run again.
!mv train.txt valid.txt test.txt data/enwik8/

In [None]:
# Create a new script called prepare_enwik8.py
%%writefile data/prepare_enwik8.py
import os
import pickle
import numpy as np

# Directory holding train.txt / valid.txt / test.txt; all outputs are written next
# to them. The path is relative, so run this script from the repository root.
data_dir = 'data/enwik8'


def read_split(filename):
    """Return the whole content of data_dir/<filename>, decoded as UTF-8."""
    with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as f:
        return f.read()


# Read the text files
train_data = read_split('train.txt')
val_data = read_split('valid.txt')
test_data = read_split('test.txt')

# The vocabulary is built from the training set only, so validation/test text
# can contain characters that have no id (handled in encode_split below).
chars = sorted(set(train_data))
vocab_size = len(chars)
print(f"Unique characters: {vocab_size}")

# Ids are stored as uint16 below; refuse to continue if they would not fit,
# because numpy would otherwise wrap the values silently.
assert vocab_size <= np.iinfo(np.uint16).max + 1, \
    f"vocab_size {vocab_size} does not fit into uint16"

# Create mappings from characters to integers and vice versa
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}

# Save the mappings for later use (evaluate.py reads vocab_size from this file)
meta = {
    'vocab_size': vocab_size,
    'itos': itos,
    'stoi': stoi,
}
with open(os.path.join(data_dir, 'meta.pkl'), 'wb') as f:
    pickle.dump(meta, f)


def encode(s):
    """Return the list of ids for the characters of `s` that are in the vocabulary.

    Characters without an entry in `stoi` are skipped, so the result can be
    shorter than `s`.
    """
    return [stoi[c] for c in s if c in stoi]


def encode_split(split_name, text):
    """Encode one split to a uint16 array and report skipped characters.

    Skipping out-of-vocabulary characters changes the length of the split, which
    matters when per-character metrics are compared, so it is reported instead of
    happening silently.
    """
    ids = np.array(encode(text), dtype=np.uint16)
    skipped = len(text) - len(ids)
    if skipped:
        print(f"WARNING: {split_name}: skipped {skipped} of {len(text)} characters "
              f"that do not occur in the training vocabulary")
    return ids


# Encode the data and convert to numpy arrays
train_ids = encode_split('train', train_data)
val_ids = encode_split('val', val_data)
test_ids = encode_split('test', test_data)

# Save the encoded data to binary files (raw uint16, no header)
train_ids.tofile(os.path.join(data_dir, 'train.bin'))
val_ids.tofile(os.path.join(data_dir, 'val.bin'))
test_ids.tofile(os.path.join(data_dir, 'test.bin'))

print("Data preparation complete.")

Writing data/prepare_enwik8.py


In [None]:
# Run the preparation script from the repository root: its data_dir is the
# relative path 'data/enwik8', so the current directory matters.
!python data/prepare_enwik8.py

Unique characters: 5486
Data preparation complete.


In [None]:
%%writefile config/enwik8_char_modified.py
import math

# ---- run identity -----------------------------------------------------------
# Where checkpoints and logs of the modified run are written.
out_dir = 'out-enwik8-char-modified'
dataset = 'enwik8'
init_from = 'scratch'    # start from randomly initialised weights
model_type = 'modified'  # select the modified architecture instead of the stock GPT

# ---- evaluation, logging, checkpointing -------------------------------------
eval_interval = 500
eval_iters = 200
log_interval = 100
always_save_checkpoint = True  # write a checkpoint at every evaluation

wandb_log = False
wandb_project = 'enwik8-char'
wandb_run_name = 'gpt2-enwik8-char-modified'

# ---- batch shape ------------------------------------------------------------
batch_size = 64    # sequences per step; lower this if the GPU runs out of memory
block_size = 256   # context length in characters
gradient_accumulation_steps = 1

# ---- architecture -----------------------------------------------------------
n_layer = 10
n_head = 8
n_embd = 512
dropout = 0.1
# In model_modified.py this flag is passed to the Linear layers of the blocks;
# the LayerNorm modules there are created without consulting it.
bias = False

# ---- optimiser and learning-rate schedule -----------------------------------
learning_rate = 1e-3
min_lr = 1e-4
decay_lr = True
warmup_iters = 100
lr_decay_iters = 5000
max_iters = 5000
beta1 = 0.9
beta2 = 0.99
weight_decay = 0.1
grad_clip = 1.0

# ---- system -----------------------------------------------------------------
device = 'cuda'
dtype = 'float16'
compile = False  # torch.compile stays off for this run

Overwriting config/enwik8_char_modified.py


In [None]:
%%writefile model_modified.py
import math
import torch
import torch.nn as nn
from torch.nn import functional as F
from model import GPTConfig, GPT
import inspect

class ModifiedGPT(nn.Module):
    """GPT-style decoder-only language model with additional positional embeddings.

    What distinguishes this class, as far as this file shows:

    * The input to the first block is
      ``wte(idx) + wpe(pos) + pos_emb_sin + pos_emb_learned``.
      ``pos_emb_sin`` starts as a sinusoidal table and ``pos_emb_learned`` starts at
      zero; both are ``nn.Parameter`` objects, so both are updated during training.
    * ``lm_head`` has its own weight matrix; it is not tied to ``transformer.wte``.
    * ``ln_f`` is a plain ``nn.LayerNorm`` and keeps its bias regardless of ``config.bias``.

    NOTE(review): ``pos_emb_sin`` is a 3-D parameter and therefore ends up in the
    weight-decay group of ``configure_optimizers`` - confirm that decaying the
    sinusoidal table is intended.
    """

    # Parameters that exist only in this architecture (no GPT-2 checkpoint counterpart).
    _EXTRA_POS_PARAMS = ('pos_emb_sin', 'pos_emb_learned')

    def __init__(self, config):
        """Build the model from ``config``.

        ``config`` must provide vocab_size, block_size, n_embd, n_layer, dropout and
        whatever ``Block`` reads from it.
        """
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd)
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # Two extra positional tables of shape (1, block_size, n_embd):
        # one filled with sinusoids below, one left at zero.
        self.pos_emb_sin = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd))
        self.pos_emb_learned = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd))
        self._init_sin_pos_emb()

        # Initialize weights. _init_weights only touches Linear/Embedding modules, so the
        # two tables above keep the values they were just given.
        self.apply(self._init_weights)
        # Apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        # Report number of parameters
        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))

    def get_num_params(self, non_embedding=True):
        """Return the number of parameters in the model.

        With ``non_embedding=True`` (default) the ``transformer.wpe`` table is
        subtracted. Nothing else is: the token embedding, the separate ``lm_head``
        matrix (this class does not tie the two) and the extra ``pos_emb_sin`` /
        ``pos_emb_learned`` tables are all included in the count.
        """
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.transformer.wpe.weight.numel()
        return n_params

    def _init_weights(self, module):
        """Normal(0, 0.02) init for Linear and Embedding weights; zero for Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def _init_sin_pos_emb(self):
        """Fill ``pos_emb_sin`` with sinusoids: sin on even columns, cos on odd columns."""
        n_embd = self.config.n_embd
        position = torch.arange(0, self.config.block_size).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, n_embd, 2) * (-math.log(10000.0) / n_embd))
        pe = torch.zeros(1, self.config.block_size, n_embd)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        # For an odd n_embd there is one odd column fewer than even columns, so the
        # frequency vector is trimmed to fit; for an even n_embd the slice is a no-op.
        pe[0, :, 1::2] = torch.cos(position * div_term[: n_embd // 2])
        with torch.no_grad():
            self.pos_emb_sin.copy_(pe)

    def forward(self, idx, targets=None):
        """Run the model on token ids ``idx`` of shape (b, t).

        Returns ``(logits, loss)``. With ``targets`` the logits cover every position
        and ``loss`` is the cross entropy (targets equal to -1 are ignored). Without
        ``targets`` only the last position is projected, so the logits have a time
        dimension of 1, and ``loss`` is None.
        """
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)

        # Token embeddings
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)

        # Sum of all three positional signals, each cut to the first t positions
        pos_emb = self.pos_emb_sin[:, :t, :] + self.pos_emb_learned[:, :t, :] + self.transformer.wpe(pos)

        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
            loss = None

        return logits, loss

    def crop_block_size(self, new_block_size):
        """Shrink the model's context length to ``new_block_size`` in place.

        Every position-indexed tensor is cut: ``wpe``, the two extra positional
        tables and the causal-mask buffers of the attention layers. The extra tables
        are cropped as well so that the state dict stays consistent with a model
        constructed with the smaller block size.
        """
        assert new_block_size <= self.config.block_size
        self.config.block_size = new_block_size
        self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:new_block_size])
        for name in self._EXTRA_POS_PARAMS:
            setattr(self, name, nn.Parameter(getattr(self, name)[:, :new_block_size, :]))
        for block in self.transformer.h:
            if hasattr(block.attn, 'bias'):
                block.attn.bias = block.attn.bias[:,:,:new_block_size,:new_block_size]

    @classmethod
    def from_pretrained(cls, model_type, override_args=None):
        """Build a ModifiedGPT and copy the weights of a Hugging Face GPT-2 checkpoint into it.

        ``model_type`` is one of 'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'.
        ``override_args`` may only contain 'dropout'.

        ``pos_emb_sin`` and ``pos_emb_learned`` have no counterpart in the checkpoint
        and keep their initial values, so the returned model does not reproduce the
        outputs of the original GPT-2 exactly.
        """
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        override_args = override_args or {} # default to empty dict
        # only dropout can be overridden
        assert all(k == 'dropout' for k in override_args)
        from transformers import GPT2LMHeadModel
        print("loading weights from pretrained gpt: %s" % model_type)

        # n_layer, n_head and n_embd are determined from model_type
        config_args = {
            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
        }[model_type]
        print("forcing vocab_size=50257, block_size=1024, bias=True")
        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
        config_args['bias'] = True # always True for GPT model checkpoints
        # we can override the dropout rate, if desired
        if 'dropout' in override_args:
            print(f"overriding dropout rate to {override_args['dropout']}")
            config_args['dropout'] = override_args['dropout']
        # create a from-scratch initialized model
        config = GPTConfig(**config_args)
        model = ModifiedGPT(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
        # The extra positional tables exist only in this model; leaving them in would
        # make the key-count check below fail for every checkpoint.
        sd_keys = [k for k in sd_keys if k not in cls._EXTRA_POS_PARAMS]

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
        # this means that we have to transpose these weights when we import them
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model

    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        """Create the AdamW optimizer with two parameter groups.

        Trainable parameters with two or more dimensions get ``weight_decay``;
        parameters with fewer dimensions get none. The fused AdamW kernel is used
        when this torch version offers it and ``device_type`` is 'cuda'.
        """
        # all parameters that require grad are candidates
        trainable = [p for _, p in self.named_parameters() if p.requires_grad]
        # Any parameter with >= 2 dims is weight decayed, otherwise not. This puts matmul
        # weights, embeddings and the 3-D positional tables in the decay group, and
        # biases and LayerNorm parameters in the other.
        decay_params = [p for p in trainable if p.dim() >= 2]
        nodecay_params = [p for p in trainable if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
        print(f"using fused AdamW: {use_fused}")

        return optimizer

    def estimate_mfu(self, fwdbwd_per_iter, dt):
        """Estimate model flops utilization (MFU) as a fraction of A100 bfloat16 peak FLOPS.

        ``fwdbwd_per_iter`` is the number of forward/backward passes per iteration and
        ``dt`` the duration of one iteration in seconds.
        """
        # first estimate the number of flops we do per iteration.
        # see PaLM paper Appendix B as ref: https://arxiv.org/abs/2204.02311
        N = self.get_num_params()
        cfg = self.config
        L, H, Q, T = cfg.n_layer, cfg.n_head, cfg.n_embd//cfg.n_head, cfg.block_size
        flops_per_token = 6*N + 12*L*H*Q*T
        flops_per_fwdbwd = flops_per_token * T
        flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
        # express our flops throughput as ratio of A100 bfloat16 peak flops
        flops_achieved = flops_per_iter * (1.0/dt) # per second
        flops_promised = 312e12 # A100 GPU bfloat16 peak flops is 312 TFLOPS
        mfu = flops_achieved / flops_promised
        return mfu

class Block(nn.Module):
    """One transformer layer: attention and MLP, each applied to a layer-normed
    input and added back onto the residual stream."""

    def __init__(self, config):
        """Create the two LayerNorms and the attention / MLP sub-modules from ``config``."""
        super().__init__()
        width = config.n_embd
        self.ln_1 = nn.LayerNorm(width)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(width)
        self.mlp = MLP(config)

    def forward(self, x):
        """Return ``x`` after the attention residual step followed by the MLP residual step."""
        after_attn = x + self.attn(self.ln_1(x))
        return after_attn + self.mlp(self.ln_2(after_attn))

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention in which a position attends only to itself and
    to earlier positions."""

    def __init__(self, config):
        """Create the fused q/k/v projection, the output projection, both dropouts
        and the lower-triangular mask buffer ``bias``."""
        super().__init__()
        assert config.n_embd % config.n_head == 0
        width = config.n_embd
        # one Linear produces queries, keys and values side by side
        self.c_attn = nn.Linear(width, 3 * width, bias=config.bias)
        self.c_proj = nn.Linear(width, width, bias=config.bias)
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = width
        self.dropout = config.dropout
        # ones on and below the diagonal, shaped (1, 1, block_size, block_size)
        max_len = config.block_size
        causal_mask = torch.tril(torch.ones(max_len, max_len)).view(1, 1, max_len, max_len)
        self.register_buffer("bias", causal_mask)

    def forward(self, x):
        """Apply causal self-attention to ``x`` of shape (B, T, C); the result has the same shape."""
        batch, seq_len, width = x.size()
        head_dim = width // self.n_head

        def to_heads(t):
            # (B, T, C) -> (B, n_head, T, head_dim)
            return t.view(batch, seq_len, self.n_head, head_dim).transpose(1, 2)

        q, k, v = (to_heads(part) for part in self.c_attn(x).split(self.n_embd, dim=2))

        # scaled dot-product scores; positions where the mask is zero are set to -inf
        scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        visible = self.bias[:, :, :seq_len, :seq_len]
        scores = scores.masked_fill(visible == 0, float('-inf'))
        weights = self.attn_dropout(F.softmax(scores, dim=-1))

        # weighted sum of the values, then merge the heads back into one channel axis
        merged = (weights @ v).transpose(1, 2).contiguous().view(batch, seq_len, width)
        return self.resid_dropout(self.c_proj(merged))

class MLP(nn.Module):
    """Position-wise feed-forward network: expand to 4 * n_embd, apply GELU,
    project back to n_embd, then dropout."""

    def __init__(self, config):
        """Create the two Linear layers and the dropout module from ``config``."""
        super().__init__()
        width = config.n_embd
        hidden_width = 4 * width
        self.c_fc = nn.Linear(width, hidden_width, bias=config.bias)
        self.c_proj = nn.Linear(hidden_width, width, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Return the feed-forward transformation of ``x``."""
        activated = new_gelu(self.c_fc(x))
        return self.dropout(self.c_proj(activated))

def new_gelu(x):
    """Tanh approximation of the GELU activation, applied element-wise to tensor ``x``."""
    inner = x + 0.044715 * torch.pow(x, 3.0)
    gate = torch.tanh(math.sqrt(2.0 / math.pi) * inner)
    return 0.5 * x * (1.0 + gate)

Writing model_modified.py


In [None]:
%%writefile evaluate.py
import torch
import numpy as np
import argparse
import pickle
import math
from model import GPTConfig, GPT
from model_modified import ModifiedGPT  # Import your modified model

def evaluate(model, data_loader, device):
    """Return the average loss of ``model`` over every batch of ``data_loader``.

    Args:
        model: object with an ``eval()`` method that is callable as ``model(x, y)``
            and returns ``(logits, loss)`` with ``loss`` a scalar tensor.
        data_loader: iterable of ``(x, y)`` tensor pairs.
        device: device the batches are moved to; a string such as 'cuda', 'cuda:0'
            or 'cpu', or a ``torch.device``.

    Returns:
        The mean loss as a float, weighted by the number of target elements per
        batch so that a smaller final batch does not count as much as a full one.
        ``nan`` when the loader yields no data.

    Side effect: the model is switched to eval mode and left there.
    """
    model.eval()
    # autocast expects the bare device type ('cuda' / 'cpu'), not an indexed device
    device_type = torch.device(device).type
    weighted_loss_sum = 0.0
    target_count = 0
    with torch.no_grad():
        for x, y in data_loader:
            x = x.to(device)
            y = y.to(device)
            with torch.amp.autocast(device_type=device_type):
                _, loss = model(x, y)
            # The per-batch loss is treated as a mean over the targets in y.
            # NOTE(review): this assumes no target is excluded by the model's own
            # loss (e.g. an ignore index); verify if such targets are introduced.
            batch_targets = y.numel()
            weighted_loss_sum += loss.item() * batch_targets
            target_count += batch_targets
    if target_count == 0:
        return float('nan')
    return weighted_loss_sum / target_count

if __name__ == '__main__':
    # Command-line interface: which architecture to build, which dataset directory
    # under data/ to read, and which checkpoint file to score.
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_type', type=str, default='gpt', help='Model type: gpt or modified')
    parser.add_argument('--dataset', type=str, default='enwik8', help='Dataset name')
    parser.add_argument('--checkpoint', type=str, required=True, help='Checkpoint file')
    args = parser.parse_args()

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Load the checkpoint. torch.load unpickles the file, so only use it on
    # checkpoints that come from a trusted source.
    checkpoint = torch.load(args.checkpoint, map_location=device)

    # Training configuration stored alongside the weights
    ckpt_config = checkpoint['config']

    # vocab_size is taken from the dataset's meta.pkl (written by the preparation
    # script) and overrides whatever the checkpoint configuration contains.
    with open(f"data/{args.dataset}/meta.pkl", 'rb') as f:
        meta = pickle.load(f)
    vocab_size = meta['vocab_size']
    ckpt_config['vocab_size'] = vocab_size

    # Keep only the keys that are passed on to GPTConfig
    valid_config_keys = ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size', 'dropout']
    model_config_kwargs = {k: ckpt_config[k] for k in valid_config_keys if k in ckpt_config}

    # Model configuration
    model_config = GPTConfig(**model_config_kwargs)

    # Instantiate the model; either the command line or the checkpoint can select
    # the modified architecture.
    if args.model_type == 'modified' or ckpt_config.get('model_type') == 'modified':
        model = ModifiedGPT(model_config)
        print("Using ModifiedGPT model.")
    else:
        model = GPT(model_config)
        print("Using GPT model.")

    # Load the model state.
    state_dict = checkpoint['model']
    # Checkpoints saved from a torch.compile'd model prefix every key with
    # '_orig_mod.'; strip it so those keys match the uncompiled module.
    compiled_prefix = '_orig_mod.'
    for key in list(state_dict.keys()):
        if key.startswith(compiled_prefix):
            state_dict[key[len(compiled_prefix):]] = state_dict.pop(key)
    # strict=False tolerates differing keys, which would otherwise go unnoticed and
    # leave the affected weights at their random initialisation. Report them so a
    # wrong --model_type or a mismatched checkpoint is visible in the output.
    load_result = model.load_state_dict(state_dict, strict=False)
    if load_result.missing_keys:
        print(f"WARNING: {len(load_result.missing_keys)} model parameters were not found "
              f"in the checkpoint and keep their initial values: {load_result.missing_keys}")
    if load_result.unexpected_keys:
        print(f"WARNING: {len(load_result.unexpected_keys)} checkpoint entries were not used "
              f"by the model: {load_result.unexpected_keys}")

    model.to(device)

    # Prepare data loader
    block_size = ckpt_config['block_size']
    batch_size = ckpt_config.get('batch_size', 64)  # Default to 64 if not specified

    # Load validation data (raw uint16 ids) and widen to int64 for use as indices/targets
    val_data = np.memmap(f'data/{args.dataset}/val.bin', dtype=np.uint16, mode='r')
    val_data = torch.from_numpy(val_data.astype(np.int64))

    # Inputs are the sequence without its last token, targets the sequence shifted by one
    num_tokens = len(val_data) - 1  # Subtract 1 to prevent index overflow
    x_tokens = val_data[:num_tokens]
    y_tokens = val_data[1:num_tokens+1]

    # Keep only whole blocks; the tail that does not fill a block is not evaluated
    num_batches = num_tokens // block_size
    x_tokens = x_tokens[:num_batches * block_size]
    y_tokens = y_tokens[:num_batches * block_size]

    # Reshape into non-overlapping rows of block_size tokens
    x_batches = x_tokens.view(-1, block_size)
    y_batches = y_tokens.view(-1, block_size)

    val_dataset = torch.utils.data.TensorDataset(x_batches, y_batches)
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False
    )

    # Evaluate; the loss is in nats, dividing by ln(2) converts it to bits
    val_loss = evaluate(model, val_loader, device)
    bpc = val_loss / math.log(2)
    print(f"Validation Loss: {val_loss:.4f}, Bits per character (bpc): {bpc:.4f}")

Writing evaluate.py


In [None]:
# Train the modified model with the settings written to config/enwik8_char_modified.py.
# NOTE(review): the --config flag and the model_type switch are handled by train.py,
# which is not defined in this notebook - confirm the script in the working tree supports them.
!python train.py --config config/enwik8_char_modified.py

Output directory: out-enwik8-char-modified
Tokens per iteration will be: 16,384
Found vocab_size = 5486 (inside data/enwik8/meta.pkl)
number of parameters: 37.36M
Using ModifiedGPT model.
  scaler = torch.cuda.amp.GradScaler(enabled=(config['dtype'] == 'float16'))
num decayed parameter tensors: 45, with 37,468,160 parameters
num non-decayed parameter tensors: 42, with 21,504 parameters
using fused AdamW: True
Number of parameters: 37.49M
Training Progress:   0% 0/5000 [00:00<?, ?it/s]
Step 0: train loss 8.6103, val loss 8.6117
Saved checkpoint to: out-enwik8-char-modified/ckpt.pt
Iter 0: loss 8.6161, time 2211.48ms, mfu -100.00%
Training Progress:   2% 100/5000 [02:41<51:34,  1.58it/s]Iter 100: loss 3.4779, time 631.06ms, mfu 2.00%
Training Progress:   4% 200/5000 [03:44<50:45,  1.58it/s]Iter 200: loss 3.0020, time 634.25ms, mfu 2.00%
Training Progress:   6% 300/5000 [04:47<49:42,  1.58it/s]Iter 300: loss 2.6974, time 634.89ms, mfu 1.99%
Training Progress:   8% 400/5000 [05:51<48:36,  

# Run the Baseline Model

In [None]:
# config/enwik8_char_baseline.py
%%writefile config/enwik8_char_baseline.py
# ---- run identity -----------------------------------------------------------
# Where checkpoints and logs of the baseline run are written.
out_dir = 'out-enwik8-char'
dataset = 'enwik8'
init_from = 'scratch'  # start from randomly initialised weights

# ---- evaluation, logging, checkpointing -------------------------------------
eval_interval = 500
eval_iters = 200
log_interval = 100
always_save_checkpoint = True  # write a checkpoint at every evaluation

wandb_log = False
wandb_project = 'enwik8-char'
wandb_run_name = 'gpt2-enwik8-char-baseline'

# ---- batch shape ------------------------------------------------------------
batch_size = 64    # sequences per step
block_size = 256   # context length in characters
gradient_accumulation_steps = 1

# ---- architecture -----------------------------------------------------------
n_layer = 12
n_head = 8
n_embd = 384
dropout = 0.1
bias = False

# ---- optimiser and learning-rate schedule -----------------------------------
learning_rate = 5e-4
min_lr = 1e-5
decay_lr = True
warmup_iters = 100
lr_decay_iters = 5000
max_iters = 5000
beta1 = 0.9
beta2 = 0.95
weight_decay = 0.1
grad_clip = 1.0

# ---- system -----------------------------------------------------------------
device = 'cuda'
dtype = 'float16'
compile = False  # torch.compile stays off for this run

Writing config/enwik8_char_baseline.py


In [None]:
# Train the baseline GPT with the settings written to config/enwik8_char_baseline.py.
# NOTE(review): the --config flag is handled by train.py, which is not defined in
# this notebook - confirm the script in the working tree supports it.
!python train.py --config config/enwik8_char_baseline.py

Output directory: out-enwik8-char
Tokens per iteration will be: 16,384
Found vocab_size = 5486 (inside data/enwik8/meta.pkl)
number of parameters: 23.35M
Using GPT model.
  scaler = torch.cuda.amp.GradScaler(enabled=(config['dtype'] == 'float16'))
num decayed parameter tensors: 50, with 23,438,592 parameters
num non-decayed parameter tensors: 25, with 9,600 parameters
using fused AdamW: True
Number of parameters: 23.45M
Training Progress:   0% 0/5000 [00:00<?, ?it/s]
Step 0: train loss 8.5904, val loss 8.5955
Saved checkpoint to: out-enwik8-char/ckpt.pt
Iter 0: loss 8.6010, time 1576.30ms, mfu -100.00%
Training Progress:   2% 100/5000 [00:54<19:04,  4.28it/s]Iter 100: loss 2.6764, time 234.54ms, mfu 3.45%
Training Progress:   4% 200/5000 [01:18<19:00,  4.21it/s]Iter 200: loss 2.4852, time 242.10ms, mfu 3.44%
Training Progress:   6% 300/5000 [01:42<19:20,  4.05it/s]Iter 300: loss 2.2668, time 248.20ms, mfu 3.42%
Training Progress:   8% 400/5000 [02:07<19:10,  4.00it/s]Iter 400: loss 2.1

In [None]:
# Sanity check: the directory should contain the three text splits, meta.pkl and
# the three .bin files that evaluate.py and training read.
!ls data/enwik8/

meta.pkl  test.bin  test.txt  train.bin  train.txt  val.bin  valid.txt


# Evaluate the Baseline Model

In [None]:
# Score the baseline checkpoint on data/enwik8/val.bin (evaluate.py's default dataset).
!python evaluate.py --model_type gpt --checkpoint out-enwik8-char/ckpt.pt

  checkpoint = torch.load(args.checkpoint, map_location=device)
number of parameters: 23.35M
Using GPT model.
Validation Loss: 1.1195, Bits per character (bpc): 1.6151


# Evaluate the Modified Model

In [None]:
# Score the modified model on data/enwik8/val.bin (evaluate.py's default dataset).
# NOTE(review): this reads final_ckpt.pt while the baseline evaluation reads ckpt.pt;
# confirm both files correspond to the same point in training before comparing the numbers.
!python evaluate.py --model_type modified --checkpoint out-enwik8-char-modified/final_ckpt.pt

  checkpoint = torch.load(args.checkpoint, map_location=device)
number of parameters: 37.36M
Using ModifiedGPT model.
Validation Loss: 1.1037, Bits per character (bpc): 1.5923


In [None]:
!pip install tabulate



In [None]:
from tabulate import tabulate

# Measured results, entered once and reused for both the table and the deltas.
# The values are copied by hand from the training / evaluation outputs above.
BASELINE_PARAMS_M, MODIFIED_PARAMS_M = 23.35, 37.36
BASELINE_VAL_LOSS, MODIFIED_VAL_LOSS = 1.1195, 1.1037
BASELINE_BPC, MODIFIED_BPC = 1.6151, 1.5923

# One row per metric: label, baseline value, modified value (already formatted)
data = [
    ["Number of Parameters", f"{BASELINE_PARAMS_M:.2f}M", f"{MODIFIED_PARAMS_M:.2f}M"],
    ["Validation Loss", f"{BASELINE_VAL_LOSS:.4f}", f"{MODIFIED_VAL_LOSS:.4f}"],
    ["Bits per Character (BPC)", f"{BASELINE_BPC:.4f}", f"{MODIFIED_BPC:.4f}"],
]

# Render as a pipe (Markdown-style) table
headers = ["Metric", "Baseline Model (GPT)", "Modified Model (ModifiedGPT)"]
table = tabulate(data, headers, tablefmt="pipe")
print(table)

# Differences are modified minus baseline; for loss and BPC a negative difference
# is an improvement, hence the sign flip when printing.
# Note that the two runs differ in size and in optimiser settings (see the two
# config files), so the deltas are not attributable to the architecture alone.
param_diff = MODIFIED_PARAMS_M - BASELINE_PARAMS_M
loss_diff = MODIFIED_VAL_LOSS - BASELINE_VAL_LOSS
bpc_diff = MODIFIED_BPC - BASELINE_BPC
param_increase_pct = param_diff/BASELINE_PARAMS_M*100

print("\nDifferences (Modified - Baseline):")
print(f"Parameter Increase: {param_diff:.2f}M ({param_increase_pct:.2f}% increase)")
print(f"Validation Loss Improvement: {-loss_diff:.4f}")
print(f"BPC Improvement: {-bpc_diff:.4f}")

| Metric                   | Baseline Model (GPT)   | Modified Model (ModifiedGPT)   |
|:-------------------------|:-----------------------|:-------------------------------|
| Number of Parameters     | 23.35M                 | 37.36M                         |
| Validation Loss          | 1.1195                 | 1.1037                         |
| Bits per Character (BPC) | 1.6151                 | 1.5923                         |

Differences (Modified - Baseline):
Parameter Increase: 14.01M (60.00% increase)
Validation Loss Improvement: 0.0158
BPC Improvement: 0.0228
