In [8]:
!pip install wandb
!pip install rotary-embedding-torch
!pip install tiktoken



In [None]:
# ! pip install datasets
# from datasets import load_dataset
# dataset = load_dataset("Skylion007/openwebtext", split="train")

In [9]:
!pip install rotary-embedding-torch



In [10]:
import os
import time
import math
import pickle
import numpy as np
import wandb
from typing import Tuple
import torch
import torch.nn.functional as F
from torch import nn
import tiktoken
import gc

import inspect
from rotary_embedding_torch import RotaryEmbedding

from torch.utils.data import Dataset, DataLoader, random_split
from tiktoken import get_encoding

In [11]:
class ModelConfig:
    """Hyperparameters for the GPT model and its pre-training run.

    Every setting is a plain class attribute, so values can be read from the
    class itself or from an instance created with ``ModelConfig()``.
    """

    # ---- batching / vocabulary ----
    batch_size: int = 5       # sequences drawn per micro-batch by get_batch
    block_size: int = 1024    # context length in tokens
    vocab_size: int = 50304   # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency

    # ---- architecture ----
    dim: int = 512            # NOTE(review): not read by the model code in the visible cells (n_embd is used instead)
    n_layers: int = 8         # number of transformer blocks
    n_heads: int = 8          # attention heads per block; must divide n_embd
    max_seq_len: int = 512    # NOTE(review): not read by the visible cells (block_size is used instead)
    layer_norm_eps: float = 1e-6   # eps handed to every RMSNorm
    dropout: float = 0.0
    hidden_dim: int = None    # None -> FeedForward derives the width from n_embd and multiple_of
    n_embd: int = 1024        # embedding / residual stream width
    multiple_of: int = 32     # derived FeedForward width is rounded up to a multiple of this
    rope_dim: int = 64        # dimension passed to RotaryEmbedding
    bias: bool = True         # bias terms in the attention projections

    # ---- optimizer ----
    weight_decay: float = 1e-1
    betas: tuple = (0.9, 0.99)
    learning_rate: float = 0.001
    grad_clip: float = 1.0    # 0.0 disables gradient clipping in the training loop
    gradient_accumulation_steps: int = 2

    # ---- learning-rate schedule ----
    decay_lr: bool = True     # False -> constant learning_rate
    warmup_iters: int = 0
    lr_decay_iters: int = 150
    min_lr: float = 6e-5

    # ---- evaluation, logging, checkpoints ----
    eval_iters: int = 50      # batches averaged per split in estimate_loss
    eval_interval: int = 100
    eval_only: bool = False
    log_interval: int = 5
    save_checkpoint_iters: int = 100
    max_iters: int = 5000
    master_process: bool = True
    wandb_log: bool = True
    wandb_project: str = 'HW5'
    wandb_run_name: str = 'Pretrain_29_4_v1'

In [12]:
! nvidia-smi

Fri May  3 03:29:53 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8               9W /  70W |      3MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [13]:
# Shared settings object plus the device every later cell runs on.
config = ModelConfig()
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# device_type drops any index suffix and is what the CUDA-only code paths test.
device_type = 'cpu' if 'cuda' not in device else 'cuda'

In [14]:
# Directory that is created by the setup cell and receives the training-loop checkpoints.
out_dir = '/content/pretrained_model'

In [16]:
seed_offset = 0

# Tokens consumed by one optimizer step (all micro-batches together).
tokens_per_iter = config.gradient_accumulation_steps * config.batch_size * config.block_size
print(f"tokens per iteration will be: {tokens_per_iter:,}")

os.makedirs(out_dir, exist_ok=True)

torch.manual_seed(1337 + seed_offset)
# Allow TF32 for matmuls and cuDNN kernels.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Pick the device from what is actually available instead of forcing 'cuda',
# and keep device_type in step with it so get_batch and the optimizer setup
# take the matching code path.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device_type = 'cuda' if 'cuda' in device else 'cpu'

# dtype only decides whether the GradScaler created later is enabled
# (it is enabled for 'float16'); the forward passes in this notebook run
# outside any autocast context.
if device_type == 'cuda':
    dtype = 'bfloat16' if torch.cuda.is_bf16_supported() else 'float16'
else:
    dtype = 'float32'

# Token streams are stored as flat uint16 arrays and memory-mapped read-only.
data_dir = '/content/data'
train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
# Use only 70% of the training data.
train_data = train_data[:int(0.7*len(train_data))]
val_data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')

# get_batch draws start offsets from range(len(data) - block_size), which must be non-empty.
assert len(train_data) > config.block_size, "train.bin is shorter than one block"
assert len(val_data) > config.block_size, "val.bin is shorter than one block"

def get_batch(split):
    """Sample a random batch of (input, target) token windows.

    Args:
        split: 'train' selects ``train_data``; any other value selects ``val_data``.

    Returns:
        Tuple ``(x, y)`` of int64 tensors on ``device``, each of shape
        (config.batch_size, config.block_size); ``y`` is ``x`` shifted one
        token to the right.
    """
    source = train_data if split == 'train' else val_data
    span = config.block_size
    starts = torch.randint(len(source) - span, (config.batch_size,))

    def _window(lo):
        # Copy one slice of the uint16 token array into an int64 tensor.
        return torch.from_numpy((source[lo:lo + span]).astype(np.int64))

    x = torch.stack([_window(i) for i in starts])
    y = torch.stack([_window(i + 1) for i in starts])

    if device_type == 'cuda':
        # Pinned host memory lets the host-to-device copy run asynchronously.
        x = x.pin_memory().to(device, non_blocking=True)
        y = y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y

# Run a garbage-collection pass; the number shown as cell output is gc.collect()'s return value.
gc.collect()

tokens per iteration will be: 10,240


67

In [26]:
class RMSNorm(torch.nn.Module):
    """Root-mean-square normalization over the last dimension with a learnable gain."""

    def __init__(self, dim: int, eps: float = 1e-6):
        """
        Build the layer.
        Args:
            dim (int): Size of the last dimension of the inputs.
            eps (float, optional): Added under the square root for numerical stability. Default is 1e-6.
        Attributes:
            eps (float): Stability constant.
            weight (nn.Parameter): Per-feature gain, initialised to ones.
        """
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        """
        Scale ``x`` by the reciprocal of its root mean square along the last dimension.
        Args:
            x (torch.Tensor): Tensor to normalize.
        Returns:
            torch.Tensor: ``x`` divided by sqrt(mean(x**2) + eps).
        """
        mean_square = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.eps)

    def forward(self, x):
        """
        Normalize in float32, cast back to the input dtype, then apply the gain.
        Args:
            x (torch.Tensor): Input tensor.
        Returns:
            torch.Tensor: Normalized tensor multiplied by ``weight``.
        """
        normalized = self._norm(x.float())
        normalized = normalized.type_as(x)
        return normalized * self.weight

class FeedForward(nn.Module):
    """Position-wise feed-forward network with a SwiGLU gate and no bias terms."""

    def __init__(self, dim: int, hidden_dim: int, multiple_of: int, dropout: float):
        """
        Args:
            dim: Input and output width.
            hidden_dim: Inner width; when None it is derived as two thirds of
                ``4 * dim``, rounded up to a multiple of ``multiple_of``.
            multiple_of: Rounding granularity for the derived inner width.
            dropout: Dropout probability applied to the output.
        """
        super().__init__()
        if hidden_dim is None:
            two_thirds = int(2 * (4 * dim) / 3)
            # Ceil-divide, then scale back up so the width is a multiple of multiple_of.
            n_chunks = (two_thirds + multiple_of - 1) // multiple_of
            hidden_dim = multiple_of * n_chunks
        self.w1 = nn.Linear(dim, hidden_dim, bias=False)   # gate projection
        self.w2 = nn.Linear(hidden_dim, dim, bias=False)   # output projection
        self.w3 = nn.Linear(dim, hidden_dim, bias=False)   # value projection
        self.dropout = nn.Dropout(dropout)

    def SwiGLU(self, x: torch.Tensor) -> torch.Tensor:
        '''
        SwiGLU activation (see Section 2 in https://arxiv.org/abs/2204.02311):
        silu(w1(x)) multiplied element-wise by w3(x).
        '''
        gate = F.silu(self.w1(x))
        value = self.w3(x)
        return gate * value

    def forward(self, x):
        """Apply the gated projection, project back to ``dim``, then dropout."""
        hidden = self.SwiGLU(x)
        projected = self.w2(hidden)
        return self.dropout(projected)


class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with rotary position embeddings on q and k."""

    def __init__(self, config: ModelConfig):
        """Create the fused qkv projection, output projection, dropouts and rotary module from ``config``."""
        super().__init__()
        assert config.n_embd % config.n_heads == 0
        # One linear layer produces queries, keys and values for all heads at once.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # Projection applied after the heads are concatenated again.
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        # Regularization.
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_heads
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        self.rotary = RotaryEmbedding(config.rope_dim)
        # The fused attention kernel only exists in PyTorch >= 2.0.
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # Lower-triangular mask so a position only attends to itself and earlier positions.
            size = config.block_size
            causal = torch.tril(torch.ones(size, size))
            self.register_buffer("bias", causal.view(1, 1, size, size))

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, n_embd) and return a tensor of the same shape."""
        B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)
        head_size = C // self.n_head

        def _to_heads(t):
            # (B, T, C) -> (B, nh, T, hs)
            return t.view(B, T, self.n_head, head_size).transpose(1, 2)

        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        k = _to_heads(k)
        q = _to_heads(q)
        v = _to_heads(v)

        # apply RoPE, see https://arxiv.org/abs/2104.09864
        k = self.rotary.rotate_queries_or_keys(k)
        q = self.rotary.rotate_queries_or_keys(q)

        if self.flash:
            # Fused kernel; attention dropout is only active while training.
            drop_p = self.dropout if self.training else 0
            y = torch.nn.functional.scaled_dot_product_attention(
                q, k, v, attn_mask=None, dropout_p=drop_p, is_causal=True)
        else:
            # Manual path: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
            scale = 1.0 / math.sqrt(k.size(-1))
            att = (q @ k.transpose(-2, -1)) * scale
            att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)

        # Put the heads back side by side: (B, nh, T, hs) -> (B, T, C)
        y = y.transpose(1, 2).contiguous().view(B, T, C)

        # Output projection followed by residual dropout.
        return self.resid_dropout(self.c_proj(y))

class Block(nn.Module):
    """Pre-norm transformer block: RMSNorm -> attention and RMSNorm -> feed-forward, each with a residual connection."""

    def __init__(self, config):
        """Build both sub-layers and their norms from ``config``."""
        super().__init__()
        width = config.n_embd
        eps = config.layer_norm_eps
        self.rn_1 = RMSNorm(width, eps=eps)
        self.attn = CausalSelfAttention(config)
        self.rn_2 = RMSNorm(width, eps=eps)
        self.mlp = FeedForward(width, config.hidden_dim, config.multiple_of, config.dropout)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        attended = self.attn(self.rn_1(x))
        x = x + attended
        transformed = self.mlp(self.rn_2(x))
        return x + transformed

class GPT(nn.Module):
    """Decoder-only transformer language model.

    Pipeline: token embedding -> dropout -> ``n_layers`` Blocks -> RMSNorm -> LM head.
    No positional embedding is added here; each attention layer applies
    rotary embeddings to its own queries and keys.
    """

    def __init__(self, config: ModelConfig):
        """Build the model described by ``config`` and initialise its weights."""
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layers)]),
            ln_f = RMSNorm(config.n_embd, eps=config.layer_norm_eps)
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # Weight tying: the token embedding and the output projection share one matrix.
        self.transformer.wte.weight = self.lm_head.weight

        # init all weights
        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layers))

    def _init_weights(self, module):
        """Normal(0, 0.02) init for Linear/Embedding weights; zeros for Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Run the model on token ids.

        Args:
            idx: Integer tensor of shape (b, t) with t <= config.block_size.
            targets: Optional tensor of the same shape as ``idx``; entries equal
                to -1 are ignored by the loss.

        Returns:
            ``(logits, loss)``. With targets, logits cover every position
            (b, t, vocab_size) and loss is the mean cross-entropy. Without
            targets, logits cover only the last position (b, 1, vocab_size)
            and loss is None.
        """
        b, t = idx.size()
        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        x = self.transformer.drop(tok_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
            loss = None

        return logits, loss

    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        """Create an AdamW optimizer with weight decay on tensors of rank >= 2 only.

        Args:
            weight_decay: Decay coefficient for the decayed group.
            learning_rate: Initial learning rate for both groups.
            betas: AdamW beta coefficients.
            device_type: 'cuda' enables the fused AdamW kernel when this
                PyTorch build exposes it.

        Returns:
            The configured ``torch.optim.AdamW`` instance.
        """
        # Only parameters that require gradients are optimized.
        param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}
        # Rank >= 2 tensors (matmul weights, embeddings) decay; biases and norm gains do not.
        decay_params = [p for p in param_dict.values() if p.dim() >= 2]
        nodecay_params = [p for p in param_dict.values() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
        print(f"using fused AdamW: {use_fused}")

        return optimizer

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """Autoregressively sample ``max_new_tokens`` tokens after ``idx``.

        Runs without gradient tracking. The context is cropped using this
        model's own ``self.config.block_size`` rather than a module-level
        config object.

        Args:
            idx: (B, T) tensor of token ids forming the prompt.
            max_new_tokens: Number of tokens to append.
            temperature: Logits are divided by this value before softmax;
                the default 1.0 leaves them unchanged.
            top_k: When given, only the ``top_k`` most likely tokens can be
                sampled at each step; None samples from the full vocabulary.

        Returns:
            (B, T + max_new_tokens) tensor holding the prompt followed by the samples.
        """
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -self.config.block_size:]
            # get the predictions
            logits, _ = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            if temperature != 1.0:
                logits = logits / temperature
            if top_k is not None:
                # Mask everything below the k-th largest logit of each row.
                kth = torch.topk(logits, min(top_k, logits.size(-1))).values[:, [-1]]
                logits = logits.masked_fill(logits < kth, float('-inf'))
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [None]:
# model = GPT(n_layer= config.n_layers, n_head=config.n_heads, n_embd=config.n_embd, block_size=config.block_size,
#                   bias=config.bias, dropout= config.dropout)
# m= model.to(device)

In [27]:
# Build the model from the shared config and move it to the selected device.
# nn.Module.to() moves the module in place and returns it, so `m` and `model`
# are the same object.
model = GPT(config)
m = model.to(device)

In [28]:
# Count every element yielded by m.parameters() and report the total in millions.
n_params = sum(p.numel() for p in m.parameters())
print(n_params/1e6, 'M parameters')

152.749312 M parameters


In [29]:
# Restore model weights from an earlier run. map_location remaps the stored
# tensors onto the device chosen for this session, so a checkpoint written on
# a GPU also loads when only the CPU is available.
checkpoint = torch.load("/content/pretrained_model/checkpoint_iter_3600_3_99_v3.pth", map_location=device)

# Only the model weights are restored here; the optimizer state stored in the
# checkpoint is not loaded.
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [30]:
def save_checkpoint(model, optimizer, iter_num, filename='checkpoint.pth'):
    """Write a resumable training snapshot to ``filename``.

    The saved dict has three keys: 'iter_num', 'model_state_dict' and
    'optimizer_state_dict'.

    Args:
        model: Module whose ``state_dict()`` is stored.
        optimizer: Optimizer whose ``state_dict()`` is stored.
        iter_num: Iteration counter recorded with the snapshot.
        filename: Destination path handed to ``torch.save``.
    """
    snapshot = dict(
        iter_num=iter_num,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
    )
    torch.save(snapshot, filename)
    print(f"Checkpoint saved at iteration {iter_num}")

In [31]:
# Training-state counters. The schedule and the checkpoint numbering start
# from iteration 0 even though the weights were loaded from a checkpoint.
iter_num = 0
best_val_loss = 1e9

# initialize a GradScaler. If enabled=False scaler is a no-op
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))

# optimizer
# configure_optimizers compares device_type against 'cuda' to decide on the
# fused AdamW kernel, so pass the device *type* rather than the device string.
optimizer = model.configure_optimizers(config.weight_decay, config.learning_rate, config.betas, device_type=device_type)
# Drop the reference to the loaded checkpoint dict so its tensors can be freed.
checkpoint = None

unoptimized_model = model

num decayed parameter tensors: 41, with 152,698,880 parameters
num non-decayed parameter tensors: 33, with 50,176 parameters
using fused AdamW: True


In [32]:
@torch.no_grad()
def estimate_loss():
    """Average the loss over ``config.eval_iters`` random batches for each split.

    The model is put in eval mode for the measurement and switched back to
    train mode before returning.

    Returns:
        dict with keys 'train' and 'val', each a 0-dim tensor holding the mean loss.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        per_batch = torch.zeros(config.eval_iters)
        for step in range(config.eval_iters):
            inputs, labels = get_batch(split)
            _, batch_loss = model(inputs, labels)
            per_batch[step] = batch_loss.item()
        averages[split] = per_batch.mean()
    model.train()
    return averages

In [33]:
# learning rate scheduler with decay and linear warmup according to the GPT paper.
def get_lr(it):
    """Return the learning rate for iteration ``it``.

    Schedule, driven by the module-level ``config``:
      1. ``it < warmup_iters``: linear ramp from 0 towards ``learning_rate``.
      2. ``it >= lr_decay_iters``: constant ``min_lr``.
      3. otherwise: cosine decay from ``learning_rate`` down to ``min_lr``.

    Args:
        it: Zero-based training iteration.

    Returns:
        The learning rate as a float.
    """
    # 1) linear warmup
    if it < config.warmup_iters:
        return config.learning_rate * it / config.warmup_iters
    # 2) at or past the end of the decay window.
    #    `>=` rather than `>`: the cosine branch already evaluates to min_lr at
    #    it == lr_decay_iters, and returning here avoids a zero division when
    #    warmup_iters == lr_decay_iters.
    if it >= config.lr_decay_iters:
        return config.min_lr
    # 3) cosine decay in between
    decay_ratio = (it - config.warmup_iters) / (config.lr_decay_iters - config.warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # goes from 1 down to 0
    return config.min_lr + coeff * (config.learning_rate - config.min_lr)

In [34]:
# logging
if config.wandb_log:
    # Credentials must not live in the notebook. With no key argument,
    # wandb.login() uses the WANDB_API_KEY environment variable or a cached
    # login, and otherwise prompts interactively.
    # NOTE(review): the key that used to be hard-coded here has been exposed
    # and should be revoked.
    wandb.login()
    # ModelConfig stores its settings as class attributes, so the instance's
    # own __dict__ is empty. Collect the values explicitly so the run records
    # the hyperparameters.
    run_config = {
        name: getattr(config, name)
        for name in dir(config)
        if not name.startswith('_') and not callable(getattr(config, name))
    }
    wandb.init(project=config.wandb_project, name=config.wandb_run_name, config=run_config)



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [35]:
# Release cached CUDA blocks held by PyTorch's allocator, then run a Python
# garbage-collection pass (its return value is the cell output).
torch.cuda.empty_cache()
gc.collect()

2484

In [None]:
# training loop
# Relies on names created in earlier cells: model, optimizer, scaler, iter_num,
# config, out_dir, get_batch, get_lr, estimate_loss and save_checkpoint.
X, Y = get_batch('train') # fetch the very first batch
t0 = time.time()
local_iter_num = 0 # number of iterations in the lifetime of this process
raw_model = model  # alias only; not read anywhere else in this cell

# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.8, patience=2, verbose=True)

while True:
    # determine and set the learning rate for this iteration
    lr = get_lr(iter_num) if config.decay_lr else config.learning_rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # evaluate the loss on train/val sets and log it (checkpoints are written further down)
    if iter_num % config.eval_interval == 0:
        losses = estimate_loss()
        current_lr = optimizer.param_groups[0]['lr']
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        # Wandb logging
        # scheduler.step(losses['val'])
        if config.wandb_log:
            wandb.log({
                "iter": iter_num,
                "train/loss": losses['train'],
                "val/loss": losses['val'],
                "lr": current_lr,
            })
    # eval_only: stop right after the first evaluation, before any weight update
    if iter_num == 0 and config.eval_only:
        break

    # Accumulate gradients over several micro-batches before one optimizer step.
    for micro_step in range(config.gradient_accumulation_steps):
        # with ctx:
        #     logits, loss = model(X, Y)
        #     loss = loss / config.gradient_accumulation_steps
        # The forward pass runs outside any autocast context (the `ctx` variant above is disabled).
        logits, loss = model(X, Y)
        # Divide so the accumulated gradient is the mean over the micro-batches.
        loss = loss / config.gradient_accumulation_steps

        # Fetch the next batch before backward(); it is consumed by the next forward pass.
        X, Y = get_batch('train')
        # scaler.scale() is a pass-through when the scaler was created with enabled=False.
        scaler.scale(loss).backward()

    # Gradient Clipping
    if config.grad_clip != 0.0:
        # Unscale first so the clipping threshold applies to the true gradient norm.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip)

    # Optimizer step through the scaler, then update the scale factor and clear the gradients.
    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad(set_to_none=True)

    # Periodic checkpoint; iter_num == 0 satisfies the condition, so one is written after the first step too.
    if iter_num % config.save_checkpoint_iters == 0:
      save_checkpoint(model, optimizer, iter_num, filename=os.path.join(out_dir, f'checkpoint_iter_{iter_num}.pth'))


    # timing and logging
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % config.log_interval == 0:
        current_lr = optimizer.param_groups[0]['lr']
        # `loss` is the last micro-batch's loss divided by the accumulation steps; undo that scaling for display.
        lossf = loss.item() * config.gradient_accumulation_steps
        print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, lr {current_lr:.6f}")
    iter_num += 1
    local_iter_num += 1

    # Stop once iter_num exceeds max_iters (iterations 0..max_iters inclusive have run).
    if iter_num > config.max_iters:
        break

In [37]:
# GPT-2 tokenizer from tiktoken; used below to decode sampled token ids back into text.
enc = tiktoken.get_encoding("gpt2")

In [None]:
# Start from a single sequence containing only token id 0 and sample 2000 more tokens.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
# [0] selects the only sequence in the batch; tolist() turns it into plain Python ints.
generated_tokens = model.generate(context, max_new_tokens=2000)[0].tolist()


In [38]:
# Turn the sampled ids back into text and print it, labelled with the current iter_num.
generated_text = enc.decode(generated_tokens)
print(f"Generated text at step {iter_num}: {generated_text}")

Generated text at step 0: !” immediately, but that’s the worst so far I want it to lose grip. Acquiring anymore is no less work than under winner!

Lastly, had you been writing this below via the Future’s website of Williams’s 10 Days 100, but have you devour Little Right! Could you man for “Termin innuindling” ” blends In The Pistols On the Machine? Send it with your favorite experience!

Food Many rejection rides happening…

 Property Degree: The movement ofIAL agreements between more and more female mocked stecs

A friend at an oiances for aは and cs would begin to enjoy some tabs before spending time. I love widespread, emotional content, and at times have invested in how to lend our course of 21-hour work together. This is, however, a remarkable crop death group in Project Eful lineup. With my followers, propaganda, and participation time, there was a completely different globe, from the amazing World War II Show! We live in the midst of new time, though we speak for ourselves, jus