In [3]:
import argparse
import torch
from transformers import AutoModelForCausalLM, default_data_collator, AutoTokenizer
from peft import LoraConfig, get_peft_model
from accelerate import Accelerator
from accelerate.utils import GradScalerKwargs
import wandb
from torch.utils.data import DataLoader, Dataset
import random
import numpy as np
from typing import Optional

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Helper functions

def count_parameters(model):
    """Count the parameters of ``model``.

    Args:
        model: Any object exposing ``parameters()`` that yields items with a
            ``numel()`` method and a ``requires_grad`` attribute.

    Returns:
        A ``(total_params, trainable_params)`` tuple, where the second entry
        counts only parameters whose ``requires_grad`` is truthy.
    """
    total_params = 0
    trainable_params = 0
    for param in model.parameters():
        size = param.numel()
        total_params += size
        if param.requires_grad:
            trainable_params += size
    return total_params, trainable_params


def seed_all(seed):
    """Seed every random number generator this notebook relies on.

    Applies ``seed`` to Python's ``random`` module, NumPy's global RNG,
    PyTorch's default generator, and PyTorch's CUDA generators, in that order.

    Args:
        seed: Seed value passed unchanged to each seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

## Get the training code

In [6]:
# Fetch the training code: clones the repository into ./lazy-pretrain,
# relative to the notebook's current working directory.
!git clone https://github.com/EfficientLLMs/lazy-pretrain.git

Cloning into 'lazy-pretrain'...
remote: Enumerating objects: 1035, done.[K
remote: Counting objects: 100% (493/493), done.[K
remote: Compressing objects: 100% (193/193), done.[K
remote: Total 1035 (delta 405), reused 381 (delta 299), pack-reused 542 (from 1)[K
Receiving objects: 100% (1035/1035), 35.89 MiB | 18.02 MiB/s, done.
Resolving deltas: 100% (785/785), done.


In [7]:
# Update the cloned repository. `!` commands run in the notebook's current
# working directory, and the `%cd lazy-pretrain/` cell only comes later, so
# point git at the clone explicitly instead of pulling in whatever directory
# the notebook happens to live in.
!git -C lazy-pretrain pull

remote: Enumerating objects: 17, done.[K
remote: Counting objects:   5% (1/17)[Kremote: Counting objects:  11% (2/17)[Kremote: Counting objects:  17% (3/17)[Kremote: Counting objects:  23% (4/17)[Kremote: Counting objects:  29% (5/17)[Kremote: Counting objects:  35% (6/17)[Kremote: Counting objects:  41% (7/17)[Kremote: Counting objects:  47% (8/17)[Kremote: Counting objects:  52% (9/17)[Kremote: Counting objects:  58% (10/17)[Kremote: Counting objects:  64% (11/17)[Kremote: Counting objects:  70% (12/17)[Kremote: Counting objects:  76% (13/17)[Kremote: Counting objects:  82% (14/17)[Kremote: Counting objects:  88% (15/17)[Kremote: Counting objects:  94% (16/17)[Kremote: Counting objects: 100% (17/17)[Kremote: Counting objects: 100% (17/17), done.[K
remote: Compressing objects:  50% (1/2)[Kremote: Compressing objects: 100% (2/2)[Kremote: Compressing objects: 100% (2/2), done.[K
remote: Total 9 (delta 7), reused 9 (delta 7), pack-reused 0 (from 0

In [1]:
# Move the kernel's working directory into the cloned repo; the cells below
# use repo-relative paths (data/..., src/..., models/...).
%cd lazy-pretrain/

/home/vmasti/efficient-llms-capstone/lazy-pretrain


## Download and use the pre-tokenized data with the same order
EleutherAI provides a pre-tokenized version of the standard (non-deduplicated) Pile dataset, already shuffled into the order used to train Pythia. The dataset contains only token IDs. [link](https://huggingface.co/datasets/EleutherAI/pile-standard-pythia-preshuffled/tree/main)

The whole dataset has about 300B tokens. Shards `00.bin` to `19.bin` are about 30 GB each, while the last one, `20.bin` (`document-00020-of-00020.bin`), is only 78.3 MB, so we download only the last one.


1. Clone the repository without downloading the large LFS data files


In [14]:
# Clone the dataset repo into data/ with GIT_LFS_SKIP_SMUDGE=1 set for the clone.
# `mkdir -p` keeps this cell re-runnable: plain `mkdir data` fails when data/
# already exists, and the `&&` chain would then skip the clone entirely.
!mkdir -p data && cd data && GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datasets/EleutherAI/pile-standard-pythia-preshuffled

Cloning into 'pile-standard-pythia-preshuffled'...
remote: Enumerating objects: 30, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 30 (delta 1), reused 0 (delta 0), pack-reused 3 (from 1)[K
Unpacking objects: 100% (30/30), 4.99 KiB | 852.00 KiB/s, done.


2. Download only the last file, which has 39_168_000 tokens


In [15]:
# Fetch the real contents of just the final shard; --include limits the LFS
# pull to document-00020-of-00020.bin.
!cd data/pile-standard-pythia-preshuffled && git lfs pull --include="document-00020-of-00020.bin"



3. Get all the tokens from the last file. We can use it to get the last 5M tokens.

In [5]:
filename = "data/pile-standard-pythia-preshuffled/document-00020-of-00020.bin"
# Memory-map the shard as a flat array of uint16 values rather than reading it
# into RAM. np.memmap defaults to mode="r+" (read-write), which needs write
# permission and would let an accidental assignment modify the dataset file,
# so open it read-only.
tokens = np.memmap(filename, dtype=np.uint16, mode="r")
# Number of uint16 entries in the shard (displayed as the cell output).
len(tokens)

39168000

In [13]:
# Use the %pip magic rather than !pip so the package is installed into the
# environment of the running kernel, not whichever pip is first on PATH.
# NOTE(review): ai2-olmo is not imported in this notebook; presumably a
# dependency of the lazy-pretrain scripts — confirm and pin a version.
%pip install ai2-olmo



## Grow the 410m model to 1.4b

In [6]:
# Run the repo's growth script on pythia-410m, requesting depth 24 and width
# 2048, and write the result to models/pythia-410m-to-pythia-1.4b.
# The logged output reports the original model as 24 layers / 1024 width and
# the grown model as 24 layers / 2048 width.
# NOTE(review): --attn_heads is 16 here, but the printed grown config shows
# "num_attention_heads": 32 — confirm which value is intended.
!python src/grow/grow.py \
    --small_model "pythia-410m" \
    --large_depth 24 \
    --large_width 2048 \
    --depth_growth "alternate" \
    --attn_heads 16 \
    --output_dir "models/pythia-410m-to-pythia-1.4b"

Original model: 24 layers, 1024 width
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Finish the following sentence:
Raindrops on roses,

A:

I think

Expanding model of 1024 width to 2048 width
attention ratio: 1.0
Grown model: 24 layers, 2048 width
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Finish the following sentence:
Raindrops on roses,

A:

I think

Grown model config: GPTNeoXConfig {
  "_name_or_path": "EleutherAI/pythia-410m-expand-width-2048",
  "architectures": [
    "GPTNeoXForCausalLM"
  ],
  "attention_bias": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.1,
  "eos_token_id": 0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neox",
  "num_attention_heads": 32,
  "num_hidden_layers": 24,
  "partial_rotary_factor": 0.25,
  "rop

# Pre-training a 1.4B model with limited compute using a 410m model

In [8]:
# Train the grown model with src/pretrain/pretrain_lora.py (LoRA rank 256,
# alpha 256) and log the run to Weights & Biases.
# The logged run reports 306 total batches on a single GPU for
# --num_tokens 10_000_000 with --batch_size 32 and --chunk_size 1024.
# NOTE(review): --first_idx/--last_idx presumably select a range of the
# pre-tokenized shards; only document-00020 is downloaded in this notebook —
# confirm that shard 19 is not required.
# NOTE(review): --wandb_entity is a personal account; change it before running
# this elsewhere.

!python src/pretrain/pretrain_lora.py \
    --grown_model "models/pythia-410m-to-pythia-1.4b" \
    --tokenizer "EleutherAI/pythia-70m" \
    --seed 1234 \
    --rank 256 \
    --lora_alpha 256 \
    --batch_size 32 \
    --lr 1e-5 \
    --output_dir "models/pythia-410m-to-pythia-1.4b-lora-10m" \
    --dataset 'pile' \
    --num_tokens 10_000_000 \
    --chunk_size 1024 \
    --use_on_the_fly \
    --first_idx 19 \
    --last_idx 20 \
    --wandb_entity vibhamasti \
    --wandb_run_name "pythia-410-1.4b-10m-tokens"


device: cuda

Training configuration:
Total batches: 306
Number of GPUs: 1
Steps per GPU: 306
Total steps: 306
[34m[1mwandb[0m: Currently logged in as: [33mvibhamasti[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.19.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.16.5
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/home/vmasti/efficient-llms-capstone/lazy-pretrain/wandb/run-20250515_011333-ixiw0xmc[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mpythia-410-1.4b-10m-tokens[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/vibhamasti/lora-pretraining[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/vibhamasti/lora-pretraining/runs/ixiw0xmc/workspace[0m
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:05<00:00,  2.61s/it]
M