In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q textstat evaluate

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q rouge_score

In [None]:
import torch
import torch.nn as nn
from datasets import load_dataset, Dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM, get_scheduler, DataCollatorWithPadding, AutoConfig
from tqdm import tqdm
from torch.optim import AdamW
import os
import json
import nltk
import re
import textstat
from datasets import Dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
import evaluate
import random

In [None]:
# Shared hyperparameter table for the notebook.
# NOTE(review): later cells in this notebook also hard-code some of these values
# (e.g. lr=4e-4, max_epochs = 10, ACCUMULATION_STEPS = 4); keep them in sync with
# this dict or read them from here.
CONFIG = {
    # Seed passed to set_seed() before each experiment.
    "SEED": 42,
    # Maximum number of tokens per packed training block.
    "BLOCK_SIZE": 1024,
    # Sequences per DataLoader batch (micro-batch size).
    "BATCH_SIZE": 2,
    "LEARNING_RATE": 4e-4,
    "MAX_EPOCHS": 10,
    # Micro-batches accumulated before one optimizer update.
    "ACCUMULATION_STEPS": 4,
    "DEVICE": torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    # NOTE(review): not read anywhere in the visible cells; the training cells use
    # their own CHECKPOINT_DIR values instead.
    "CHECKPOINT_DIR_BASE": "/kaggle/working/model_runs",
    # NOTE(review): not read anywhere in the visible cells; meaning unclear — confirm or remove.
    "DIR_WEIGHT": 5e-7
}

def set_seed(seed_value):
    """Seed Python's `random`, NumPy and PyTorch (plus every CUDA device when available).

    Args:
        seed_value: integer seed applied to all generators.
    """
    # Same order as before: stdlib, NumPy, then the PyTorch CPU generator.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)
    print(f"Seed set to {seed_value}")

In [None]:
set_seed(CONFIG["SEED"])

In [None]:
def _read_jsonl(path):
    """Read a JSON-Lines file and return its records as a list.

    Args:
        path: path to a UTF-8 text file with one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.
    """
    records = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # json.loads raises on empty input, so skip blank or trailing lines.
            if line.strip():
                records.append(json.loads(line))
    return records


train_groups = _read_jsonl("/kaggle/input/babylm/train_dataset.jsonl")
val_groups = _read_jsonl("/kaggle/input/babylm/val_dataset.jsonl")
test_groups = _read_jsonl("/kaggle/input/babylm/test_dataset.jsonl")

In [None]:
# Sentence-splitter data used by nltk.tokenize.sent_tokenize.
# Newer NLTK releases look for 'punkt_tab' instead of the legacy 'punkt' pickle,
# so fetch both to work across versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# Raw corpus files of the 10M-word training set, keyed by corpus name.
FILEPATHS = {
    "childes":       "/kaggle/input/train-10m/train_10M/childes.train",
    "gutenberg":     "/kaggle/input/train-10m/train_10M/gutenberg.train",
    "open_subtitles":"/kaggle/input/train-10m/train_10M/open_subtitles.train",
    "simple_wiki":   "/kaggle/input/train-10m/train_10M/simple_wiki.train",
    "switchboard":   "/kaggle/input/train-10m/train_10M/switchboard.train",
    "bnc_spoken":    "/kaggle/input/train-10m/train_10M/bnc_spoken.train",
}
TOKENIZER_NAME = "HuggingFaceTB/SmolLM2-135M"
# Taken from CONFIG (1024 and 2) so the values are defined in one place only.
BLOCK_SIZE = CONFIG["BLOCK_SIZE"]
BATCH_SIZE = CONFIG["BATCH_SIZE"]

In [None]:
from tqdm import tqdm
import copy

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

# Sentences are terminated with EOS only when the tokenizer actually defines one.
# (These were two statements fused on a single line, which is a SyntaxError.)
eos_id = tokenizer.eos_token_id
USE_EOS = eos_id is not None

def pack_paragraph_sentences(paragraph_text, tokenizer, block_size, use_eos=True, eos_id=None):
    """Greedily pack the sentences of one paragraph into token blocks.

    The paragraph is split with NLTK's sentence tokenizer; each sentence is
    tokenized with a leading space and, when `use_eos` is set and `eos_id` is
    not None, terminated with `eos_id`. Whole sentences are appended to the
    current block while they fit. A sentence longer than `block_size` flushes
    the current block and is cut into consecutive `block_size` chunks.

    Args:
        paragraph_text: text of a single paragraph.
        tokenizer: callable returning an object with an `input_ids` list.
        block_size: maximum number of token ids per block.
        use_eos: whether to append `eos_id` after every sentence.
        eos_id: id of the end-of-sequence token, or None to disable.

    Returns:
        A list of blocks, each a non-empty list of at most `block_size` token ids.
    """
    append_eos = use_eos and eos_id is not None
    packed = []
    pending = []

    for sentence in nltk.tokenize.sent_tokenize(paragraph_text):
        # The leading space keeps GPT-2 style tokenizers on word-boundary tokens.
        token_ids = tokenizer(" " + sentence, add_special_tokens=False).input_ids
        if append_eos:
            token_ids = token_ids + [eos_id]

        if len(token_ids) > block_size:
            # Over-long sentence: close the open block, then emit fixed-size chunks.
            # The trailing EOS (if any) naturally ends up in the last chunk.
            if pending:
                packed.append(pending)
                pending = []
            packed.extend(
                token_ids[start:start + block_size]
                for start in range(0, len(token_ids), block_size)
            )
        elif len(pending) + len(token_ids) > block_size:
            # Sentence does not fit any more: start a fresh block with it.
            packed.append(pending)
            pending = token_ids.copy()
        else:
            pending.extend(token_ids)

    if pending:
        packed.append(pending)
    return packed

# Tokenize and pack every training group into blocks of at most BLOCK_SIZE tokens.
blocks = []
for group in tqdm(train_groups, desc="Tokenizing groups"):
    # Paragraphs are packed independently so separate documents are never joined.
    for paragraph in group["text"].split("\n\n"):
        if paragraph.strip():
            blocks.extend(
                pack_paragraph_sentences(
                    paragraph, tokenizer, BLOCK_SIZE, use_eos=USE_EOS, eos_id=eos_id
                )
            )

# Sanity checks: no empty blocks, nothing longer than BLOCK_SIZE.
blocks = [b for b in blocks if b]
assert all(1 <= len(b) <= BLOCK_SIZE for b in blocks)

# Separate list objects per column so inputs and labels never alias each other.
dataset_dict = {
    "input_ids": [list(b) for b in blocks],
    "attention_mask": [[1] * len(b) for b in blocks],
    "labels": [list(b) for b in blocks],
}

print(f"Total train blocks: {len(blocks)}")
train_dataset = Dataset.from_dict(dataset_dict)
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


In [None]:
def build_split_dataset(groups, split_name):
    """Tokenize, pack and wrap one data split as a torch-formatted `Dataset`.

    Replaces the previously copy-pasted val/test pipelines with a single code path.

    Args:
        groups: iterable of dicts, each with a "text" entry.
        split_name: label used in the progress printout (e.g. "val", "test").

    Returns:
        A `Dataset` with "input_ids", "attention_mask" and "labels" columns,
        where labels are a copy of the input ids.
    """
    split_blocks = []
    for grp in tqdm(groups, desc="Tokenizing groups"):
        # Split into paragraphs to avoid joining separate documents.
        for p in grp["text"].split("\n\n"):
            if not p.strip():
                continue
            split_blocks.extend(
                pack_paragraph_sentences(p, tokenizer, BLOCK_SIZE, use_eos=USE_EOS, eos_id=eos_id)
            )

    # Sanity filters: drop empty blocks and enforce the maximum length.
    split_blocks = [b for b in split_blocks if len(b) > 0]
    assert all(1 <= len(b) <= BLOCK_SIZE for b in split_blocks)

    print(f"Total {split_name} blocks: {len(split_blocks)}")

    dataset = Dataset.from_dict({
        "input_ids": [b.copy() for b in split_blocks],
        "attention_mask": [[1] * len(b) for b in split_blocks],
        "labels": [b.copy() for b in split_blocks],
    })
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    return dataset


val_dataset = build_split_dataset(val_groups, "val")
test_dataset = build_split_dataset(test_groups, "test")

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM

def _make_diff_forward(orig_forward, alpha, beta, gamma):
    """Wrap a block's forward so its output is mixed with its sequence differences.

    With `cur` the block's output hidden state (indexed as batch, position, feature),
    the wrapped forward returns

        alpha * cur + beta * (cur - prev) + gamma * ((cur - prev) - (prev - prev2))

    where `prev` / `prev2` are `cur` shifted right by one / two positions with zero fill.
    """
    def patched_forward(hidden_states, *args, **kwargs):
        outputs = orig_forward(hidden_states, *args, **kwargs)
        # Blocks normally return a tuple whose first item is the hidden state;
        # a bare tensor return is also accepted and passed back as a tensor.
        is_tensor = torch.is_tensor(outputs)
        cur = outputs if is_tensor else outputs[0]

        # h_{t-1} and h_{t-2}: right-shift along the sequence axis, zero padded.
        # NOTE(review): the shift only sees the positions passed in this call. If
        # generation with a KV cache feeds one token at a time, prev/prev2 would be
        # all zeros there and differ from training — confirm before relying on generate().
        prev = torch.cat([torch.zeros_like(cur[:, :1, :]), cur[:, :-1, :]], dim=1)
        prev2 = torch.cat([torch.zeros_like(cur[:, :2, :]), cur[:, :-2, :]], dim=1)

        vel = cur - prev             # first difference
        acc = vel - (prev - prev2)   # second difference

        new_hidden = alpha * cur + beta * vel + gamma * acc
        if is_tensor:
            return new_hidden
        # Keep the remaining outputs of the original block untouched.
        return (new_hidden, *outputs[1:])
    return patched_forward


class FullDiffWrapper(nn.Module):
    """Causal LM whose every transformer block output is mixed with its first and
    second differences along the sequence axis.

    Each block in `self.base.transformer.h` gets three trainable one-element
    parameters: `alpha` (initialised near 1.0), `beta` and `gamma` (initialised
    near 0.0). The block's `forward` is monkey-patched to apply the mix (see
    `_make_diff_forward`). The parameters are registered on the blocks, so they
    appear in `self.base`'s parameters and state dict.
    """

    def __init__(self, cfg):
        """Build a randomly initialised model from `cfg` and patch all its blocks.

        Args:
            cfg: model config passed to `AutoModelForCausalLM.from_config`; the
                resulting model must expose its blocks as `transformer.h`, each
                with an `ln_1` layer (used to pick dtype and device).
        """
        super().__init__()
        # Built from the config only: no pretrained weights are loaded.
        self.base = AutoModelForCausalLM.from_config(cfg)

        for block in self.base.transformer.h:
            dtype = block.ln_1.weight.dtype
            device = block.ln_1.weight.device

            # alpha ~ 1.0 keeps the block close to its unpatched output at init;
            # beta and gamma start near 0.0 with small random noise.
            block.alpha = nn.Parameter(torch.randn(1, dtype=dtype, device=device) * 0.02 + 1.0)
            block.beta = nn.Parameter(torch.randn(1, dtype=dtype, device=device) * 0.02)
            block.gamma = nn.Parameter(torch.randn(1, dtype=dtype, device=device) * 0.02)

            # The factory binds this block's own forward and parameters, so the
            # patched function does not depend on the loop variables.
            block.forward = _make_diff_forward(block.forward, block.alpha, block.beta, block.gamma)

    def forward(self, input_ids=None, inputs_embeds=None, **kwargs):
        """Delegate to the wrapped LM; the mixing happens inside each patched block."""
        return self.base(input_ids=input_ids,
                         inputs_embeds=inputs_embeds,
                         **kwargs)

In [None]:
from transformers import GPT2Config
VOCAB_SIZE = len(tokenizer)  # tokenizer size at this point; a pad token added later triggers an embedding resize

# Small GPT-2 architecture used by every experiment in this notebook.
# Despite the `smollm_` prefix this is a GPT2Config; only the tokenizer
# (TOKENIZER_NAME) comes from SmolLM2.
smollm_config = GPT2Config(
    vocab_size=VOCAB_SIZE,  # Use the true vocabulary size
    n_positions=BLOCK_SIZE,   # CORRECT: Match the max sequence length from your data
    n_embd=192,
    n_layer=6,  # the regularized training cell defines exactly 6 per-layer weights
    n_head=6,
)

In [None]:
newmodel = FullDiffWrapper(smollm_config)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    newmodel.base.resize_token_embeddings(len(tokenizer))

In [None]:
len(newmodel.base.transformer.h)

In [None]:
class DataCollatorWithPaddingAndLabels(DataCollatorWithPadding):
    """Pad `input_ids`/`attention_mask` with the tokenizer and pad `labels` with -100.

    The parent collator pads the model inputs; labels are padded separately so
    padded positions carry the loss ignore index (-100).
    """

    def __call__(self, features):
        """Collate a list of feature dicts into one padded batch.

        Args:
            features: list of dicts with "input_ids", "attention_mask" and "labels".

        Returns:
            The padded batch from the parent collator plus a "labels" tensor of
            the same width as "input_ids", filled with -100 at padded positions.
        """
        # as_tensor avoids the copy-construct warning when labels already are tensors.
        labels = [torch.as_tensor(f["labels"], dtype=torch.long) for f in features]

        # Labels are removed before the parent call and padded separately below.
        features_for_pad = [{k: v for k, v in f.items() if k != "labels"} for f in features]
        batch = super().__call__(features_for_pad)

        # Pad labels to the width and on the side the tokenizer used for input_ids,
        # so labels stay aligned with their tokens even for left-padding tokenizers.
        padded_len = batch["input_ids"].shape[1]
        pad_left = getattr(self.tokenizer, "padding_side", "right") == "left"
        padded_labels = torch.full((len(labels), padded_len), -100, dtype=torch.long)
        for row, lab in enumerate(labels):
            n = lab.numel()
            if pad_left:
                padded_labels[row, padded_len - n:] = lab
            else:
                padded_labels[row, :n] = lab

        batch["labels"] = padded_labels
        return batch


data_collator = DataCollatorWithPaddingAndLabels(tokenizer=tokenizer, return_tensors="pt")

# Training batches are reshuffled every epoch; validation keeps dataset order.
# Both loaders use 4 worker processes and pinned (page-locked) host memory.
train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True,
    num_workers=4, pin_memory=True, collate_fn=data_collator,
)
val_loader = DataLoader(
    val_dataset, batch_size=BATCH_SIZE, shuffle=False,
    num_workers=4, pin_memory=True, collate_fn=data_collator,
)

In [None]:
def evaluate_on_val(model, val_loader, device):
    """Compute token-weighted mean loss and perplexity over a validation loader.

    The model is put in eval mode for the pass and its previous train/eval mode
    is restored afterwards, so calling this in the middle of a training epoch
    does not leave dropout disabled for the rest of that epoch.

    Args:
        model: module called as model(input_ids=..., attention_mask=..., labels=...)
            and returning an object with a scalar `.loss`.
        val_loader: iterable of batches with "input_ids", "attention_mask", "labels".
        device: device the batch tensors are moved to.

    Returns:
        (avg_loss, perplexity) as Python floats.
    """
    was_training = model.training
    model.eval()
    total_loss = 0.0
    total_tokens = 0
    try:
        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validating"):
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                # The causal-LM loss is averaged over labels shifted by one position,
                # so only labels[:, 1:] contribute; weight the batch by that count.
                active_tokens = (labels[:, 1:] != -100).sum().item()
                if active_tokens == 0:
                    # No scored token: the batch loss would be undefined, skip it.
                    continue

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                total_loss += outputs.loss.item() * active_tokens
                total_tokens += active_tokens
    finally:
        # Restore whichever mode the caller had before validation.
        model.train(was_training)

    # The epsilon guards against an empty loader.
    avg_loss = total_loss / (total_tokens + 1e-12)
    perplexity = torch.exp(torch.tensor(avg_loss))
    print(f"Validation loss: {avg_loss:.4f}, Perplexity: {perplexity:.4f}")
    return avg_loss, perplexity.item()

In [None]:
def save_checkpoint(save_dir, model, tokenizer, optimizer, scheduler, scaler, epoch, step, best_val=None, keep=5):
    """Save the HF model/tokenizer plus a resumable training-state checkpoint.

    Writes two things under `save_dir`:
      * `hf_base/`: `save_pretrained` output of the model and tokenizer
        (overwritten on every call);
      * `ckpt_step_<step>.pt`: alpha/beta/gamma tensors, optimizer, scheduler
        and scaler state, counters and RNG states. It is written to a temporary
        file first and then moved into place.

    Works for a wrapper exposing the HF model as `.base` and for a plain HF
    model that has `save_pretrained` itself.

    NOTE(review): `hf_base` stores weights only. Reloading it with a stock
    `from_pretrained` presumably does not re-apply the patched block forwards —
    confirm before evaluating wrapper runs from that directory.

    Args:
        save_dir: directory receiving the checkpoint files (created if missing).
        model: wrapper with a `.base` HF model, or an HF model.
        tokenizer: tokenizer saved next to the model.
        optimizer, scheduler, scaler: objects exposing `state_dict()`.
        epoch, step: counters stored in the checkpoint; `step` names the file.
        best_val: best validation loss so far, stored as-is.
        keep: currently unused (no pruning of older checkpoints is performed).
    """
    hf_dir = os.path.join(save_dir, "hf_base")
    os.makedirs(hf_dir, exist_ok=True)  # also creates save_dir

    # Unwrap when the HF model lives in `.base`; otherwise save the model itself.
    hf_model = getattr(model, "base", model)
    hf_model.save_pretrained(hf_dir)
    tokenizer.save_pretrained(hf_dir)

    # Per-block mixing parameters, selected by name from the full state dict.
    diff = {k: v for k, v in model.state_dict().items() if any(x in k for x in ("alpha", "beta", "gamma"))}

    ckpt = {
        "diff": diff,
        "optimizer": optimizer.state_dict(),
        "scheduler": scheduler.state_dict(),
        "scaler": scaler.state_dict(),
        "epoch": epoch,
        "step": step,
        "best_val": best_val,
        "rng": {
            "torch": torch.get_rng_state(),
            "cuda": torch.cuda.get_rng_state_all() if torch.cuda.is_available() else None,
            "numpy": np.random.get_state(),
            "random": random.getstate(),
        }
    }
    fname = f"ckpt_step_{step:07d}.pt"
    tmp_path = os.path.join(save_dir, "tmp_" + fname)
    final_path = os.path.join(save_dir, fname)
    # Write-then-rename so an interrupted save never leaves a truncated checkpoint.
    torch.save(ckpt, tmp_path)
    os.replace(tmp_path, final_path)
    print(f"Saved checkpoint {final_path}")

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
from torch.cuda.amp import autocast, GradScaler
EVALUATE_EVERY_N_STEPS = 5000
CHECKPOINT_DIR = "/kaggle/working/my_model_checkpoints"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Make sure a pad token exists; embeddings are resized only if this call added one.
special_tokens = {'pad_token': '[PAD]'}
num_added = tokenizer.add_special_tokens(special_tokens)
if num_added > 0:
    newmodel.base.resize_token_embeddings(len(tokenizer))

print("Tokenizer vocab size:", len(tokenizer))
print("Embedding weight shape:", newmodel.base.get_input_embeddings().weight.shape)

newmodel.float()
newmodel.to(device)
max_epochs = 10
ACCUMULATION_STEPS = 4
steps_per_epoch = len(train_loader)
# The scheduler is stepped once per optimizer update (every ACCUMULATION_STEPS
# micro-batches, plus one flush at the end of an epoch), so its horizon must be
# counted in optimizer updates. Counting micro-batches made warmup 4x too long
# and the cosine decay never finished.
optimizer_steps_per_epoch = (steps_per_epoch + ACCUMULATION_STEPS - 1) // ACCUMULATION_STEPS
total_steps = optimizer_steps_per_epoch * max_epochs

optimizer = AdamW(newmodel.parameters(), lr=4e-4)
scheduler = get_scheduler(
    name="cosine",
    optimizer=optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)
scaler = GradScaler()
step = 0   # micro-batches processed so far (all epochs)
epoch = 0  # defined up front so the `finally` checkpoint works even if no epoch starts
best_val = float('inf')

# ===================================================================
# 2. INJECTED REGULARIZATION HYPERPARAMETERS
# ===================================================================
# Weight of the acceleration (second-difference) penalty added to the LM loss.
accel_weight = 0.1

# One weight per entry of outputs.hidden_states[1:] for the 6-layer model.
layer_weights = torch.tensor([1.5, 1.3, 1.1, 1.0, 0.8, 0.6], device=device)
# Constant normaliser, computed once instead of on every micro-batch.
layer_weight_total = float(layer_weights.sum())


# ===================================================================
# 3. MODIFIED TRAINING LOOP
# ===================================================================
try:
    optimizer.zero_grad()
    for epoch in range(max_epochs):
        newmodel.train()
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}")
        for batch_idx, batch in enumerate(pbar):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            with autocast(dtype=torch.float16):
                outputs = newmodel(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels,
                    output_hidden_states=True  # needed for the penalty below
                )

                # Losses are combined in float32 for stability.
                main_loss = outputs.loss.float()
                accel_loss = torch.tensor(0.0, device=device)

                if accel_weight > 0 and outputs.hidden_states:
                    # The first hidden state is the embedding output, so skip it.
                    transformer_layer_outputs = outputs.hidden_states[1:]
                    total_weighted_penalty = torch.tensor(0.0, device=device)

                    for i, hidden_states_for_layer in enumerate(transformer_layer_outputs):
                        hidden = hidden_states_for_layer.float()

                        # A second difference needs at least 3 positions.
                        if hidden.shape[1] < 3:
                            continue

                        # Second difference along the sequence: h[t+2] - 2*h[t+1] + h[t].
                        # NOTE(review): padded positions are included in this mean;
                        # consider masking with attention_mask.
                        acceleration = hidden[:, 2:, :] - 2 * hidden[:, 1:-1, :] + hidden[:, :-2, :]

                        # Squared L2 norm per position, averaged over batch and sequence.
                        layer_penalty = torch.mean(torch.norm(acceleration, p=2, dim=2)**2)
                        total_weighted_penalty += layer_weights[i] * layer_penalty

                    # Normalise to a weighted average over layers.
                    if layer_weight_total > 0:
                        accel_loss = total_weighted_penalty / layer_weight_total

                total_batch_loss = main_loss + accel_weight * accel_loss
                loss_to_backward = total_batch_loss / ACCUMULATION_STEPS

            scaler.scale(loss_to_backward).backward()

            # Update after every full accumulation group and once more on the last
            # batch of the epoch, so leftover gradients are never carried over.
            is_last_batch = (batch_idx + 1) == steps_per_epoch
            if (batch_idx + 1) % ACCUMULATION_STEPS == 0 or is_last_batch:
                scaler.step(optimizer)
                scaler.update()
                scheduler.step()
                optimizer.zero_grad()

            step += 1
            pbar.set_postfix(loss=main_loss.item(), accel_loss=accel_loss.item(), lr=scheduler.get_last_lr()[0])

            if step % EVALUATE_EVERY_N_STEPS == 0:
                avg_loss, perplexity = evaluate_on_val(newmodel, val_loader, device)
                # Validation switches the model to eval mode; switch back so the
                # rest of the epoch still trains with dropout enabled.
                newmodel.train()

        # End-of-epoch evaluation and checkpoint.
        avg_loss, perplexity = evaluate_on_val(newmodel, val_loader, device)
        if avg_loss < best_val:
            best_val = avg_loss
        save_checkpoint(CHECKPOINT_DIR, newmodel, tokenizer, optimizer, scheduler, scaler, epoch, step, best_val)
        print(f"Saved epoch {epoch + 1}")

finally:
    # Always leave a checkpoint behind, including on interruption or error.
    save_checkpoint(CHECKPOINT_DIR, newmodel, tokenizer, optimizer, scheduler, scaler, epoch, step, best_val)
    print(f"Training complete.")


In [None]:
!wget -O evaluation_data.zip "https://files.osf.io/v1/resources/ryjfm/providers/osfstorage/6819f54f5dc6fc2bff0a7bba/?zip="

# --- 3. Unzip the data ---
# -o overwrites existing files without asking, so re-running the cell does not
# stop on a confirmation prompt; -q keeps the cell output short.
!unzip -o -q evaluation_data.zip

In [None]:
!mv /kaggle/working/index.html?zip= /kaggle/working/evaluation_data.zip

In [None]:
# Explicit shell escape, and -p so re-running the cell does not fail when the
# directory already exists.
!mkdir -p ./evaluation_data

In [None]:
!mv /kaggle/working/full_eval /kaggle/working/evaluation_data/full_eval
!mv /kaggle/working/fast_eval /kaggle/working/evaluation_data/fast_eval

In [None]:
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=data_collator)

In [None]:
!git clone https://github.com/babylm/evaluation-pipeline-2025.git

In [None]:
!PYTHONPATH="/kaggle/working/evaluation-pipeline-2025" \
/kaggle/working/evaluation-pipeline-2025/eval_zero_shot_fast.sh \
/kaggle/working/my_model_checkpoints/hf_base \
v4 \
causal

In [None]:
import math
import torch
from tqdm import tqdm

bleu_metric  = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

def evaluate_model(model, test_dataloader, device):
    """Evaluate a causal LM on a test loader: perplexity, BLEU and ROUGE.

    Uses the notebook-level `tokenizer`, `bleu_metric` and `rouge_metric`.

    Perplexity is exp of the token-weighted mean loss, where each batch is
    weighted by the number of label positions that are actually scored
    (shifted labels different from -100), not by the padded tensor size.

    NOTE(review): generation is prompted with the full input and may add at most
    one token (max_length = input length + 1), and the reference is the input
    itself — BLEU/ROUGE therefore mostly measure prompt overlap. Consider
    prompting with a prefix and scoring the continuation instead.

    Args:
        model: HF-style causal LM supporting `__call__` with labels and `generate`.
        test_dataloader: iterable of batches with "input_ids", "attention_mask", "labels".
        device: device used for the model and the batches.

    Returns:
        dict with "perplexity" (float; NaN if no token was scored), "bleu" and "rouge".
    """
    model.eval()
    model.to(device)

    total_loss = 0.0
    num_tokens = 0

    generated_texts = []
    reference_texts = []

    # Explicit None check: `a or b` would wrongly discard a valid pad id of 0.
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id

    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # 1) forward + loss, weighted by the number of scored (shifted) labels
            active_tokens = (labels[:, 1:] != -100).sum().item()
            if active_tokens > 0:
                outputs = model(input_ids=input_ids,
                                attention_mask=attention_mask,
                                labels=labels)
                total_loss += outputs.loss.item() * active_tokens
                num_tokens += active_tokens

            # 2) generation
            gen_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=labels.shape[1] + 1
            )
            decoded_preds = tokenizer.batch_decode(
                gen_ids, skip_special_tokens=True)

            # 3) decode labels: replace -100 with pad_id so they can be decoded
            labels_for_decode = labels.clone()
            labels_for_decode[labels_for_decode == -100] = pad_id
            decoded_labels = tokenizer.batch_decode(
                labels_for_decode, skip_special_tokens=True)

            generated_texts.extend(decoded_preds)
            reference_texts.extend(decoded_labels)

    # ---- Metrics ----
    ppl = math.exp(total_loss / num_tokens) if num_tokens else float("nan")
    bleu = bleu_metric.compute(
        predictions=generated_texts,
        references=[[r] for r in reference_texts]
    )["bleu"]
    rouge = rouge_metric.compute(
        predictions=generated_texts,
        references=reference_texts
    )

    print(f"\nPerplexity: {ppl:.4f}")
    print(f"BLEU:       {bleu:.4f}")
    print(f"ROUGE:      {rouge}")

    return {"perplexity": ppl, "bleu": bleu, "rouge": rouge}

In [None]:
evaluate_model(newmodel.base, test_loader, device)

# Ablation: FullDiffWrapper trained without the acceleration regularizer (plain LM loss only)

In [None]:
del newmodel

In [None]:
set_seed(CONFIG["SEED"])

In [None]:
newmodel = FullDiffWrapper(smollm_config)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    newmodel.base.resize_token_embeddings(len(tokenizer))

In [None]:
# DataCollatorWithPaddingAndLabels is already defined earlier in this notebook.
# It is reused here instead of being redefined, so there is a single definition
# of the collator and a later copy cannot silently shadow it.


data_collator = DataCollatorWithPaddingAndLabels(tokenizer=tokenizer, return_tensors="pt")

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,        # <<< spawn 4 workers in parallel
    pin_memory=True,       # <<< page‐lock your tensors for faster CUDA copie
    collate_fn=data_collator)
val_loader = DataLoader(
    val_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=False,
    num_workers=4,        # <<< spawn 4 workers in parallel
    pin_memory=True,       # <<< page‐lock your tensors for faster CUDA copie 
    collate_fn=data_collator)


In [None]:
# ------------------------
# Model Setup
# ------------------------
from torch.cuda.amp import autocast, GradScaler
EVALUATE_EVERY_N_STEPS = 5000
CHECKPOINT_DIR = "/kaggle/working/withoutreg"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Make sure a pad token exists; embeddings are resized only if this call added one.
special_tokens = {'pad_token': '[PAD]'}
num_added = tokenizer.add_special_tokens(special_tokens)
if num_added > 0:
    newmodel.base.resize_token_embeddings(len(tokenizer))

print("Tokenizer vocab size:", len(tokenizer))
print("Embedding weight shape:", newmodel.base.get_input_embeddings().weight.shape)

newmodel.float()                      # ensure FP32
newmodel.to(device)
# max_epochs and ACCUMULATION_STEPS are the values set by the first training cell.
steps_per_epoch = len(train_loader)
# The scheduler is stepped once per optimizer update, so its horizon is counted
# in optimizer updates rather than micro-batches (otherwise warmup is
# ACCUMULATION_STEPS times too long and the cosine decay never completes).
optimizer_steps_per_epoch = (steps_per_epoch + ACCUMULATION_STEPS - 1) // ACCUMULATION_STEPS
total_steps = optimizer_steps_per_epoch * max_epochs

optimizer = AdamW(newmodel.parameters(), lr=4e-4)
scheduler = get_scheduler(
    name="cosine",
    optimizer=optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)
scaler = GradScaler()
step = 0   # micro-batches processed so far (all epochs)
epoch = 0  # defined up front so the `finally` checkpoint works even if no epoch starts
best_val = float('inf')
try:
    optimizer.zero_grad()
    for epoch in range(max_epochs):
        newmodel.train()
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}")
        for batch_idx, batch in enumerate(pbar):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            with autocast(dtype=torch.float16):
                outputs = newmodel(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                loss = outputs.loss / ACCUMULATION_STEPS
            scaler.scale(loss).backward()

            # Update after every full accumulation group and once more on the last
            # batch of the epoch, so leftover gradients are never carried over.
            is_last_batch = (batch_idx + 1) == steps_per_epoch
            if (batch_idx + 1) % ACCUMULATION_STEPS == 0 or is_last_batch:
                scaler.step(optimizer)  # Optimizer step with scaled gradients
                scaler.update()         # Update scaler
                scheduler.step()
                optimizer.zero_grad()
            step += 1
            pbar.set_postfix(loss=outputs.loss.item(), lr=scheduler.get_last_lr()[0])

            if step % EVALUATE_EVERY_N_STEPS == 0:
                avg_loss, perplexity = evaluate_on_val(newmodel, val_loader, device)
                # Validation switches the model to eval mode; switch back so the
                # rest of the epoch still trains with dropout enabled.
                newmodel.train()

        avg_loss, perplexity = evaluate_on_val(newmodel, val_loader, device)
        if avg_loss < best_val:
            best_val = avg_loss
        save_checkpoint(CHECKPOINT_DIR, newmodel, tokenizer, optimizer, scheduler, scaler, epoch, step, best_val)
        print(f"Saved epoch {epoch + 1}")

finally:
    # Always leave a checkpoint behind, including on interruption or error.
    save_checkpoint(CHECKPOINT_DIR, newmodel, tokenizer, optimizer, scheduler, scaler, epoch, step, best_val)
    print(f"Training complete.")

In [None]:
set_seed(CONFIG["SEED"])

In [None]:
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=data_collator)

In [None]:
set_seed(CONFIG["SEED"])

In [None]:
evaluate_model(newmodel.base, test_loader, device)

In [None]:
!PYTHONPATH="/kaggle/working/evaluation-pipeline-2025" \
/kaggle/working/evaluation-pipeline-2025/eval_zero_shot_fast.sh \
/kaggle/working/withoutreg/hf_base \
v4 \
causal

# Baseline: plain GPT-2-style causal LM built from the same config, without the FullDiffWrapper patch

In [None]:
del newmodel

In [None]:
set_seed(CONFIG["SEED"])

In [None]:
newmodel = AutoModelForCausalLM.from_config(smollm_config)
newmodel.resize_token_embeddings(len(tokenizer))

In [None]:
set_seed(CONFIG["SEED"])

In [None]:
# DataCollatorWithPaddingAndLabels is already defined earlier in this notebook.
# It is reused here instead of being redefined, so there is a single definition
# of the collator and a later copy cannot silently shadow it.


data_collator = DataCollatorWithPaddingAndLabels(tokenizer=tokenizer, return_tensors="pt")

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,        # <<< spawn 4 workers in parallel
    pin_memory=True,       # <<< page‐lock your tensors for faster CUDA copie
    collate_fn=data_collator)
val_loader = DataLoader(
    val_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=False,
    num_workers=4,        # <<< spawn 4 workers in parallel
    pin_memory=True,       # <<< page‐lock your tensors for faster CUDA copie 
    collate_fn=data_collator)


In [None]:
def save_checkpoint(save_dir, model, tokenizer, optimizer, scheduler, scaler, epoch, step, best_val=None, keep=5):
    """Save the HF model/tokenizer plus a resumable training-state checkpoint.

    This redefinition is kept model-agnostic: it handles a plain HF model (the
    baseline run) as well as a wrapper exposing the HF model as `.base`, so
    re-running an earlier training cell after this one still works.

    Writes two things under `save_dir`:
      * `hf_base/`: `save_pretrained` output of the model and tokenizer
        (overwritten on every call);
      * `ckpt_step_<step>.pt`: alpha/beta/gamma tensors if the model has any
        (an empty dict otherwise), optimizer, scheduler and scaler state,
        counters and RNG states. It is written to a temporary file first and
        then moved into place.

    Args:
        save_dir: directory receiving the checkpoint files (created if missing).
        model: HF model, or a wrapper with the HF model in `.base`.
        tokenizer: tokenizer saved next to the model.
        optimizer, scheduler, scaler: objects exposing `state_dict()`.
        epoch, step: counters stored in the checkpoint; `step` names the file.
        best_val: best validation loss so far, stored as-is.
        keep: currently unused (no pruning of older checkpoints is performed).
    """
    hf_dir = os.path.join(save_dir, "hf_base")
    os.makedirs(hf_dir, exist_ok=True)  # also creates save_dir

    # Unwrap when the HF model lives in `.base`; otherwise save the model itself.
    hf_model = getattr(model, "base", model)
    hf_model.save_pretrained(hf_dir)
    tokenizer.save_pretrained(hf_dir)

    # Per-block mixing parameters, selected by name from the full state dict.
    diff = {k: v for k, v in model.state_dict().items() if any(x in k for x in ("alpha", "beta", "gamma"))}

    ckpt = {
        "diff": diff,
        "optimizer": optimizer.state_dict(),
        "scheduler": scheduler.state_dict(),
        "scaler": scaler.state_dict(),
        "epoch": epoch,
        "step": step,
        "best_val": best_val,
        "rng": {
            "torch": torch.get_rng_state(),
            "cuda": torch.cuda.get_rng_state_all() if torch.cuda.is_available() else None,
            "numpy": np.random.get_state(),
            "random": random.getstate(),
        }
    }
    fname = f"ckpt_step_{step:07d}.pt"
    tmp_path = os.path.join(save_dir, "tmp_" + fname)
    final_path = os.path.join(save_dir, fname)
    # Write-then-rename so an interrupted save never leaves a truncated checkpoint.
    torch.save(ckpt, tmp_path)
    os.replace(tmp_path, final_path)
    print(f"Saved checkpoint {final_path}")

In [None]:
from torch.cuda.amp import autocast, GradScaler
EVALUATE_EVERY_N_STEPS = 5000
CHECKPOINT_DIR = "/kaggle/working/baseline"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# The pad token is not re-added here; the baseline model's embeddings were
# already resized to len(tokenizer) when the model was created.
special_tokens = {'pad_token': '[PAD]'}

print("Tokenizer vocab size:", len(tokenizer))
print("Embedding weight shape:", newmodel.get_input_embeddings().weight.shape)

newmodel.float()                      # ensure FP32
newmodel.to(device)
# max_epochs and ACCUMULATION_STEPS are the values set by the first training cell.
steps_per_epoch = len(train_loader)
# The scheduler is stepped once per optimizer update, so its horizon is counted
# in optimizer updates rather than micro-batches (otherwise warmup is
# ACCUMULATION_STEPS times too long and the cosine decay never completes).
optimizer_steps_per_epoch = (steps_per_epoch + ACCUMULATION_STEPS - 1) // ACCUMULATION_STEPS
total_steps = optimizer_steps_per_epoch * max_epochs

optimizer = AdamW(newmodel.parameters(), lr=4e-4)
scheduler = get_scheduler(
    name="cosine",
    optimizer=optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)
scaler = GradScaler()
step = 0   # micro-batches processed so far (all epochs)
epoch = 0  # defined up front so the `finally` checkpoint works even if no epoch starts
best_val = float('inf')
try:
    optimizer.zero_grad()
    for epoch in range(max_epochs):
        newmodel.train()
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}")
        for batch_idx, batch in enumerate(pbar):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            with autocast(dtype=torch.float16):
                outputs = newmodel(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                loss = outputs.loss / ACCUMULATION_STEPS
            scaler.scale(loss).backward()

            # Update after every full accumulation group and once more on the last
            # batch of the epoch, so leftover gradients are never carried over.
            is_last_batch = (batch_idx + 1) == steps_per_epoch
            if (batch_idx + 1) % ACCUMULATION_STEPS == 0 or is_last_batch:
                scaler.step(optimizer)  # Optimizer step with scaled gradients
                scaler.update()         # Update scaler
                scheduler.step()
                optimizer.zero_grad()
            step += 1
            pbar.set_postfix(loss=outputs.loss.item(), lr=scheduler.get_last_lr()[0])

            if step % EVALUATE_EVERY_N_STEPS == 0:
                avg_loss, perplexity = evaluate_on_val(newmodel, val_loader, device)
                # Validation switches the model to eval mode; switch back so the
                # rest of the epoch still trains with dropout enabled.
                newmodel.train()

        avg_loss, perplexity = evaluate_on_val(newmodel, val_loader, device)
        if avg_loss < best_val:
            best_val = avg_loss
        save_checkpoint(CHECKPOINT_DIR, newmodel, tokenizer, optimizer, scheduler, scaler, epoch, step, best_val)
        print(f"Saved epoch {epoch + 1}")

finally:
    # Always leave a checkpoint behind, including on interruption or error.
    save_checkpoint(CHECKPOINT_DIR, newmodel, tokenizer, optimizer, scheduler, scaler, epoch, step, best_val)
    print(f"Training complete.")


In [None]:
set_seed(CONFIG["SEED"])

In [None]:
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=data_collator)

In [None]:
evaluate_model(newmodel, test_loader, device)

In [None]:
!PYTHONPATH="/kaggle/working/evaluation-pipeline-2025" \
/kaggle/working/evaluation-pipeline-2025/eval_zero_shot_fast.sh \
/kaggle/working/baseline/hf_base \
v4 \
causal