In [None]:
# Environment setup: remove numpy, transformers and datasets, reinstall them
# from scratch (bypassing pip's cache), then kill the kernel process so the
# following cells start with the freshly installed packages.
# NOTE(review): no versions are pinned, so each run may pull different releases.
import os
!pip uninstall -y numpy transformers datasets
!pip install numpy --force-reinstall --no-cache-dir
!pip install transformers datasets --force-reinstall --no-cache-dir


# Sends signal 9 to this very process: the kernel dies immediately and nothing
# placed after this line in the cell would run. Run the next cells afterwards.
os.kill(os.getpid(), 9)  # Restart the Colab runtime (REQUIRED)

In [None]:
# Fetch the BabyLM baseline repository and make it the working directory;
# every relative path below is resolved inside the clone.
# NOTE(review): not idempotent — after a first run the working directory has
# already changed, so re-running would clone and %cd into a nested checkout.
!git clone https://github.com/babylm/baseline-pretraining.git
%cd baseline-pretraining
# Download the data bundle, unpack the outer archive, then each per-split archive
# into its own sub-directory.
!wget -O babylm_data.zip "https://files.osf.io/v1/resources/ad7qg/providers/osfstorage/661517db943bee3731dfec25/?zip="
!unzip babylm_data.zip -d babylm_data
!unzip babylm_data/train_10M.zip -d babylm_data/train_10M
!unzip babylm_data/dev.zip -d babylm_data/dev
!unzip babylm_data/test.zip -d babylm_data/test
# Concatenate all *.train / *.dev / *.test files into one text file per split;
# these three files are what the training cell reads.
!cat babylm_data/train_10M/train_10M/*.train > babylm_data/babylm_train.txt
!cat babylm_data/dev/dev/*.dev > babylm_data/babylm_dev.txt
!cat babylm_data/test/test/*.test > babylm_data/babylm_test.txt

# Fine-tuning T5-small on the BabyLM 10M dataset with random span masking

In [None]:
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments,
    DataCollatorForSeq2Seq, get_cosine_schedule_with_warmup
)
from datasets import load_dataset
from torch.nn import CrossEntropyLoss
import torch
import os
import random
import numpy as np
from typing import List, Dict
from datasets import DatasetDict, Dataset

# =====================
# Settings & Constants
# =====================
#os.environ['BABYLM_ROOT_DIR'] = '/content/baseline-pretraining/babylm_data'
from pathlib import Path

# Directory that holds the concatenated babylm_{train,dev,test}.txt files.
# The absolute path assumes the repository was cloned under /content — TODO confirm
# when running outside Colab.
BABYLM_ROOT_DIR = Path("/content/baseline-pretraining/babylm_data")
assert BABYLM_ROOT_DIR.exists(), "Dataset directory does not exist!"

# Set before the Trainer is created so Weights & Biases logging stays off.
os.environ['WANDB_DISABLED'] = 'true'
# Use the GPU when one is visible to torch, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("CUDA Available:", torch.cuda.is_available())

# ================
# Load tokenizer
# ================
tokenizer = T5Tokenizer.from_pretrained("t5-small")
# Number of sentinel tokens (<extra_id_0> ... <extra_id_9>) registered below.
# NOTE(review): t5_denoising_example numbers its sentinels from 0 without looking
# at EXTRA_IDS, and the stock t5-small tokenizer presumably already defines
# <extra_id_*> tokens — confirm whether this re-registration is needed or whether
# it changes which tokens are treated as special.
EXTRA_IDS = 10
tokenizer.add_special_tokens({'additional_special_tokens': [f"<extra_id_{i}>" for i in range(EXTRA_IDS)]})

# ==================
# Load model
# ==================
model = T5ForConditionalGeneration.from_pretrained("t5-small")
# Make the embedding matrix match the tokenizer's vocabulary size.
# NOTE(review): if len(tokenizer) is smaller than the checkpoint's embedding table
# this call shrinks it — confirm that is intended.
model.resize_token_embeddings(len(tokenizer))
model.to(device)

# ==================
# Span Corruption Function
# ==================
def random_spans_noise_mask(length, noise_density=0.15, mean_span_length=3):
    """Return a boolean mask selecting the tokens to corrupt in a sequence.

    Roughly ``length * noise_density / mean_span_length`` spans (at least one)
    are placed at distinct random start positions; each span's length is drawn
    from a Poisson distribution with mean ``mean_span_length`` and clipped to
    ``[1, length]``. Spans may overlap or run past the end of the sequence
    (slicing clips them), so the realised share of masked tokens only
    approximates ``noise_density``.

    Args:
        length: Number of tokens in the sequence.
        noise_density: Target fraction of tokens to mask.
        mean_span_length: Mean length of a masked span (must be >= 1).

    Returns:
        ``numpy`` boolean array of shape ``(length,)``; ``True`` marks a masked
        token. For ``length >= 1`` at least one entry is ``True``. For
        ``length <= 0`` an empty mask is returned.
    """
    if length <= 0:
        # random.sample() cannot draw a start position from an empty range and
        # would raise ValueError; an empty sequence simply has nothing to mask.
        return np.zeros(0, dtype=bool)

    num_noise_tokens = int(length * noise_density)
    num_spans = max(1, num_noise_tokens // mean_span_length)
    # Start positions are sampled without replacement, so there can never be
    # more spans than tokens (only reachable with unusually high densities).
    num_spans = min(num_spans, length)

    # Draw order (numpy first, then `random`) is kept so seeded runs reproduce.
    span_lengths = np.clip(np.random.poisson(mean_span_length, num_spans), 1, length)
    span_starts = sorted(random.sample(range(length), num_spans))

    mask = np.zeros(length, dtype=bool)
    for start, span_len in zip(span_starts, span_lengths):
        mask[start:start + span_len] = True
    return mask

def t5_denoising_example(example, max_length=64):
    """Turn one ``{"text": ...}`` row into a T5 span-corruption training example.

    The text is tokenized and cut to fit ``max_length``; a noise mask from
    ``random_spans_noise_mask`` then picks the spans to hide. In the encoder
    input every masked run is replaced by a single ``<extra_id_k>`` sentinel;
    the target is the sequence of sentinels, each followed by the tokens it hides.

    Args:
        example: Mapping with a ``"text"`` string.
        max_length: Fixed length that inputs and labels are padded/truncated to.

    Returns:
        ``None`` when the text has fewer than two tokens; otherwise a dict with
        ``input_ids``, ``attention_mask`` and ``labels``. Padded label positions
        hold -100 so the loss skips them.
    """
    pieces = tokenizer.tokenize(example["text"])
    if len(pieces) < 2:
        # NOTE(review): returning None is not expected to make Dataset.map() drop
        # the row. The script filters short texts with is_valid() before mapping,
        # so this branch should not be reached there — confirm for other callers.
        return None

    # Truncate BEFORE corrupting. Cutting the corrupted input and the target
    # independently afterwards can leave labels that refer to sentinels which
    # were truncated out of the input. One slot is reserved for the special
    # token prepare_for_model is expected to append.
    token_ids = tokenizer.convert_tokens_to_ids(pieces)[:max(1, max_length - 1)]
    mask = random_spans_noise_mask(len(token_ids))

    input_tokens = []
    target_tokens = []
    current_extra_id = 0
    i = 0
    while i < len(token_ids):
        if not mask[i]:
            input_tokens.append(token_ids[i])
            i += 1
            continue
        # Start of a masked run: one sentinel stands in for the whole run in the
        # input and introduces the hidden tokens in the target.
        sentinel_id = tokenizer.convert_tokens_to_ids(f"<extra_id_{current_extra_id}>")
        input_tokens.append(sentinel_id)
        target_tokens.append(sentinel_id)
        while i < len(token_ids) and mask[i]:
            target_tokens.append(token_ids[i])
            i += 1
        current_extra_id += 1

    encoded_input = tokenizer.prepare_for_model(
        input_tokens, max_length=max_length, padding="max_length",
        truncation=True, return_attention_mask=True,
    )
    encoded_target = tokenizer.prepare_for_model(
        target_tokens, max_length=max_length, padding="max_length",
        truncation=True, return_attention_mask=True,
    )
    # Padding added here is invisible to the collator (all rows already share one
    # length), so it must be marked with -100 now or the loss would be computed
    # on pad tokens.
    labels = [
        token_id if is_real else -100
        for token_id, is_real in zip(encoded_target["input_ids"], encoded_target["attention_mask"])
    ]
    return {
        "input_ids": encoded_input["input_ids"],
        # Without an explicit mask the encoder would attend to the padding.
        "attention_mask": encoded_input["attention_mask"],
        "labels": labels,
    }



# ================
# Load Dataset
# ================

def load_text_file_as_dataset(file_path):
    """Load a UTF-8 text file as a ``Dataset`` with one ``text`` row per line.

    Every line is stripped of surrounding whitespace; lines that are empty
    after stripping are left out.
    """
    records = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                records.append({"text": cleaned})
    return Dataset.from_list(records)

# Build one Dataset per split from the concatenated text files.
split_files = {
    "train": "babylm_train.txt",
    "validation": "babylm_dev.txt",
    "test": "babylm_test.txt",
}
dataset = DatasetDict({
    split_name: load_text_file_as_dataset(BABYLM_ROOT_DIR / file_name)
    for split_name, file_name in split_files.items()
})

# Drop rows whose text is empty or whitespace-only.
dataset = dataset.filter(lambda row: row['text'] and row['text'].strip())

# Order the training split by ascending whitespace word count; the helper
# column only exists for the sort and is removed again afterwards.
dataset['train'] = (
    dataset['train']
    .map(lambda row: {"length": len(row['text'].split())})
    .sort('length')
    .remove_columns(['length'])
)

# Keep at most the first 5000 rows of each evaluation split.
for split_name in ('validation', 'test'):
    keep = min(5000, len(dataset[split_name]))
    dataset[split_name] = dataset[split_name].select(range(keep))

def is_valid(example):
    """Tell whether the example's text yields at least five tokenizer pieces."""
    pieces = tokenizer.tokenize(example["text"])
    return len(pieces) >= 5

# Keep only the examples accepted by is_valid, split by split.
for split_name in ("train", "validation", "test"):
    dataset[split_name] = dataset[split_name].filter(is_valid)

# Apply T5-style denoising tokenization: every text row is replaced by the
# fields produced by t5_denoising_example.
dataset = dataset.map(
    t5_denoising_example,
    batched=False,
    remove_columns=['text'],
    desc="Applying span corruption",
)

# Hand out torch tensors when rows are read.
dataset.set_format(type='torch')

# ==========================
# Data Collator
# ==========================
# Seq2seq batching; -100 is the value this collator uses when it has to pad labels.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, label_pad_token_id=-100)

# ==========================
# TrainingArguments with label smoothing, weight decay, and scheduler
# ==========================
# NOTE(review): no lr_scheduler_type is passed, so the library's default schedule
# applies (with 500 warmup steps); the imported get_cosine_schedule_with_warmup is
# not used anywhere in the visible code.
args = TrainingArguments(
    # NOTE(review): no visible cell mounts Google Drive — confirm it is mounted
    # before training, otherwise checkpoints presumably land on the runtime's
    # local disk under this path.
    output_dir="/content/drive/MyDrive/llm-project/t5-small-babylm-denoised",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    learning_rate=3e-4,
    weight_decay=0.01,
    warmup_steps=500,
    logging_dir=str(BABYLM_ROOT_DIR / "logs"),
    logging_steps=500,
    # Evaluation and checkpointing share the same 4000-step cadence.
    eval_steps=4000,
    save_steps=4000,
    eval_strategy="steps",
    save_total_limit=2,
    load_best_model_at_end=True,
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    # Empty list: no external experiment trackers are requested.
    report_to=[],
    label_smoothing_factor=0.1,
    remove_unused_columns=True,
)

# NOTE(review): newer transformers releases are believed to prefer
# `processing_class` over the `tokenizer` keyword — check against the installed
# version, since the install cell does not pin one.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=collator,
    tokenizer=tokenizer,
)

trainer.train()

# Evaluate
# Metrics for the held-out split are reported under keys starting with "test".
test_results = trainer.evaluate(dataset["test"], metric_key_prefix="test")
print("Test Results:", test_results)

# Save
# The tokenizer is written to the same output directory passed to TrainingArguments.
trainer.save_model()
tokenizer.save_pretrained(args.output_dir)
