# Fine-tuning t5-small with character-, word-, and sentence-level tokenization and prediction

In [None]:
# Colab setup cell: reinstall numpy, transformers and datasets from scratch.
import os
# Remove the currently installed copies first (-y answers the confirmation prompt).
!pip uninstall -y numpy transformers datasets
# Reinstall all three, forcing a fresh install and bypassing pip's cache.
!pip install numpy --force-reinstall --no-cache-dir
!pip install transformers datasets --force-reinstall --no-cache-dir


# Send signal 9 to the current process. Per the note below this restarts the
# Colab runtime and is required after the reinstall; any statement placed
# after this line in the same cell will not run.
os.kill(os.getpid(), 9)  # Restart the Colab runtime (REQUIRED)

In [None]:
# Clone the BabyLM baseline repository and make it the working directory.
!git clone https://github.com/babylm/baseline-pretraining.git
%cd baseline-pretraining
# Download the BabyLM data archive from OSF and unpack it. The outer zip
# contains one archive per split (train_10M, dev, test); each is extracted
# into its own folder under babylm_data/.
!wget -O babylm_data.zip "https://files.osf.io/v1/resources/ad7qg/providers/osfstorage/661517db943bee3731dfec25/?zip="
!unzip babylm_data.zip -d babylm_data
!unzip babylm_data/train_10M.zip -d babylm_data/train_10M
!unzip babylm_data/dev.zip -d babylm_data/dev
!unzip babylm_data/test.zip -d babylm_data/test
# Concatenate every file of a split into a single text file per split; these
# three files are what the training cell reads.
!cat babylm_data/train_10M/train_10M/*.train > babylm_data/babylm_train.txt
!cat babylm_data/dev/dev/*.dev > babylm_data/babylm_dev.txt
!cat babylm_data/test/test/*.test > babylm_data/babylm_test.txt

In [None]:
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments,
    DataCollatorForSeq2Seq
)
from datasets import Dataset, DatasetDict
from pathlib import Path
import torch
import os

# ==========
# Setup
# ==========
# Folder holding the concatenated babylm_{train,dev,test}.txt files read below.
DATA_DIR = Path("/content/baseline-pretraining/babylm_data")
# Validate with an explicit raise: an `assert` is skipped entirely when Python
# runs with -O, which would let a missing folder surface later as a less
# obvious error.
if not DATA_DIR.exists():
    raise FileNotFoundError("Dataset folder not found!")

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ==========
# Load Model & Tokenizer
# ==========
# Checkpoint directory on the mounted Google Drive; both the tokenizer and the
# model weights are loaded from it.
model_path = "/content/drive/MyDrive/llm-project/t5-small-babylm-denoised"
# NOTE(review): extra_ids=0 requests a tokenizer without sentinel tokens;
# confirm this matches how the checkpoint at model_path was trained.
tokenizer = T5Tokenizer.from_pretrained(model_path, extra_ids=0)
model = T5ForConditionalGeneration.from_pretrained(model_path).to(device)

# ==========
# Load & Clean Dataset
# ==========
def load_text_file_as_dataset(file_path):
    """Read a UTF-8 text file into a ``Dataset`` with one ``text`` row per line.

    Every line is stripped of surrounding whitespace, and lines that are
    empty after stripping are left out.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``Dataset`` built from ``{"text": <stripped line>}`` records, in
        file order.
    """
    records = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                records.append({"text": cleaned})
    return Dataset.from_list(records)

# One concatenated text file per split, all located under DATA_DIR.
_split_files = {
    "train": "babylm_train.txt",
    "validation": "babylm_dev.txt",
    "test": "babylm_test.txt",
}
dataset = DatasetDict({
    _split_name: load_text_file_as_dataset(DATA_DIR / _file_name)
    for _split_name, _file_name in _split_files.items()
})

# Clean and filter: keep only rows whose text is non-empty after stripping.
dataset = dataset.filter(lambda row: row["text"] and row["text"].strip())

# Optional: reduce size by keeping at most the first N rows of each split.
for _split_name, _row_cap in (("train", 100000), ("validation", 5000), ("test", 5000)):
    _rows = dataset[_split_name]
    dataset[_split_name] = _rows.select(range(min(_row_cap, len(_rows))))

# Filter short lines
def is_valid(example):
    """Return True when ``example["text"]`` has at least three whitespace-separated words."""
    word_count = len(example["text"].split())
    return word_count > 2

# Drop every example that is_valid rejects (fewer than three words).
dataset = dataset.filter(is_valid)

# ==========
# Multi-Level Tokenization
# ==========
# Passed as max_length to the tokenizer call in multi_task_example, which
# truncates and pads to this many token ids.
MAX_LEN = 64

# Build input/target pairs for the next-char, next-word, and next-sentence tasks.
# The three tasks are joined with <sep> so that every example trains all of them at once.

def multi_task_example(example):
    """Turn one text example into a tokenized multi-task training example.

    Three sub-tasks are derived from ``example["text"]``:

    * char: everything but the last character -> the last character
    * word: everything but the last whitespace-separated word -> the last word
    * sent: everything but the last "."-delimited sentence -> the last
      sentence; with fewer than two sentences the input is the whole text
      and the target is empty

    The three inputs are joined into one source string and the three targets
    into one target string, both with the literal text `` <sep> ``. Source and
    target are tokenized with truncation and padding to ``MAX_LEN`` ids.

    Args:
        example: Mapping with a ``"text"`` string. Empty text is tolerated
            and yields empty char/word targets instead of raising.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels``. Padding
        positions in ``labels`` are set to -100 so they are ignored by the
        loss.
    """
    text = example["text"]

    # Char-level (slicing instead of indexing keeps empty text from raising)
    char_input = text[:-1]
    char_target = text[-1:]

    # Word-level
    words = text.split()
    word_input = " ".join(words[:-1])
    word_target = words[-1] if words else ""

    # Sentence-level (very simple split on "."; empty fragments are dropped)
    sentences = [part.strip() for part in text.split(".") if part.strip()]
    if len(sentences) < 2:
        sent_input = text
        sent_target = ""
    else:
        sent_input = ". ".join(sentences[:-1])
        sent_target = sentences[-1]

    # NOTE(review): "<sep>" is written as plain text; nothing visible here
    # registers it as a special token, so confirm the tokenizer encodes it as
    # intended.
    # NOTE(review): the source string carries the text roughly three times, so
    # with truncation at MAX_LEN the later "word:" / "sent:" segments may be
    # cut off for longer lines; confirm MAX_LEN is adequate.
    input_text = f"char: {char_input} <sep> word: {word_input} <sep> sent: {sent_input}"
    target_text = f"{char_target} <sep> {word_target} <sep> {sent_target}"

    tokenized = tokenizer(
        input_text,
        text_target=target_text,
        max_length=MAX_LEN,
        truncation=True,
        padding="max_length"
    )

    # The tokenizer pads labels with its pad token id. Left as-is, those
    # positions would count towards the loss (and label smoothing), because
    # only -100 is ignored. The collator's label_pad_token_id cannot fix this:
    # every example already has the same length, so it adds no padding.
    pad_id = tokenizer.pad_token_id
    labels = [
        token_id if token_id != pad_id else -100
        for token_id in tokenized["labels"]
    ]

    return {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        "labels": labels
    }


# Apply tokenization
# Each split is replaced by its tokenized version (the raw "text" column is
# removed) and configured to return torch tensors.
for split in list(dataset.keys()):
    _tokenized_split = dataset[split].map(multi_task_example, remove_columns=["text"])
    _tokenized_split.set_format(type="torch")
    dataset[split] = _tokenized_split

# ==========
# Data Collator
# ==========
# Batches tokenized examples for seq2seq training.
# - model is passed alongside the tokenizer; NOTE(review): per the
#   DataCollatorForSeq2Seq docs this lets the collator build decoder inputs
#   from the labels, which matters when label smoothing is on. Confirm for
#   the installed transformers version.
# - label_pad_token_id=-100 is the value used for any label padding the
#   collator itself adds.
# - padding="longest" pads to the longest example in the batch; examples are
#   already padded to MAX_LEN at tokenization time (padding="max_length").
collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,
    padding="longest"
)

# ==========
# Training
# ==========
# Evaluation and checkpointing both run every 1000 steps. save_strategy is
# spelled out because load_best_model_at_end requires the save schedule to
# match the evaluation schedule, so the best checkpoint exists on disk.
args = TrainingArguments(
    output_dir="/content/drive/MyDrive/llm-project/t5-small-multitask-finetuned",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=1e-4,
    weight_decay=0.01,
    warmup_steps=200,
    logging_steps=500,
    eval_steps=1000,
    save_steps=1000,
    eval_strategy="steps",
    save_strategy="steps",
    save_total_limit=2,  # keep at most two checkpoints in output_dir
    load_best_model_at_end=True,
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is available
    report_to=[],  # no external logging integrations
    label_smoothing_factor=0.1,
    remove_unused_columns=True,
)

# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of
# Trainer is deprecated in recent transformers releases and scheduled for
# removal in 5.0, and the setup cell installs the latest release.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=collator,
    processing_class=tokenizer,
)

trainer.train()

# ==========
# Evaluate & Save
# ==========
# Evaluate on the test split; metric_key_prefix="test" prefixes the reported
# metric names with "test".
test_results = trainer.evaluate(dataset["test"], metric_key_prefix="test")
print("Test Results:", test_results)

# Persist the fine-tuned model and the tokenizer. save_model() is called
# without a path (presumably it falls back to the Trainer's output_dir —
# confirm); the tokenizer is written to args.output_dir explicitly.
trainer.save_model()
tokenizer.save_pretrained(args.output_dir)
