In [1]:
# --- Section 1: Setup and Environment ---

# 1. Install Necessary Libraries (Run once if needed)
# !pip install -q -U transformers datasets peft accelerate bitsandbytes torch

# 2. Imports and Reproducibility
import torch
import numpy as np
import random
import os
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, set_seed
from datasets import load_dataset 

# Set seeds for reproducibility
SEED = 42
set_seed(SEED)

# 3. Define Model Name and Check GPU
MODEL_NAME = "google/flan-t5-base"

# --- DEFINE CONSTANTS HERE TO FIX NAME ERROR ---
PER_DEVICE_BATCH_SIZE = 4 
GRADIENT_ACCUMULATION_STEPS = 8 
# -----------------------------------------------

device_count = torch.cuda.device_count() if torch.cuda.is_available() else 1
EFFECTIVE_BATCH_SIZE = PER_DEVICE_BATCH_SIZE * device_count * GRADIENT_ACCUMULATION_STEPS
print(f"Detected {device_count} GPU(s). Effective global batch size: {EFFECTIVE_BATCH_SIZE}")

2025-12-13 14:11:14.210923: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765635074.387105      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765635074.435915      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Detected 2 GPU(s). Effective global batch size: 64


In [2]:
# --- Section 2: Data Loading and Preprocessing ---

# 1. Load Tokenizer
# Function: AutoTokenizer.from_pretrained (Hugging Face transformers)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) 
print(f"Tokenizer loaded for: {MODEL_NAME}")

# 2. Load Dataset
DATASET_NAME = "bogdancazan/wikilarge-text-simplification"
# Function: load_dataset (Hugging Face datasets)
# The dataset has 'train' and 'validation' splits, suitable for SFT.
raw_datasets = load_dataset(DATASET_NAME) 
print(f"Dataset loaded: {DATASET_NAME}")
print(f"Training examples: {len(raw_datasets['train'])}")
print(f"Validation examples: {len(raw_datasets['validation'])}")

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Tokenizer loaded for: google/flan-t5-base


wiki.full.aner.ori.train.95.tsv:   0%|          | 0.00/36.3M [00:00<?, ?B/s]

wiki.full.aner.ori.valid.95.tsv: 0.00B [00:00, ?B/s]

wiki.full.aner.ori.test.95.tsv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/148843 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/494 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/191 [00:00<?, ? examples/s]

Dataset loaded: bogdancazan/wikilarge-text-simplification
Training examples: 148843
Validation examples: 494


In [3]:
print(raw_datasets["train"].column_names)


['Normal', 'Simple']


In [4]:
# Define constants for efficient training
MAX_INPUT_LENGTH = 512  # Max length for the complex (source) text
MAX_TARGET_LENGTH = 128 # Max length for the simplified (target) text

def preprocess_function(examples):
    # Instruction-style source construction (CRITICAL for FLAN / T5)
    sources = [
        "Simplify" + text
        for text in examples["Normal"]
    ]

    targets = examples["Simple"]

    model_inputs = tokenizer(
        sources,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding=False
    )

    # Tokenize targets as labels
    labels = tokenizer(
        targets,
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding=False
    )

    # Replace pad token id in labels with -100
    labels["input_ids"] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in lab]
        for lab in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply mapping (batched)
tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,  # remove raw text columns
    num_proc=1  # or os.cpu_count() if your environment supports it
)

train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]

print("\n--- Dataset Preprocessing Complete ---")
print(f"Tokenized Training Set Size: {len(train_dataset)}")
print(f"Tokenized Validation Set Size: {len(eval_dataset)}")

Map (num_proc=1):   0%|          | 0/148843 [00:00<?, ? examples/s]

Map (num_proc=1):   0%|          | 0/494 [00:00<?, ? examples/s]

Map (num_proc=1):   0%|          | 0/191 [00:00<?, ? examples/s]


--- Dataset Preprocessing Complete ---
Tokenized Training Set Size: 148843
Tokenized Validation Set Size: 494


In [5]:
from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoModelForSeq2SeqLM

# 1. Load Base Model (FP32 for stability)
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32
)

print(f"Base model {MODEL_NAME} loaded.")

# 2. Define LoRA Configuration (LOW-COMPUTE, STABLE)
lora_config = LoraConfig(
    r=8,                      # ↓ reduced rank (less compute, less overfitting)
    lora_alpha=16,            # ↓ scaled accordingly
    target_modules=["q", "k", "v", "o","wo"],  # FULL attention coverage for T5
    lora_dropout=0.05,        # mild regularization
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

# 3. Wrap Model with LoRA Adapters
lora_model = get_peft_model(base_model, lora_config)

print("\n--- LoRA Model Setup Complete ---")
lora_model.print_trainable_parameters()


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Base model google/flan-t5-base loaded.

--- LoRA Model Setup Complete ---
trainable params: 2,310,144 || all params: 249,888,000 || trainable%: 0.9245


In [6]:
# --- Section 4: Training Execution ---

from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq # Source 2.3, 4.2

# Output directory for saving checkpoints and results
OUTPUT_DIR = "flan-t5-lora-wikisimplification-stage1-small" 
PER_DEVICE_BATCH_SIZE = 4 # Small batch size per GPU
GRADIENT_ACCUMULATION_STEPS = 8 # Accumulate 8 steps to simulate a larger batch size
EFFECTIVE_BATCH_SIZE = PER_DEVICE_BATCH_SIZE * device_count * GRADIENT_ACCUMULATION_STEPS

# Function: Seq2SeqTrainingArguments (Hugging Face transformers)
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    
    # --- FIX 1: Prevent "0.0" Loss / Dropped Labels ---
    # Explicitly tell Trainer these are the label columns
    label_names=["labels"], 
    # Prevent Trainer from dropping columns it thinks are "unused" by the PeftModel
    remove_unused_columns=False, 
    
    # --- FIX 2: Prevent "nan" Loss (Stability) ---
    # T5 + FP16 on T4 often causes NaNs. We must use FP32 (default) for stability.
    fp16=False, 
    
    # Optimization for Speed (since we lost FP16 speedup)
    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    
    # General Parameters
    learning_rate=1e-4, 
    num_train_epochs=2,
    save_strategy="epoch", 
    logging_steps=50,       # Log more often to catch issues early
    eval_strategy="steps",
    eval_steps=200,         # Eval more often
    report_to="none",
    load_best_model_at_end=False,
)

print(f"Calculated Effective Global Batch Size: {EFFECTIVE_BATCH_SIZE}")


# 1. Data Collator (handles padding and preparing batches for seq2seq)
# Function: DataCollatorForSeq2Seq (Hugging Face transformers)
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=lora_model,           # optional, but okay
    label_pad_token_id=-100
)

# 2. Initialize Trainer
# Function: Seq2SeqTrainer (Hugging Face transformers)
trainer = Seq2SeqTrainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

Calculated Effective Global Batch Size: 64


  trainer = Seq2SeqTrainer(


In [7]:
from torch.utils.data import DataLoader

dl = DataLoader(tokenized_datasets["train"], batch_size=1, collate_fn=data_collator)
batch = next(iter(dl))

# print shapes & dtypes
import torch
for k,v in batch.items():
    if isinstance(v, torch.Tensor):
        print(f"{k}: shape={v.shape}, dtype={v.dtype}, min={v.min().item()}, max={v.max().item()}")
    else:
        print(f"{k}: type={type(v)}")

# decode labels for visual check
label_ids = batch["labels"][0].tolist()
decoded_label = tokenizer.decode([tok for tok in label_ids if tok != -100], skip_special_tokens=True)
print("Decoded label example:", decoded_label)

# single forward pass to get loss
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lora_model.to(device)
batch = {k: v.to(device) for k, v in batch.items()}
lora_model.eval()
with torch.no_grad():
    outputs = lora_model(**batch)
print("Forward pass loss:", getattr(outputs, "loss", None))


input_ids: shape=torch.Size([1, 48]), dtype=torch.int64, min=1, max=23642
attention_mask: shape=torch.Size([1, 48]), dtype=torch.int64, min=1, max=1
labels: shape=torch.Size([1, 46]), dtype=torch.int64, min=1, max=23642
decoder_input_ids: shape=torch.Size([1, 46]), dtype=torch.int64, min=0, max=23642
Decoded label example: there is some proof that austen continued to work on these pieces later in life. her nephew and niece james edward and anna austen may have made further additions to her work in around.


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Forward pass loss: tensor(2.7004, device='cuda:0')


In [8]:
# 3. Start Training
# Function: trainer.train() (Hugging Face transformers)
print("\n--- Starting Stage 1 LoRA Training (General Simplification) ---")
trainer_output = trainer.train()

print("\n--- Stage 1 Training Complete! ---")


--- Starting Stage 1 LoRA Training (General Simplification) ---




Step,Training Loss,Validation Loss
200,1.5113,1.399279
400,1.4561,1.356284
600,1.4051,1.334367
800,1.4232,1.319316
1000,1.3872,1.310298
1200,1.3699,1.305725
1400,1.4,1.29883
1600,1.3266,1.29728
1800,1.3499,1.291985
2000,1.3428,1.290001





--- Stage 1 Training Complete! ---


In [9]:
# --- Section 5: Checkpointing and Cleanup ---

CHECKPOINT_PATH = f"./{OUTPUT_DIR}/final_lora_adapters"

# 1. Save LoRA Adapters
# Function: model.save_pretrained (PEFT/Hugging Face transformers)
# CRUCIAL: PEFT's save_pretrained only saves the small adapter weights (MBs), not the large base model.
lora_model.save_pretrained(CHECKPOINT_PATH) # Source 3.3: save_pretrained (PEFT)

# 2. Save Tokenizer and Config (needed for loading later)
tokenizer.save_pretrained(CHECKPOINT_PATH) 

print("\n------------------------------------------------------------")
print("✅ STAGE 1 SUCCESSFUL: CHECKPOINTING COMPLETE")
print(f"LoRA Adapter weights saved to: {CHECKPOINT_PATH}")
print("You now need to download the contents of this folder and upload them to your Stage 3 instance.")
print("------------------------------------------------------------")

# 3. Cleanup GPU Memory
# Function: torch.cuda.empty_cache (PyTorch)
if torch.cuda.is_available():
    torch.cuda.empty_cache()


------------------------------------------------------------
✅ STAGE 1 SUCCESSFUL: CHECKPOINTING COMPLETE
LoRA Adapter weights saved to: ./flan-t5-lora-wikisimplification-stage1-small/final_lora_adapters
You now need to download the contents of this folder and upload them to your Stage 3 instance.
------------------------------------------------------------


In [10]:
import os
import shutil

# --- Configuration ---
# The name of the folder containing the checkpoints
FOLDER_TO_ZIP = "/kaggle/working/flan-t5-lora-wikisimplification-stage1" 
# The name for the resulting zip file
OUTPUT_ZIP_FILE = "flan-t5-lora-stage1-results.zip"

print(f"Starting compression of '{FOLDER_TO_ZIP}'...")

# 1. Ensure the folder exists before attempting to zip
if not os.path.isdir(FOLDER_TO_ZIP):
    print(f"Error: Directory '{FOLDER_TO_ZIP}' not found.")
else:
    # 2. Use shutil.make_archive for a robust, cross-platform zip creation
    # It creates a zip file named 'OUTPUT_ZIP_FILE' (without the .zip extension in the base_name)
    # containing the contents of 'FOLDER_TO_ZIP'.
    try:
        shutil.make_archive(
            base_name=OUTPUT_ZIP_FILE.replace(".zip", ""), 
            format='zip', 
            root_dir=os.path.dirname(FOLDER_TO_ZIP), 
            # The base directory to start from. We zip the entire folder.
            base_dir=os.path.basename(FOLDER_TO_ZIP)
        )
        print(f"Compression complete. Zip file created: {OUTPUT_ZIP_FILE}")
        print("\n--- Next Steps ---")
        print("1. Click the 'Refresh' button in your Kaggle file pane (the file browser usually on the right).")
        print(f"2. Locate '{OUTPUT_ZIP_FILE}' in the output section.")
        print("3. Click the three dots (...) next to the zip file and select 'Download'.")
    except Exception as e:
        print(f"An error occurred during zipping: {e}")

Starting compression of '/kaggle/working/flan-t5-lora-wikisimplification-stage1'...
Error: Directory '/kaggle/working/flan-t5-lora-wikisimplification-stage1' not found.
