In [None]:
# Run Colab's user authentication flow for this session.
from google.colab import auth
auth.authenticate_user()

# Google Cloud project id that the gcloud CLI is configured with below.
project_id = "aml-final-project-480821"
# IPython's `!` shell escape substitutes the Python variable into `{project_id}`.
!gcloud config set project {project_id}

Updated property [core/project].


In [None]:
# ============================================================
# 1. Install Dependencies
# ============================================================
!pip install -q transformers datasets peft accelerate bitsandbytes

In [None]:
# ============================================================
# 2. Load Tokenizer and Base BART Model
# ============================================================
from transformers import BartTokenizer, BartForConditionalGeneration
# LoraConfig / get_peft_model are not used in this cell; they are imported
# here for the LoRA step that follows.
from peft import LoraConfig, get_peft_model

# Checkpoint identifier shared by the tokenizer and the model.
model_name = "facebook/bart-base"

# Both objects are notebook globals read by the later cells
# (`tokenizer` by the tokenization function, `model` by the LoRA step).
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

In [None]:
# ============================================================
# 3. Apply LoRA to BART (fast lightweight finetuning)
# ============================================================
from peft import PeftModel

lora_config = LoraConfig(
    r=16,                # rank of the LoRA update matrices
    lora_alpha=32,       # LoRA scaling parameter
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "out_proj"]  # Attention layers
)

# The wrapped model is stored back into `model`, so running this cell a second
# time would hand an already wrapped model to get_peft_model again. Wrap once.
if isinstance(model, PeftModel):
    print("LoRA adapters already attached; reload the base model to apply a new config.")
else:
    model = get_peft_model(model, lora_config)

model.print_trainable_parameters()

trainable params: 1,769,472 || all params: 141,189,888 || trainable%: 1.2533


In [None]:
# ============================================================
# 4. Load & Flatten ASSET and Synthetic Datasets
# ============================================================
import json


def _load_json(path):
    """Parse the JSON file at `path`, reading it as UTF-8."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def _flatten_pairs(entries):
    """Expand entries into one training pair per simplification.

    Each entry is read via ``entry.get("original")`` and
    ``entry.get("simplifications")``. An entry contributes pairs only when
    "original" is a non-blank string and "simplifications" is a list; inside
    that list, items that are not non-blank strings are skipped.

    Returns:
        list[dict]: records of the form
        ``{"original": <stripped str>, "simplified": <stripped str>}``.
    """
    pairs = []
    for entry in entries:
        source = entry.get("original")
        candidates = entry.get("simplifications")
        if not isinstance(candidates, list):
            continue
        if not isinstance(source, str) or not source.strip():
            continue
        source = source.strip()
        for candidate in candidates:
            if isinstance(candidate, str) and candidate.strip():
                pairs.append({"original": source, "simplified": candidate.strip()})
    return pairs


# ---------- Load raw datasets ----------
asset_raw = _load_json("asset_train.json")
synthetic_raw = _load_json("synthetic_train.json")

print("Loaded ASSET entries:", len(asset_raw))
print("Loaded synthetic entries:", len(synthetic_raw))


# ---------- Flatten both sources with the same rules ----------
asset_flat = _flatten_pairs(asset_raw)
print("Flattened ASSET size:", len(asset_flat))

synthetic_flat = _flatten_pairs(synthetic_raw)
print("Flattened synthetic size:", len(synthetic_flat))


# ---------- Combine & save ----------
combined = asset_flat + synthetic_flat
print("Total combined examples:", len(combined))

with open("combined_train.json", "w", encoding="utf-8") as f:
    json.dump(combined, f, indent=2)

Loaded ASSET entries: 2000
Loaded synthetic entries: 25330
Flattened ASSET size: 20000
Flattened synthetic size: 25330
Total combined examples: 45330


In [None]:
# ============================================================
# 5. Load dataset using HuggingFace Datasets
# ============================================================
from datasets import load_dataset

# Load combined_train.json (the file written by the flattening step) as the
# "train" split.
dataset = load_dataset("json", data_files={"train": "combined_train.json"})
# Bare expression so the notebook displays the DatasetDict summary.
dataset

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['original', 'simplified'],
        num_rows: 45330
    })
})

In [None]:
# ============================================================
# 6. Tokenization Function
# ============================================================
def preprocess(batch):
    """Tokenize source and target sentences and build loss-ready labels.

    Args:
        batch: mapping with "original" (model inputs) and "simplified"
            (targets). With ``map(..., batched=True)`` each value is a list
            of strings; a single string per key is also handled.

    Returns:
        The tokenizer output for "original" with an extra "labels" entry:
        the token ids of "simplified" where every padding id is replaced by
        -100, the label value that the loss skips.
    """
    inputs = tokenizer(
        batch["original"],
        max_length=64,
        # Rows are padded to a fixed length here because the trainer in this
        # notebook is created without a collator that pads batches.
        padding="max_length",
        truncation=True
    )

    labels = tokenizer(
        batch["simplified"],
        max_length=64,
        padding="max_length",
        truncation=True
    )

    # Without this masking the loss is also computed on the padding positions
    # of every target, which make up most of a 64-token row for short sentences.
    pad_id = tokenizer.pad_token_id
    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], list):
        # Batched call: one list of ids per example.
        masked = [
            [-100 if tok == pad_id else tok for tok in row]
            for row in label_ids
        ]
    else:
        # Single example: a flat list of ids.
        masked = [-100 if tok == pad_id else tok for tok in label_ids]

    inputs["labels"] = masked
    return inputs

# Apply tokenization to entire dataset
# batched=True makes `map` call `preprocess` on batches of rows, so each
# column reaches the function as a list of values.
tokenized = dataset.map(preprocess, batched=True)
# Bare expression so the notebook displays the resulting DatasetDict.
tokenized

Map:   0%|          | 0/45330 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['original', 'simplified', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 45330
    })
})

In [None]:
# ============================================================
# 7. Training Setup (Seq2SeqTrainer)
# ============================================================
import torch
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    output_dir="bart-lora-asset",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=2e-4,
    num_train_epochs=2,
    # NOTE(review): no eval dataset is passed to the trainer below, so the
    # eval batch size and predict_with_generate look unused here; confirm.
    predict_with_generate=True,
    logging_steps=50,
    save_strategy="epoch",
    # Mixed precision needs a CUDA device; fall back to full precision when the
    # runtime has no GPU instead of failing while building the arguments.
    fp16=torch.cuda.is_available(),
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
)

In [None]:
# ============================================================
# 8. Train LoRA-BART on ASSET + synthetic data
# ============================================================
# Runs training with the settings in `training_args`. The train split comes
# from combined_train.json, which holds the ASSET pairs plus the synthetic pairs.
trainer.train()

Step,Training Loss
50,9.3626
100,4.9613
150,4.2333
200,4.0845
250,4.0761
300,4.0857
350,4.0375
400,3.9398
450,4.0233
500,3.9197


TrainOutput(global_step=22666, training_loss=3.770808063979331, metrics={'train_runtime': 1979.5281, 'train_samples_per_second': 45.799, 'train_steps_per_second': 11.45, 'total_flos': 3516520613806080.0, 'train_loss': 3.770808063979331, 'epoch': 2.0})

In [None]:
# ============================================================
# 9. Save LoRA Adapter
# ============================================================
# `model` is the object returned by get_peft_model earlier in the notebook.
# NOTE(review): presumably this writes only the adapter weights and config,
# not a full BART checkpoint; confirm against the PEFT documentation.
model.save_pretrained("bart-asset-lora")
# Keep the tokenizer files in the same directory as the adapter.
tokenizer.save_pretrained("bart-asset-lora")
print("Model saved!")

Model saved!


In [None]:
import os
import shutil

from google.colab import drive

drive.mount('/content/drive')

# Local output directory and its destination inside the mounted Drive.
local_dir = "bart-asset-lora"
save_path = '/content/drive/MyDrive/bart-asset-lora'
os.makedirs(save_path, exist_ok=True)

# Save model locally first so the copy below always has current files.
model.save_pretrained(local_dir)
tokenizer.save_pretrained(local_dir)

# Copy folder to Drive. shutil.copytree raises if the copy fails, whereas a
# failing `!cp` does not stop the cell, so the success message below could be
# printed after a failed copy. dirs_exist_ok=True allows the destination
# directory to exist already (it is created above).
shutil.copytree(local_dir, save_path, dirs_exist_ok=True)
print("Model saved to Google Drive!")

Mounted at /content/drive
Model saved to Google Drive!
