In [None]:
# ===============================
# 🔧 Setup: Install Packages
# ===============================
!pip install -q \
  "transformers>=4.41,<5" \
  "datasets==2.19.1" \
  "peft==0.10.0" \
  "accelerate>=0.34.2" \
  "bitsandbytes>=0.43.3" \
  "scikit-learn" \
  "openpyxl" \
  "pandas"

In [None]:
import torch, sys, subprocess

# Install the Triton release that the installed PyTorch minor version is built
# against. `mm` is "<major>.<minor>" (local suffixes such as "+cu121" sit in the
# third dotted field and are therefore dropped).
mm = ".".join(torch.__version__.split(".")[:2])

# torch minor → matching Triton pin. Torch 2.5 pins Triton 3.1.0; 3.2.0 belongs
# to torch 2.6.
triton_by_torch = {"2.6":"3.2.0","2.5":"3.1.0","2.4":"3.0.0","2.3":"2.3.1","2.2":"2.2.0"}
target = triton_by_torch.get(mm)

if target is None:
    # Unknown torch release: forcing an arbitrary Triton version could replace the
    # one torch was installed with, so leave the environment untouched.
    print(f"Torch {torch.__version__} → no known Triton pin, keeping the installed Triton")
else:
    print(f"Torch {torch.__version__} → Installing Triton {target}")
    subprocess.check_call([sys.executable, "-m", "pip", "install", f"triton=={target}"])

In [None]:
# Mount Google Drive at /content/drive; the dataset and all run outputs used by
# later cells live under "/content/drive/My Drive/...".
from google.colab import drive
drive.mount('/content/drive')

import os, random, torch, pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig,
    TrainingArguments, Trainer, set_seed
)
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from huggingface_hub import login

# --------------- Hugging Face token ---------------
# SECURITY: an access token must never be hard-coded in a notebook. Any token that
# was previously committed here should be revoked in the Hugging Face settings.
# The token is taken from the HF_TOKEN environment variable when present,
# otherwise it is requested interactively (input is not echoed or stored in the
# notebook).
from getpass import getpass

hf_token = os.environ.get("HF_TOKEN", "").strip()
if not hf_token:
    hf_token = getpass("Hugging Face access token: ").strip()
if not hf_token:
    raise RuntimeError("No Hugging Face token provided; set HF_TOKEN or enter it when prompted.")

# Keep the variable exported for the rest of the session, as before.
os.environ["HF_TOKEN"] = hf_token
login(hf_token)

# --------------- Reproducibility ---------------
set_seed(42)

In [None]:
# ============================================
# Build chat records for SFT
#         • system    - task + constraints + example
#         • user      - cue word only
#         • assistant - gold 3-word answer
#         • rendered with tokenizer.apply_chat_template(...)
# ============================================
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from textwrap import dedent
import pandas as pd
import os

# ---------- Model (for chat template rendering) ----------
# Only the tokenizer is loaded in this cell; it is used by build_chat() to render
# conversations through the model's chat template.
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer  = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token
tokenizer.padding_side = "right"     # pad at the end of each sequence (training)
tokenizer.truncation_side = "left"   # if truncated, drop the start so the assistant answer stays in-frame

# ---------- system prompt ----------
# Fixed instruction block used as the "system" message of every training
# conversation (see build_chat). The text is part of the training data, so any
# edit changes what the model is fine-tuned on.
SYSTEM_PROMPT = dedent("""\
Task:
 - You will be provided with an input word: write the first 3 words you associate to it separated by a comma.
 - No additional output text is allowed.

Constraints:
 - no carriage return characters are allowed in the answers.
 - answers should be as short as possible.

Example:
 Input: sea
 Output: water, beach, sun""").strip()

def build_chat(cue: str, responses: list[str]) -> str:
    """Render one (cue, responses) pair as a chat-template string.

    The conversation has three turns: the module-level SYSTEM_PROMPT as system
    message, the cue as user message, and the responses joined with ", " as the
    assistant message.

    Args:
        cue: the input word shown to the model.
        responses: the gold associations, in order.

    Returns:
        Whatever ``tokenizer.apply_chat_template`` returns for the conversation
        with ``tokenize=False`` and no generation prompt appended.
    """
    turns = (
        ("system", SYSTEM_PROMPT),
        ("user", cue),
        ("assistant", ", ".join(responses)),  # comma + single space separator
    )
    conversation = [{"role": role, "content": content} for role, content in turns]
    # The assistant turn is included, so no generation prompt is requested.
    return tokenizer.apply_chat_template(
        conversation, add_generation_prompt=False, tokenize=False
    )

def df_to_records(df: pd.DataFrame):
    """Convert an association dataframe into chat-formatted SFT records.

    Args:
        df: frame with a ``cue`` column and response columns ``R1``, ``R2``, ``R3``.

    Returns:
        A list of ``{"cue": <cue>, "text": <rendered chat>}`` dicts, one per row
        that has at least one usable response; ``text`` comes from build_chat().

    Missing response cells (NaN/None) and blank cells are dropped instead of being
    stringified: ``str(float("nan"))`` is the word "nan", which would otherwise be
    written into the assistant answer as if it were a real association. Rows with
    no usable response are skipped entirely.

    NOTE(review): pandas may read a cell containing the literal text "nan"/"NA" as
    missing; if such words are legitimate responses, load the sheets with
    ``keep_default_na=False`` — confirm against the data.
    """
    recs = []
    for _, row in df.iterrows():
        cue = str(row["cue"]).strip()
        resp = []
        for col in ("R1", "R2", "R3"):
            value = row[col]
            if pd.isna(value):
                continue  # missing cell, not a real answer
            word = str(value).strip()
            if word:
                resp.append(word)
        if not resp:
            continue  # nothing to train on for this cue
        recs.append(dict(cue=cue, text=build_chat(cue, resp)))
    return recs

# ---------- Load data ----------
# The three Excel splits are read from the processed-SWOW folder on Drive.
BASE_PATH = r"/content/drive/My Drive/associations-ANLP"
DATA_DIR = os.path.join(BASE_PATH, r"data/final_processed_SWOW_data")
train_df, val_df, test_df = (
    pd.read_excel(os.path.join(DATA_DIR, fname))
    for fname in ("train.xlsx", "val.xlsx", "test.xlsx")
)

# ---------- Generate records ----------
# One {cue, text} record per dataframe row, via df_to_records.
train_recs, val_recs, test_recs = map(df_to_records, (train_df, val_df, test_df))

# ---------- Wrap in DatasetDict ----------
data = DatasetDict({
    split: Dataset.from_list(recs)
    for split, recs in (
        ("train", train_recs),
        ("validation", val_recs),
        ("test", test_recs),
    )
})

# Split sizes, then the first rendered training example for a visual check.
print({split: len(ds) for split, ds in data.items()})
display(data["train"][0])


In [None]:
# =========================================================
# Tokenise pre-rendered chat text (already templated)
#   • input: columns {cue, text} where 'text' is chat-template rendered
#   • output: input_ids & attention_mask (keep "text" for debugging/export)
# =========================================================
from transformers import AutoTokenizer
import pandas as pd, os

# Same model name and tokenizer settings as in the record-building cell; the
# tokenizer is loaded again here and rebinds the `tokenizer` global.
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer  = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token   # reuse EOS as the padding token
tokenizer.padding_side = "right"     # pad at the end of each sequence (training)
tokenizer.truncation_side = "left"   # if truncated, drop the start so the assistant answer stays in-frame


# The 'text' column was rendered with apply_chat_template by build_chat() in the
# record-building cell above; fail early if that cell has not been run.
assert "text" in data["train"].column_names, "Expected a 'text' column with rendered chat."

def tok_fn(batch):
    """Tokenise a batch of pre-rendered chat strings.

    Args:
        batch: mapping with a ``"text"`` entry holding chat-template-rendered strings.

    Returns:
        The tokenizer output for those strings (no padding, truncated to 1024 tokens).

    ``add_special_tokens=False`` is required because the text was produced by
    ``apply_chat_template``, which already writes the special tokens (including
    the leading BOS) into the string; letting the tokenizer add them again would
    prepend a second BOS to every training example.
    """
    return tokenizer(
        batch["text"],
        padding=False,
        truncation=True,
        max_length=1024,
        add_special_tokens=False,
    )

# Tokenise every split in batches, then drop every column except the model
# inputs and the rendered 'text' (kept for inspection/export).
_mapped = data.map(tok_fn, batched=True)
_keep_columns = {"input_ids", "attention_mask", "text"}
_drop_columns = [
    name for name in _mapped["train"].column_names if name not in _keep_columns
]
tokenised = _mapped.remove_columns(_drop_columns)

# First 20 token ids of the first training example.
print("Example token IDs:", tokenised["train"][0]["input_ids"][:20])

In [None]:
# ============================================
# LoRA-prepared 4-bit quantised model (q,k,v,o; r16; alpha 32; dropout 0.10)
# ============================================
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import torch, os

# ---- Hyperparameters ----
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.10
NUM_EPOCHS = 1
BATCH_SIZE = 16
GRAD_ACC_STEPS = 4
LR = 1e-4

# ---- LoRA target modules (q,k,v,o) ----
TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj"]

# ---- Count unique cues for run name (requires 'data' from earlier cell) ----
# Best effort: if the counts cannot be computed, -1 is recorded for both.
try:
    N_TRAIN_CUES = len(set(data["train"]["cue"]))
    N_VAL_CUES = len(set(data["validation"]["cue"]))
except Exception:
    N_TRAIN_CUES = N_VAL_CUES = -1

# Build a compact target tag for the run name: each module name is shortened via
# _letter_map (unknown names are kept verbatim) and q/k/v/o are put in that
# order, with anything else after them.
_letter_map = {
    "q_proj": "q",
    "k_proj": "k",
    "v_proj": "v",
    "o_proj": "o",
    "up_proj": "up",
    "down_proj": "dn",
    "gate_proj": "gt",
}
_order = {"q": 0, "k": 1, "v": 2, "o": 3}
_target_letters = [_letter_map.get(module, module) for module in TARGET_MODULES]
_target_letters.sort(key=lambda letter: _order.get(letter, 99))
TARGET_TAG = "tgt_" + "".join(_target_letters)

# The run name doubles as the output folder name, so it encodes every setting above.
RUN_NAME = (
    f"full_llama3_8b_system_prompt_lora_SFT_SWOW_{TARGET_TAG}"
    f"_tr{N_TRAIN_CUES}c_val{N_VAL_CUES}c"
    f"_r{LORA_R}_a{LORA_ALPHA}_do{str(LORA_DROPOUT).replace('.','p')}"
    f"_lr{LR:g}_bs{BATCH_SIZE}_ga{GRAD_ACC_STEPS}"
)

# Run folder on Drive, named after RUN_NAME. It is used as the Trainer output_dir
# and as the parent of the saved adapter / merged model in later cells.
BASE_PATH = r"/content/drive/My Drive/associations-ANLP"
OUTPUT_DIR = os.path.join(BASE_PATH, RUN_NAME)
os.makedirs(OUTPUT_DIR, exist_ok=True)  # no error if the folder already exists (resumed run)
print("Output dir:", OUTPUT_DIR)

# ---- 4-bit quant loader (bf16 compute on A100) ----
# NF4 4-bit weights with double quantisation; matmuls are computed in bfloat16.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the base model with the quantisation config, then run PEFT's k-bit
# preparation step before any adapter is attached.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, device_map="auto", quantization_config=bnb_cfg
)
base_model = prepare_model_for_kbit_training(base_model)

# LoRA adapter settings come from the hyperparameter constants defined above.
lora_cfg = LoraConfig(
    r=LORA_R, lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=TARGET_MODULES,
    bias="none", task_type="CAUSAL_LM",
)
model = get_peft_model(base_model, lora_cfg)

# (optional, helps memory)
model.gradient_checkpointing_enable()
# Best effort: not every model object exposes enable_input_require_grads, so a
# missing method is tolerated rather than treated as an error.
try:
    model.enable_input_require_grads()
except AttributeError:
    pass

# Report how many parameters are trainable (the LoRA weights) versus total.
model.print_trainable_parameters()

# ---- SFT collator with LABEL MASKING ----
IGNORE_INDEX = -100

class MaskedSFTCollator:
    """Pad a batch and build labels that only cover the assistant answer.

    Labels start as a copy of ``input_ids``; padding positions and every token up
    to and including the assistant header are replaced by -100. Rows in which the
    header cannot be found are masked completely.
    """

    def __init__(self, tok):
        """Store the tokenizer and the header string that marks the assistant turn."""
        self.tok = tok
        self.assistant_header = "<|start_header_id|>assistant<|end_header_id|>\n"

    def __call__(self, features):
        """Collate ``features`` (dicts) into a padded batch with a ``labels`` entry.

        Only ``input_ids`` and ``attention_mask`` are forwarded to the tokenizer's
        ``pad``; any other keys (e.g. the raw text) are ignored.
        """
        model_keys = ("input_ids", "attention_mask")
        slim = []
        for feat in features:
            slim.append({key: val for key, val in feat.items() if key in model_keys})

        batch = self.tok.pad(slim, padding="longest", return_tensors="pt")
        token_ids = batch["input_ids"]
        labels = token_ids.clone()

        # Padding (attention_mask == 0) never contributes to the loss.
        labels.masked_fill_(batch["attention_mask"] == 0, -100)

        header = self.assistant_header
        for row, ids in enumerate(token_ids):
            rendered = self.tok.decode(ids, skip_special_tokens=False)
            start = rendered.find(header)
            if start < 0:
                # No assistant turn found: ignore the whole row.
                labels[row].fill_(-100)
            else:
                # Re-tokenise the text up to the end of the header to learn how
                # many leading tokens belong to the prompt, then mask them.
                prompt_text = rendered[: start + len(header)]
                n_prompt = len(self.tok(prompt_text, add_special_tokens=False)["input_ids"])
                labels[row, :n_prompt] = -100

        batch["labels"] = labels
        return batch

collator = MaskedSFTCollator(tokenizer)

# Quick sanity check once
# Collate the first BATCH_SIZE training rows and verify that at least one label
# is left unmasked; only this first batch is inspected.
from torch.utils.data import DataLoader
dl = DataLoader(tokenised["train"], batch_size=BATCH_SIZE, collate_fn=collator)
batch = next(iter(dl))
assert (batch["labels"] != -100).any(), "All labels masked → assistant header not found"
print(f"Masking sanity check passed | RUN_NAME={RUN_NAME}")


In [None]:
# ============================================
# TrainingArguments & Trainer
# ============================================
from transformers import TrainingArguments, Trainer
from transformers.trainer_callback import EarlyStoppingCallback  # optional

# The TrainingArguments are grouped by concern and merged in a single call.

# Batching / data loading. remove_unused_columns=False keeps the extra 'text'
# column in the dataset; the collator is what filters it out.
_batching_kwargs = dict(
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACC_STEPS,
    remove_unused_columns=False,
    group_by_length=True,
    dataloader_num_workers=2,
)

# Optimiser, regularisation and schedule (QLoRA-friendly paged 8-bit AdamW).
_optim_kwargs = dict(
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LR,
    optim="paged_adamw_8bit",
    weight_decay=0.05,
    max_grad_norm=1.0,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
)

# Precision and memory. Gradient checkpointing is also enabled on the model itself.
_precision_kwargs = dict(
    bf16=True,
    fp16=False,
    gradient_checkpointing=True,
)

# Logging / eval / save cadence; the best checkpoint by eval_loss is reloaded at the end.
_cadence_kwargs = dict(
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=800,                 # was 400
    save_strategy="steps",
    save_steps=800,                 # was 400
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none",
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    **_batching_kwargs,
    **_optim_kwargs,
    **_precision_kwargs,
    **_cadence_kwargs,
)

# EarlyStoppingCallback(early_stopping_patience=2) can be passed via `callbacks`
# when training for more than one epoch; it is intentionally not enabled here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenised["train"],
    eval_dataset=tokenised["validation"],
    data_collator=collator,
)

In [None]:
# ============================================
# Train (resume aware)
# ============================================
from transformers.trainer_utils import get_last_checkpoint
import time, numpy as _np, torch.serialization as _ser, inspect, numpy as np

# Allow-list the NumPy objects that torch's restricted (weights_only) unpickler
# has to reconstruct; presumably needed for the RNG/optimizer state files read
# when resuming from a checkpoint — confirm.
if hasattr(_ser, "_user_allowed_globals"):
    _ser._user_allowed_globals.clear()

# torch.serialization.add_safe_globals only exists in torch >= 2.4 and the
# numpy.dtypes module only in numpy >= 1.25. Guard both so this cell does not
# crash with AttributeError on the older torch versions this notebook otherwise
# supports.
if hasattr(_ser, "add_safe_globals"):
    _dtypes_module = getattr(_np, "dtypes", None)
    _dtype_classes = [] if _dtypes_module is None else [
        cls for _, cls in inspect.getmembers(_dtypes_module, inspect.isclass)
        if issubclass(cls, _np.dtype)
    ]
    _ser.add_safe_globals([
        _np.core.multiarray._reconstruct, _np.ndarray, _np.dtype,
        *_dtype_classes,
    ])

# Resume from the newest checkpoint in OUTPUT_DIR when there is one; otherwise
# train from scratch. Wall-clock time is reported in minutes.
start = time.time()
ckpt = get_last_checkpoint(OUTPUT_DIR)
if ckpt is not None:
    print(f"Resuming from {ckpt}")
    trainer.train(resume_from_checkpoint=ckpt)
else:
    print("No checkpoint – starting fresh")
    trainer.train()

print(f"Elapsed (min): {(time.time()-start)/60:.1f}")

In [None]:
# ============================================
# Save LoRA adapter + merged model
# ============================================
# Two sub-folders of the run directory: one for the raw adapter, one for the
# merged model. Each also receives a copy of the tokenizer files.
adapter_path, merged_path = (
    os.path.join(OUTPUT_DIR, leaf) for leaf in ("lora_adapter", "merged_model")
)
for _save_dir in (adapter_path, merged_path):
    os.makedirs(_save_dir, exist_ok=True)

print(" Saving raw LoRA adapter …")
trainer.model.save_pretrained(adapter_path)
tokenizer.save_pretrained(adapter_path)

# NOTE(review): the base model was loaded in 4-bit, so this merge presumably folds
# the adapter into quantised weights and the saved "merged" model may not be a
# full-precision checkpoint. Confirm, and consider merging into a bf16 reload of
# the base model if a full-precision export is needed.
print(" Merging LoRA weights into base model …")
with torch.no_grad():
    merged_lm = trainer.model.merge_and_unload()

print(" Saving merged model …")
merged_lm.save_pretrained(merged_path, safe_serialization=True)
tokenizer.save_pretrained(merged_path)

print(" Finished!\n • Adapter →", adapter_path, "\n • Merged model →", merged_path)

In [None]:
# Disconnect the runtime
# Runs last so the Colab runtime is released once everything above has been saved
# (presumably this ends the session and discards in-memory state — only files
# written to Drive persist).
from google.colab import runtime
runtime.unassign()