In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [1]:
!pip -q install "transformers>=4.44.0" datasets bitsandbytes accelerate peft safetensors

import os, time, json, math, random
from dataclasses import dataclass
from typing import Dict, List, Any

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments,
    BitsAndBytesConfig, TrainerCallback
)
from peft import (
    LoraConfig, get_peft_model, prepare_model_for_kbit_training,
    PeftModel, TaskType
)

BASE_MODEL = os.getenv("BASE_MODEL", "Qwen/Qwen3-8B")    # alt: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
OUTPUT_DIR = os.getenv("OUTPUT_DIR", "/kaggle/working/polaris-mn-qlora-qwen3-8b")
MAX_LEN     = int(os.getenv("MAX_LEN", "1536"))   # 1536 to stay comfy on T4
LR          = float(os.getenv("LR", "2e-4"))
EPOCHS      = float(os.getenv("EPOCHS", "1"))
BATCH_PER_DEV = int(os.getenv("BATCH_PER_DEV", "1"))
GRAD_ACC    = int(os.getenv("GRAD_ACC", "8"))   
WARMUP      = float(os.getenv("WARMUP", "0.03"))
LORA_R      = int(os.getenv("LORA_R", "16"))
LORA_ALPHA  = int(os.getenv("LORA_ALPHA", "32"))
LORA_DROPOUT= float(os.getenv("LORA_DROPOUT", "0.05"))
SAVE_MERGED = os.getenv("SAVE_MERGED", "0") == "1"  

SYSTEM_PROMPT = "You are POLARIS, a precise, fair news summariser. Be neutral; cite key numbers/dates/actors."

TRAIN_SAMPLES = int(os.getenv("TRAIN_SAMPLES", "4000"))
EVAL_SAMPLES  = int(os.getenv("EVAL_SAMPLES",  "400"))

def load_qlora_base(model_name: str):
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
    )
    tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        low_cpu_mem_usage=True,
    )
    model.config.use_cache = False  
    model = prepare_model_for_kbit_training(model)
    lora_cfg = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=LORA_R, lora_alpha=LORA_ALPHA, lora_dropout=LORA_DROPOUT,
        target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
        bias="none"
    )
    model = get_peft_model(model, lora_cfg)
    return tok, model

def build_chat_strings(document: str, summary: str):
    user = (
        "Summarise the following multi-article news package into a balanced digest. "
        "State agreements and any conflicts; include key numbers/dates/actors.\n\nDOCUMENT:\n"
        + document.strip()
    )
    msgs_full = [
        {"role":"system","content":SYSTEM_PROMPT},
        {"role":"user","content":user},
        {"role":"assistant","content":summary.strip()}
    ]
    msgs_prompt = [
        {"role":"system","content":SYSTEM_PROMPT},
        {"role":"user","content":user},
        {"role":"assistant","content":""}
    ]
    return msgs_full, msgs_prompt

def tokenize_example(example, tokenizer: AutoTokenizer, max_len: int):
    msgs_full, msgs_prompt = build_chat_strings(example["document"], example["summary"])
    full_text   = tokenizer.apply_chat_template(msgs_full,   tokenize=False, add_generation_prompt=False)
    prompt_text = tokenizer.apply_chat_template(msgs_prompt, tokenize=False, add_generation_prompt=False)

    full_ids   = tokenizer(full_text,   truncation=True, max_length=max_len)
    prompt_ids = tokenizer(prompt_text, truncation=True, max_length=max_len)

    input_ids = full_ids["input_ids"]
    labels    = input_ids.copy()

    prompt_len = len(prompt_ids["input_ids"])
    labels[:prompt_len] = [-100] * prompt_len  # mask prompt (only learn the answer)

    return {"input_ids": input_ids, "labels": labels}

@dataclass
class DataCollatorForCausalSupervised:
    tokenizer: AutoTokenizer
    pad_to_multiple_of: int = 8
    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        batch_input = self.tokenizer.pad(
            {"input_ids": [f["input_ids"] for f in features]},
            padding=True, return_tensors="pt",
            pad_to_multiple_of=self.pad_to_multiple_of,
        )
        max_len = batch_input["input_ids"].size(1)
        batch_labels = []
        for f in features:
            l = f["labels"]
            if len(l) < max_len:
                l = l + ([-100] * (max_len - len(l)))
            batch_labels.append(l[:max_len])
        batch_input["labels"] = torch.tensor(batch_labels, dtype=torch.long)
        return batch_input

class EtaLogger(TrainerCallback):
    def __init__(self, log_path="/kaggle/working/metrics.jsonl"):
        self.start = None
        self.max_steps = None
        self.log_path = log_path
    def on_train_begin(self, args, state, control, **kwargs):
        self.start = time.time()
    def on_log(self, args, state, control, logs=None, **kwargs):
        now = time.time()
        steps_done = state.global_step or 0
        if steps_done == 0: 
            return
        if state.max_steps and state.max_steps > 0:
            self.max_steps = state.max_steps
        elapsed = now - self.start
        rate = elapsed / max(1, steps_done)     # sec/step
        if self.max_steps:
            remaining = rate * (self.max_steps - steps_done)
        else:
            # fallback: estimate from logs.get("num_steps_per_epoch", …) if present
            remaining = float("nan")
        def hms(t):
            if not (t==t) or math.isinf(t): return "?"
            m, s = divmod(int(t), 60)
            h, m = divmod(m, 60)
            return f"{h:02d}:{m:02d}:{s:02d}"
        msg = f"[ETA] step {steps_done}/{self.max_steps or '?'} | {1.0/rate:.2f} steps/s | ETA {hms(remaining)} | elapsed {hms(elapsed)}"
        print(msg)
        # also append to jsonl
        rec = {"time": now, "step": steps_done, "rate_steps_per_s": (1.0/rate), "eta_s": None if not self.max_steps else remaining}
        try:
            with open(self.log_path, "a") as f:
                f.write(json.dumps(rec) + "\n")
        except:
            pass

print("[info] Loading Multi-News...")
ds_train = load_dataset("alexfabbri/multi_news", split="train", revision="refs/convert/parquet")
ds_eval  = load_dataset("alexfabbri/multi_news", split="validation", revision="refs/convert/parquet")

def ok(e): return bool(e.get("document","").strip()) and bool(e.get("summary","").strip())
ds_train = ds_train.filter(ok)
ds_eval  = ds_eval.filter(ok)

if TRAIN_SAMPLES > 0 and TRAIN_SAMPLES < len(ds_train):
    ds_train = ds_train.select(range(TRAIN_SAMPLES))
if EVAL_SAMPLES > 0 and EVAL_SAMPLES < len(ds_eval):
    ds_eval = ds_eval.select(range(EVAL_SAMPLES))

print("[info] Examples:", len(ds_train), "train /", len(ds_eval), "eval")

tok, model = load_qlora_base(BASE_MODEL)

def map_fn(example): return tokenize_example(example, tok, MAX_LEN)
ds_train = ds_train.map(map_fn, remove_columns=ds_train.column_names, num_proc=2)
ds_eval  = ds_eval.map(map_fn,  remove_columns=ds_eval.column_names,  num_proc=2)

collator = DataCollatorForCausalSupervised(tok)

total_train_tokens = sum(len(x["input_ids"]) for x in ds_train)
print(f"[info] tokenized train examples={len(ds_train)}  eval examples={len(ds_eval)}  ~train tokens={total_train_tokens:,}")


from inspect import signature, Parameter

bf16_ok = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
base_kwargs = dict(
    output_dir=OUTPUT_DIR,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_PER_DEV,
    per_device_eval_batch_size=max(1, BATCH_PER_DEV),
    gradient_accumulation_steps=GRAD_ACC,
    warmup_ratio=WARMUP,
    lr_scheduler_type="cosine",
    logging_steps=20,
    save_steps=200,
    save_total_limit=2,
    bf16=bf16_ok,
    fp16=(not bf16_ok),
    gradient_checkpointing=True,
    ddp_find_unused_parameters=False,
    report_to="none",
)

sig = signature(TrainingArguments.__init__)
params = sig.parameters

base_kwargs["max_steps"] = 100
base_kwargs["save_steps"] = 50
base_kwargs["logging_steps"] = 10
if "evaluation_strategy" in params:
    base_kwargs["evaluation_strategy"] = "steps"
    base_kwargs["eval_steps"] = 200
elif "eval_strategy" in params:
    base_kwargs["eval_strategy"] = "steps"
    base_kwargs["eval_steps"] = 200

args = TrainingArguments(**base_kwargs)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_train,
    eval_dataset=ds_eval,
    data_collator=collator,
    tokenizer=tok,
    callbacks=[EtaLogger()],
)

trainer.train()
if "evaluation_strategy" not in params and "eval_strategy" not in params:
    print(trainer.evaluate())


trainer.model.save_pretrained(OUTPUT_DIR)  
tok.save_pretrained(OUTPUT_DIR)
print(f"[done] LoRA adapter + tokenizer saved to {OUTPUT_DIR}")

if SAVE_MERGED:
    print("[info] Merging LoRA into base weights...")
    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
        device_map="auto"
    )
    peft_model = PeftModel.from_pretrained(base, OUTPUT_DIR)
    merged = peft_model.merge_and_unload()
    merged.save_pretrained(OUTPUT_DIR + "-merged", safe_serialization=True)
    tok.save_pretrained(OUTPUT_DIR + "-merged")
    print(f"[done] Merged full model saved to {OUTPUT_DIR + '-merged'}")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m102.2 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m75.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

2025-11-08 05:00:46.617440: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762578046.801896      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762578046.853589      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

[info] Loading Multi-News...


default/train/0000.parquet:   0%|          | 0.00/295M [00:00<?, ?B/s]

default/train/0001.parquet:   0%|          | 0.00/28.3M [00:00<?, ?B/s]

default/validation/0000.parquet:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

default/test/0000.parquet:   0%|          | 0.00/40.1M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/44972 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5622 [00:00<?, ? examples/s]

[info] Examples: 4000 train / 400 eval


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]



model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.19G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Map (num_proc=2):   0%|          | 0/4000 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/400 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


[info] tokenized train examples=4000  eval examples=400  ~train tokens=5,612,400


You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


[ETA] step 10/100 | 0.01 steps/s | ETA 02:47:33 | elapsed 00:18:37
[ETA] step 20/100 | 0.01 steps/s | ETA 02:26:53 | elapsed 00:36:43
[ETA] step 30/100 | 0.01 steps/s | ETA 02:08:49 | elapsed 00:55:12
[ETA] step 40/100 | 0.01 steps/s | ETA 01:51:29 | elapsed 01:14:19
[ETA] step 50/100 | 0.01 steps/s | ETA 01:33:21 | elapsed 01:33:21
[ETA] step 60/100 | 0.01 steps/s | ETA 01:14:30 | elapsed 01:51:46
[ETA] step 70/100 | 0.01 steps/s | ETA 00:55:57 | elapsed 02:10:33
[ETA] step 80/100 | 0.01 steps/s | ETA 00:37:14 | elapsed 02:28:58
[ETA] step 90/100 | 0.01 steps/s | ETA 00:18:40 | elapsed 02:48:00
[ETA] step 100/100 | 0.01 steps/s | ETA 00:00:00 | elapsed 03:07:28
[ETA] step 100/100 | 0.01 steps/s | ETA 00:00:00 | elapsed 03:07:29
[done] LoRA adapter + tokenizer saved to /kaggle/working/polaris-mn-qlora-qwen3-8b


In [None]:
def run_in_notebook(mode="multinews", split="validation", limit=3):
    if mode == "single":
        print(run_single_article_demo(
            "Sample Title",
            "Officials said the agreement was reached after weeks of negotiation..."
        ))
    elif mode == "multinews":
        ds = load_multinews(split, limit)
        for i in range(len(ds)):
            out = run_multinews_topic_demo(ds[i], topic_hint=f"Multi-News #{i}")
            print(f"\n=== Multi-News topic {i} ===")
            print(out["digest"])
            if out["gold_summary"]:
                print("\n[REF]", out["gold_summary"])
    else:
        sample_articles = [
            {"title":"Climate pact announced", "text":"Officials said the agreement was reached after weeks..."},
            {"title":"Talks lead to climate deal", "text":"Negotiators confirmed a deal on emissions reductions..."},
            {"title":"Local sports team wins", "text":"The city celebrated as the team secured a victory..."},
        ]
        outs = cluster_and_digest_raw_articles(sample_articles, topic_hint="Daily News")
        for o in outs:
            print(f"\n=== Cluster {o['cluster_id']} ===")
            print("TITLES:", o["titles"])
            print("DIGEST:", o["digest"])


In [None]:
run_in_notebook(mode="multinews", split="validation", limit=2)
