In [1]:
# ===============================
# 🔧 Setup: Install Packages
# ===============================
!pip install -q \
  "transformers>=4.41,<5" \
  "datasets==2.19.1" \
  "peft==0.10.0" \
  "accelerate>=0.34.2" \
  "bitsandbytes>=0.43.3" \
  "evaluate>=0.4.2" \
  "rouge_score>=0.1.2" \
  "scikit-learn" \
  "openpyxl" \
  "pandas" \
  "numpy" \
  "sentencepiece" \
  "pyarrow>=15" \
  "XlsxWriter"

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.3/172.3 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.0/172.0 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the follow

In [None]:
import torch, sys, subprocess
mm = ".".join(torch.__version__.split(".")[:2])
triton_by_torch = {"2.5":"3.2.0","2.4":"3.0.0","2.3":"2.3.1","2.2":"2.2.0"}
target = triton_by_torch.get(mm, "3.2.0")
print(f"Torch {torch.__version__} - Installing Triton {target}")
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", f"triton=={target}"])

Torch 2.8.0+cu126 → Installing Triton 3.2.0


0

In [None]:
# ===============================
# Import packages & login
# ===============================
from google.colab import drive
drive.mount('/content/drive')

import os, random, torch, pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig,
    TrainingArguments, Trainer, set_seed
)
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from huggingface_hub import login

# --------------- Hugging Face token ---------------
os.environ["HF_TOKEN"] = "YOUR_TOKEN_HERE"
login(os.environ["HF_TOKEN"])

# --------------- Reproducibility ---------------
set_seed(42)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [None]:
# =========================================================
# Perplexity on HF domains + PG-19 (parquet)
#    • FINET - clear VRAM - BASE
#    • Same (BASE) tokenizer for both - fair PPL
#    • Adaptive batch size (OOM backoff)
#    • Streaming policy:
#         - streaming=True  - PG-19 (large)
#         - streaming=False - WikiText-103 (doc-level), DailyDialog, CNN/DailyMail, ArXiv, PubMed
#    • Skip-on-failure per domain
#    • Per-document Excel sheets (doc_id, tokens, ppl, snippet)
#    • TQDM progress like: FT | pg19_full: 100% 100/100 [43:20<00:00, 19.36s/doc]
# =========================================================
import os, math, gc, re
from datetime import datetime

import torch
import torch.nn.functional as F
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import pandas as pd
from tqdm.auto import tqdm

# ---------- Paths & constants (edit FT path if needed) ----------
BASE_PATH = r"/content/drive/My Drive/associations-ANLP"
BASE_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"
FT_MODEL   = os.path.join(BASE_PATH, r"full_llama3_8b_system_prompt_lora_SFT_SWOW_tgt_qkvo_tr7194c_val899c_r16_a32_do0p1_lr0.0001_bs16_ga4/merged_model")

SAVE_DIR = os.path.join(BASE_PATH, r"data/models_perplexity")
os.makedirs(SAVE_DIR, exist_ok=True)
RUN_TAG = datetime.now().strftime("%Y%m%d_%H%M%S")
SAVE_XLSX = os.path.join(SAVE_DIR, f"bench_ppl_{RUN_TAG}.xlsx")

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
USE_4BIT = (DEVICE == "cuda")

# Context & batching
CTX_LEN   = 2048
STRIDE    = 1024
INIT_BATCH_SIZE = 32
MAX_DOCS_PER_DOMAIN = None  # None = ALL available docs in each split

DOMAINS = [
    "WikiText-103 (doc-level)",
    "ArXiv",
    "PubMed Abstracts",
    "DailyDialog",
    "CNN/DailyMail (articles)",
    "PG-19",
]

# Short labels for progress bars / doc_ids
DOM_LABELS = {
    "WikiText-103 (doc-level)": "wt103",
    "ArXiv": "arxiv",
    "PubMed Abstracts": "pubmed",
    "DailyDialog": "dailydialog",
    "CNN/DailyMail (articles)": "cnndm",
    "PG-19": "pg19_full",
}

# ---------- Tokenizer (BASE for both models) ----------
quant_cfg = BitsAndBytesConfig(load_in_4bit=True) if USE_4BIT else None
tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
if tok.pad_token is None and tok.eos_token is not None:
    tok.pad_token = tok.eos_token
tok.padding_side = "left"

def load_model(path_or_id):
    kwargs = dict(device_map="auto")
    if USE_4BIT:
        kwargs["quantization_config"] = quant_cfg
    m = AutoModelForCausalLM.from_pretrained(path_or_id, **kwargs)
    m.eval()
    return m

# ---------- Sliding-window chunker ----------
def token_windows_for_doc(text, tokenizer, max_len=CTX_LEN, stride=STRIDE):
    ids = tokenizer(text, return_tensors=None, add_special_tokens=False)["input_ids"]
    start = 0
    while start < len(ids):
        end = min(start + max_len, len(ids))
        chunk = ids[start:end]
        tgt_len = min(len(chunk), stride)
        yield (chunk, tgt_len)
        if end == len(ids): break
        start = end - stride

def iter_domain_windows_from_iter(doc_iter, tokenizer):
    for t in doc_iter:
        if not t: continue
        yield from token_windows_for_doc(t, tokenizer)

# ---------- Adaptive-batch perplexity (many windows) ----------
def batched_ppl(model, tokenizer, text_iterable, init_batch_size=INIT_BATCH_SIZE, desc="Scoring"):
    total_nll, total_toks = 0.0, 0
    buf, batch_size = [], init_batch_size
    pbar = tqdm(total=0, unit="win", desc=desc)

    def flush(current_batch):
        nonlocal total_nll, total_toks
        if not current_batch:
            return True
        try:
            input_ids = torch.nn.utils.rnn.pad_sequence(
                [torch.tensor(x[0]) for x in current_batch],
                batch_first=True,
                padding_value=tok.eos_token_id if tok.eos_token_id is not None else 0
            ).to(model.device)
            labels = torch.nn.utils.rnn.pad_sequence(
                [torch.tensor(x[1]) for x in current_batch],
                batch_first=True,
                padding_value=-100
            ).to(model.device)

            with torch.no_grad():
                out = model(input_ids=input_ids, labels=labels)
                n_pos = (labels != -100).sum().item()
                total_nll += out.loss.item() * n_pos
                total_toks += n_pos

            pbar.update(len(current_batch))
            return True
        except RuntimeError as e:
            if "CUDA out of memory" in str(e):
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                return False
            raise

    def process_batch():
        nonlocal buf, batch_size
        while buf:
            current = buf if len(buf) < batch_size else buf[:batch_size]
            success = flush(current)
            if success:
                buf = buf[len(current):]
            else:
                batch_size = max(1, batch_size // 2)
                pbar.write(f"⚠️ OOM - reducing batch_size to {batch_size}")
                if batch_size == 1:
                    flush(buf[:1])
                    buf = buf[1:]

    for chunk_ids, tgt_len in text_iterable:
        if tgt_len <= 0:
            continue
        labels = [-100] * (len(chunk_ids) - tgt_len) + list(chunk_ids[-tgt_len:])
        buf.append((chunk_ids, labels))
        if len(buf) >= batch_size:
            process_batch()
    process_batch()
    pbar.close()

    ppl = math.exp(total_nll / max(1, total_toks)) if total_toks else float("nan")
    return ppl, total_toks

# ---------- Per-document PPL (quiet) ----------
def ppl_for_one_doc(model, tokenizer, text, init_batch_size=INIT_BATCH_SIZE):
    total_nll, total_toks = 0.0, 0
    buf, batch_size = [], init_batch_size

    def flush(current_batch):
        nonlocal total_nll, total_toks
        if not current_batch:
            return True
        try:
            input_ids = torch.nn.utils.rnn.pad_sequence(
                [torch.tensor(x[0]) for x in current_batch],
                batch_first=True,
                padding_value=tok.eos_token_id if tok.eos_token_id is not None else 0
            ).to(model.device)
            labels = torch.nn.utils.rnn.pad_sequence(
                [torch.tensor(x[1]) for x in current_batch],
                batch_first=True,
                padding_value=-100
            ).to(model.device)
            with torch.no_grad():
                out = model(input_ids=input_ids, labels=labels)
                n_pos = (labels != -100).sum().item()
                total_nll += out.loss.item() * n_pos
                total_toks += n_pos
            return True
        except RuntimeError as e:
            if "CUDA out of memory" in str(e):
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                return False
            raise

    for chunk_ids, tgt_len in token_windows_for_doc(text, tokenizer):
        if tgt_len <= 0:
            continue
        labels = [-100] * (len(chunk_ids) - tgt_len) + list(chunk_ids[-tgt_len:])
        buf.append((chunk_ids, labels))
        if len(buf) >= batch_size:
            if not flush(buf[:batch_size]):
                batch_size = max(1, batch_size // 2)
            else:
                buf = buf[batch_size:]
    while buf:
        if not flush(buf[:batch_size]):
            batch_size = max(1, batch_size // 2)
        else:
            buf = buf[batch_size:]

    ppl = math.exp(total_nll / max(1, total_toks)) if total_toks else float("nan")
    return ppl, total_toks

# ---------- Domain generators (streaming policy applied) ----------
def make_domain_generators(max_docs=None):
    def take_limit(gen, n):
        if n is None:
            for x in gen: yield x
        else:
            c = 0
            for x in gen:
                if c >= n: break
                yield x
                c += 1

    gens = {}

    # WikiText-103 (doc-level) — TEST, page-level text - streaming=False
    def wt103_gen():
        ds = load_dataset("EleutherAI/wikitext_document_level", "wikitext-103-v1",
                          split="test", streaming=False)
        def it():
            count = 0
            for row in ds:
                # IMPORTANT: field is "page" (not "text")
                txt = row.get("page") or row.get("text")
                if txt:
                    yield txt
                    count += 1
                    if max_docs is not None and count >= max_docs: break
        return it()
    gens["WikiText-103 (doc-level)"] = wt103_gen

    # ArXiv — scientific_papers/arxiv (test-validation abstracts) - streaming=False
    def arxiv_gen():
        try:
            ds = load_dataset("scientific_papers", "arxiv", split="test",
                              streaming=False, trust_remote_code=True)
        except Exception:
            ds = load_dataset("scientific_papers", "arxiv", split="validation",
                              streaming=False, trust_remote_code=True)
        def it():
            count = 0
            for row in ds:
                a = row.get("abstract")
                if a:
                    yield a
                    count += 1
                    if max_docs is not None and count >= max_docs: break
        return it()
    gens["ArXiv"] = arxiv_gen

    # PubMed — ccdv/pubmed-summarization (document/section; test/validation) - streaming=False
    def pubmed_gen():
        for cfg in ("document", "section"):
            for split in ("test", "validation"):
                try:
                    ds = load_dataset("ccdv/pubmed-summarization", cfg, split=split, streaming=False)
                    field = "abstract" if "abstract" in ds.features else ("target" if "target" in ds.features else None)
                    if field:
                        def it():
                            count = 0
                            for row in ds:
                                t = row.get(field)
                                if t:
                                    yield t
                                    count += 1
                                    if max_docs is not None and count >= max_docs: break
                        return it()
                except Exception:
                    continue
        raise RuntimeError("No usable PubMed split/config found")
    gens["PubMed Abstracts"] = pubmed_gen

    # DailyDialog — TEST - streaming=False
    def dailydialog_gen():
        try:
            ds = load_dataset("daily_dialog", split="test", streaming=False, revision="refs/convert/parquet")
        except Exception:
            ds = load_dataset("daily_dialog", split="test", streaming=False)
        def it():
            count = 0
            for row in ds:
                dlg = row.get("dialog")
                if dlg and isinstance(dlg, list):
                    yield " ".join(dlg)
                    count += 1
                    if max_docs is not None and count >= max_docs: break
        return it()
    gens["DailyDialog"] = dailydialog_gen

    # CNN/DailyMail — TEST (articles) - streaming=False
    def cnndm_gen():
        ds = load_dataset("cnn_dailymail", "3.0.0", split="test", streaming=False)
        def it():
            count = 0
            for row in ds:
                art = row.get("article")
                if art:
                    yield art
                    count += 1
                    if max_docs is not None and count >= max_docs: break
        return it()
    gens["CNN/DailyMail (articles)"] = cnndm_gen

    # PG-19 — Parquet mirror TEST - streaming=True
    def pg19_gen():
        ds = load_dataset("emozilla/pg19", split="test", revision="refs/convert/parquet", streaming=True)
        field = "text" if "text" in getattr(ds, "features", {}) else "book_text"
        return (row.get(field) for row in ds if row.get(field)) if max_docs is None \
               else take_limit((row.get(field) for row in ds if row.get(field)), max_docs)
    gens["PG-19"] = pg19_gen

    return gens

print("Loading HF-hosted domain equivalents (streaming policy applied)…")
domain_generators = make_domain_generators(MAX_DOCS_PER_DOMAIN)

# Filter only domains whose generators can be instantiated
available_domains = []
for d, fn in list(domain_generators.items()):
    try:
        _ = fn()  # ensure callable doesn’t raise
        available_domains.append(d)
    except Exception as e:
        print(f"⚠️ Skipping domain '{d}' due to init error: {e}")

domains = [d for d in DOMAINS if d in available_domains]
print("Domains:", domains)

# Helper: estimate doc count for progress bars (to show ETA)
def estimate_total_docs(domain, max_docs):
    if max_docs is not None:
        return max_docs
    try:
        if domain == "WikiText-103 (doc-level)":
            return len(load_dataset("EleutherAI/wikitext_document_level", "wikitext-103-v1",
                                    split="test", streaming=False))
        if domain == "DailyDialog":
            return len(load_dataset("daily_dialog", split="test", streaming=False))
        if domain == "CNN/DailyMail (articles)":
            return len(load_dataset("cnn_dailymail", "3.0.0", split="test", streaming=False))
        if domain == "ArXiv":
            try:
                return len(load_dataset("scientific_papers", "arxiv", split="test", streaming=False, trust_remote_code=True))
            except Exception:
                return len(load_dataset("scientific_papers", "arxiv", split="validation", streaming=False, trust_remote_code=True))
        if domain == "PubMed Abstracts":
            for cfg in ("document", "section"):
                for split in ("test", "validation"):
                    try:
                        ds = load_dataset("ccdv/pubmed-summarization", cfg, split=split, streaming=False)
                        field = "abstract" if "abstract" in ds.features else ("target" if "target" in ds.features else None)
                        if field:
                            return len(ds)
                    except Exception:
                        continue
        if domain == "PG-19":
            return 100  # PG-19 test has 100 books
    except Exception:
        return None
    return None

# ---------- Helper to run one model over all domains ----------
def run_one_model(model_id_or_path, tag):
    print(f"\nLoading model: {tag} - {model_id_or_path}")
    model = load_model(model_id_or_path)
    results = {}
    per_domain_docs = {}

    for domain in domains:
        short = DOM_LABELS.get(domain, domain.lower().replace(" ", "_"))
        total_docs = estimate_total_docs(domain, MAX_DOCS_PER_DOMAIN)
        pbar = tqdm(total=total_docs, unit="doc", desc=f"{tag} | {short}", leave=True)

        try:
            doc_iter = domain_generators[domain]()  # fresh generator per model

            doc_rows = []
            total_nll, total_toks = 0.0, 0
            processed = 0
            for i, text in enumerate(doc_iter, start=1):
                if not text:
                    continue
                ppl, toks = ppl_for_one_doc(model, tok, text, init_batch_size=INIT_BATCH_SIZE)
                if toks > 0 and math.isfinite(ppl):
                    total_nll += math.log(ppl) * toks
                    total_toks += toks

                snippet = str(text).replace("\n", " ")[:200]
                doc_id = f"{short}_{i:05d}"
                doc_rows.append({
                    "domain": domain,
                    "doc_id": doc_id,
                    "tokens_scored": int(toks),
                    "ppl": float(ppl) if math.isfinite(ppl) else float("nan"),
                    "snippet": snippet
                })
                processed += 1
                pbar.update(1)
                if MAX_DOCS_PER_DOMAIN is not None and processed >= MAX_DOCS_PER_DOMAIN:
                    break

            corpus_ppl = math.exp(total_nll / max(1, total_toks)) if total_toks else float("nan")
            results[domain] = (corpus_ppl, total_toks)
            per_domain_docs[domain] = pd.DataFrame(doc_rows)

            print(f"{tag:>5} - {domain}: PPL {corpus_ppl:.4f} | tokens {total_toks}")

        except Exception as e:
            print(f"⚠️ {tag} | Domain '{domain}' failed: {e}")
            results[domain] = (float("nan"), 0)
            per_domain_docs[domain] = pd.DataFrame(columns=["domain","doc_id","tokens_scored","ppl","snippet"])
        finally:
            pbar.close()

    # cleanup VRAM
    del model
    gc.collect()
    if DEVICE == "cuda":
        torch.cuda.empty_cache()

    return results, per_domain_docs

# ---------- Run FINET first, then BASE ----------
ft_results,   ft_docs   = run_one_model(FT_MODEL,   "FINET")
base_results, base_docs = run_one_model(BASE_MODEL, "BASE")

# ---------- Merge & save ----------
rows = []
per_domain_tables = {}
token_mismatches = []

for domain in domains:
    base_ppl, base_tok = base_results.get(domain, (float("nan"), 0))
    ft_ppl,   ft_tok   = ft_results.get(domain, (float("nan"), 0))

    # canonical token count (they should be the same)
    num_tokens = ft_tok if ft_tok else base_tok

    # metrics
    delta = ft_ppl - base_ppl if (math.isfinite(ft_ppl) and math.isfinite(base_ppl)) else float("nan")
    pct   = (delta / base_ppl * 100.0) if (math.isfinite(delta) and base_ppl and math.isfinite(base_ppl)) else float("nan")

    # new SUMMARY row schema
    rows.append({
        "domain": domain,
        "base_ppl": base_ppl,
        "ft_ppl": ft_ppl,
        "num_tokens": num_tokens,
        "delta": delta,
        "pct_change_%": pct
    })

    # small per-domain debug table
    per_domain_tables[domain] = pd.DataFrame([
        {"model": "BASE", "ppl": base_ppl, "tokens_scored": base_tok},
        {"model": "FINET", "ppl": ft_ppl,  "tokens_scored": ft_tok},
        {"model": "DELTA (FT-BASE)", "ppl": delta, "tokens_scored": None},
        {"model": "% change (FT vs BASE)", "ppl": pct, "tokens_scored": None},
    ])

    if base_tok != ft_tok:
        token_mismatches.append((domain, base_tok, ft_tok))

# enforce column order explicitly
summary_df = pd.DataFrame(rows, columns=[
    "domain", "base_ppl", "ft_ppl", "num_tokens", "delta", "pct_change_%",
]).sort_values("domain")

print("\n=== SUMMARY (lower is better) ===")
for _, r in summary_df.iterrows():
    b = r['base_ppl']; f = r['ft_ppl']; d = r['delta']; p = r['pct_change_%']
    b_s = f"{b:.4f}" if math.isfinite(b) else "NaN"
    f_s = f"{f:.4f}" if math.isfinite(f) else "NaN"
    d_s = f"{d:+.4f}" if math.isfinite(d) else "NaN"
    p_s = f"{p:+.2f}%" if math.isfinite(p) else "NaN"
    print(f"{r['domain']:<26}  BASE {b_s} | FT {f_s} | Δ {d_s} ({p_s}) | tokens {int(r['num_tokens'])}")

if token_mismatches:
    print("\nToken-count mismatch detected (BASE vs FINET). Expected equal:")
    for dom, bt, ft in token_mismatches:
        print(f"   - {dom}: base_tokens={bt}, ft_tokens={ft}")

# ---------- Write Excel (sheet-name sanitizer) ----------
INVALID_SHEET_CHARS = re.compile(r"[\[\]\:\*\?\/\\]")

def sanitize_sheet_name(name: str) -> str:
    s = INVALID_SHEET_CHARS.sub("-", name).strip()
    if s.endswith("'"): s = s[:-1]
    return s or "Sheet"

used_sheet_names = set()
def make_sheet_name(base: str, suffix: str = "") -> str:
    base_clean = sanitize_sheet_name(base)
    max_base_len = 31 - len(suffix)
    if max_base_len < 1:
        suffix = suffix[:5]
        max_base_len = max(1, 31 - len(suffix))
    name = (base_clean[:max_base_len] + suffix)[:31]
    if name in used_sheet_names:
        i = 1
        while True:
            extra = f"_{i}"
            cut = 31 - len(suffix) - len(extra)
            candidate = (base_clean[:max(1, cut)] + suffix + extra)[:31]
            if candidate not in used_sheet_names:
                name = candidate
                break
            i += 1
    used_sheet_names.add(name)
    return name

with pd.ExcelWriter(SAVE_XLSX, engine="xlsxwriter") as writer:
    summary_df.to_excel(writer, index=False, sheet_name=make_sheet_name("SUMMARY"))
    for domain in domains:
        per_domain_tables[domain].to_excel(writer, index=False, sheet_name=make_sheet_name(domain))
        ft_docs.get(domain, pd.DataFrame()).to_excel(writer, index=False, sheet_name=make_sheet_name(domain, "_FT"))
        base_docs.get(domain, pd.DataFrame()).to_excel(writer, index=False, sheet_name=make_sheet_name(domain, "_BASE"))

print(f"\nExcel saved to: {SAVE_XLSX}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Loading HF-hosted domain equivalents (streaming policy applied)…


Downloading readme: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/302M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/635k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/706k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/29444 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/60 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/62 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading readme: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/3.62G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/880M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/203037 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6436 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6440 [00:00<?, ? examples/s]

Downloading readme: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/208M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/207M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/205M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/210M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/208M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/58.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/58.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/119924 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6633 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6658 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/3.61M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/334k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/331k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Downloading readme: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/259M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Domains: ['WikiText-103 (doc-level)', 'ArXiv', 'PubMed Abstracts', 'DailyDialog', 'CNN/DailyMail (articles)', 'PG-19']

Loading model: FINET → /content/drive/My Drive/ANLP_project/full_llama3_8b_system_prompt_lora_SFT_SWOW_tgt_qkvo_tr7194c_val899c_r16_a32_do0p1_lr0.0001_bs16_ga4/merged_model




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

FINET | wt103:   0%|          | 0/62 [00:00<?, ?doc/s]

FINET → WikiText-103 (doc-level): PPL 7.7670 | tokens 256497


FINET | arxiv:   0%|          | 0/6440 [00:00<?, ?doc/s]

FINET → ArXiv: PPL 19.2511 | tokens 1291152


FINET | pubmed:   0%|          | 0/6658 [00:00<?, ?doc/s]

FINET → PubMed Abstracts: PPL 10.8918 | tokens 1772701


Using the latest cached version of the dataset since daily_dialog couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /root/.cache/huggingface/datasets/daily_dialog/default/0.0.0/469b7118f52336674f1d78693797e9fcc7e47d2a (last modified on Thu Aug 28 18:46:16 2025).


FINET | dailydialog:   0%|          | 0/1000 [00:00<?, ?doc/s]

FINET → DailyDialog: PPL 14.8536 | tokens 120200


FINET | cnndm:   0%|          | 0/11490 [00:00<?, ?doc/s]

FINET → CNN/DailyMail (articles): PPL 10.2025 | tokens 8705750


FINET | pg19_full:   0%|          | 0/100 [00:00<?, ?doc/s]

FINET → PG-19: PPL 11.7694 | tokens 9815040

Loading model: BASE → meta-llama/Meta-Llama-3-8B-Instruct


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

BASE | wt103:   0%|          | 0/62 [00:00<?, ?doc/s]

 BASE → WikiText-103 (doc-level): PPL 7.9970 | tokens 256497


BASE | arxiv:   0%|          | 0/6440 [00:00<?, ?doc/s]

 BASE → ArXiv: PPL 20.0485 | tokens 1291152


BASE | pubmed:   0%|          | 0/6658 [00:00<?, ?doc/s]

 BASE → PubMed Abstracts: PPL 11.1377 | tokens 1772701


Using the latest cached version of the dataset since daily_dialog couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /root/.cache/huggingface/datasets/daily_dialog/default/0.0.0/469b7118f52336674f1d78693797e9fcc7e47d2a (last modified on Thu Aug 28 18:46:16 2025).


BASE | dailydialog:   0%|          | 0/1000 [00:00<?, ?doc/s]

 BASE → DailyDialog: PPL 15.5382 | tokens 120200


BASE | cnndm:   0%|          | 0/11490 [00:00<?, ?doc/s]

 BASE → CNN/DailyMail (articles): PPL 10.4911 | tokens 8705750


BASE | pg19_full:   0%|          | 0/100 [00:00<?, ?doc/s]

 BASE → PG-19: PPL 11.8249 | tokens 9815040

=== SUMMARY (lower is better) ===
ArXiv                       BASE 20.0485 | FT 19.2511 | Δ -0.7975 (-3.98%) | tokens 1291152
CNN/DailyMail (articles)    BASE 10.4911 | FT 10.2025 | Δ -0.2886 (-2.75%) | tokens 8705750
DailyDialog                 BASE 15.5382 | FT 14.8536 | Δ -0.6846 (-4.41%) | tokens 120200
PG-19                       BASE 11.8249 | FT 11.7694 | Δ -0.0555 (-0.47%) | tokens 9815040
PubMed Abstracts            BASE 11.1377 | FT 10.8918 | Δ -0.2459 (-2.21%) | tokens 1772701
WikiText-103 (doc-level)    BASE 7.9970 | FT 7.7670 | Δ -0.2300 (-2.88%) | tokens 256497

✅ Excel saved to: /content/drive/My Drive/ANLP_project/data/models_perplexity/bench_ppl_20250828_183551.xlsx


In [3]:
# Disconnect the runtime
from google.colab import runtime
runtime.unassign()