In [None]:
import os, sys, json, math, multiprocessing, subprocess, types
from typing import Optional

def _ensure(pkgs):
    to_install = []
    for p in pkgs:
        try:
            __import__(p.split("==")[0].split(">=")[0].replace("-", "_"))
        except Exception:
            to_install.append(p)
    if to_install:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q"] + to_install)

# Make sure the third-party dependencies imported below are available before
# importing them; each entry is a pip requirement string handed to _ensure.
_ensure([
    "torch",
    "transformers>=4.44.0",
    "accelerate>=0.33.0",
    "pandas",
    "scikit-learn>=1.3.0",
])

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    set_seed,
)
from transformers.trainer_utils import EvalPrediction

# Process-wide defaults; setdefault leaves any value already present in the
# environment untouched.
# NOTE(review): torch is imported above this point, so the OMP/MKL thread
# variables may be set too late to influence it — confirm.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
os.environ.setdefault("OMP_NUM_THREADS", str(min(8, multiprocessing.cpu_count())))
os.environ.setdefault("MKL_NUM_THREADS", str(min(8, multiprocessing.cpu_count())))
# Limit torch's CPU thread count to half the reported cores (at least one).
torch.set_num_threads(max(1, multiprocessing.cpu_count() // 2))

# Checkpoint name handed to the from_pretrained calls in build_model.
MODEL_NAME = "roberta-base"
# Default token length NewsDS uses for truncation and padding.
MAX_LEN = 384
# Seed shared by the data shuffle, set_seed and TrainingArguments.
SEED = 42

def _device_setup():
    has_cuda = torch.cuda.is_available()
    bf16_ok = has_cuda and torch.cuda.is_bf16_supported()
    fp16_ok = has_cuda and not bf16_ok
    return has_cuda, bf16_ok, fp16_ok

def load_data(true_csv: str, false_csv: str):
    """Load the two CSV files, label them, and return one shuffled DataFrame.

    Args:
        true_csv: path to the CSV whose rows receive ``label = 1``.
        false_csv: path to the CSV whose rows receive ``label = 0``.

    Returns:
        A DataFrame holding the rows of both files (true rows first, then
        shuffled with ``SEED``) with a fresh 0..n-1 index and a ``label`` column.

    Raises:
        ValueError: if either file lacks one of the columns ``title``,
            ``text``, ``subject`` or ``date``.
    """
    df_t = pd.read_csv(true_csv)
    df_f = pd.read_csv(false_csv)

    # Validate each file on its own. After concatenation a column that exists
    # in only one file would still appear (filled with NaN) and slip through.
    need = {"title", "text", "subject", "date"}
    for path, frame in ((true_csv, df_t), (false_csv, df_f)):
        miss = need - set(frame.columns)
        if miss:
            raise ValueError(f"Missing columns: {miss} (in {path})")

    df_t["label"] = 1
    df_f["label"] = 0
    df = pd.concat([df_t, df_f], ignore_index=True)
    # Shuffle deterministically so the later positional train/val split mixes both classes.
    df = df.sample(frac=1.0, random_state=SEED).reset_index(drop=True)
    return df

def make_meta_prefix(subject: str, date_str: str):
    """Build the ``"[SUBJ_<subject>] [YEAR_<year>] "`` tag prepended to each article.

    Args:
        subject: subject/category value; spaces become underscores and the
            result is cut to 30 characters. ``None``, NaN, empty or
            whitespace-only values become ``unknown``.
        date_str: date value parsed with ``pd.to_datetime``; values that
            cannot be parsed give ``UNK``.

    Returns:
        The prefix string, including its trailing space.
    """
    # Rows with a missing subject arrive from pandas as float NaN, which is
    # truthy and has no .strip(); normalise every non-string input first.
    if isinstance(subject, str):
        subj_text = subject
    elif subject is None or pd.isna(subject):
        subj_text = ""
    else:
        subj_text = str(subject)
    subj = (subj_text.strip() or "unknown").replace(" ", "_")[:30]

    yr = "UNK"
    try:
        yr_val = pd.to_datetime(date_str, errors="coerce").year
        yr = "UNK" if pd.isna(yr_val) else str(int(yr_val))
    except Exception:
        # Best effort: any unparseable or odd date value keeps the UNK tag.
        pass
    return f"[SUBJ_{subj}] [YEAR_{yr}] "

class NewsDS(Dataset):
    """Dataset that tokenizes one DataFrame row per item.

    Each item is the metadata prefix from ``make_meta_prefix`` followed by the
    row's title and text, truncated and padded to ``max_len`` tokens.

    NOTE(review): the subject tag is part of the model input; if ``subject``
    correlates strongly with ``label`` in the data this leaks the target —
    verify against the dataset.
    """

    def __init__(self, df: pd.DataFrame, tok, max_len=MAX_LEN):
        self.df = df
        self.tok = tok
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        row = self.df.iloc[i]
        prefix = make_meta_prefix(row["subject"], row["date"])
        article = f"{row['title']} {row['text']}"
        encoded = self.tok(
            prefix + article,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        # The tokenizer output has a leading dimension of one; index it away.
        item = {key: encoded[key][0] for key in ("input_ids", "attention_mask")}
        item["labels"] = torch.tensor(int(row["label"]), dtype=torch.long)
        return item

class ResidualAdapter(nn.Module):
    """Bottleneck adapter: ``x + up(relu(down(x)))``.

    The up-projection starts at zero, so a freshly built adapter returns its
    input unchanged.
    """

    def __init__(self, hidden_size, bottleneck=64):
        super().__init__()
        self.down_layer = nn.Linear(hidden_size, bottleneck)
        self.up_layer = nn.Linear(bottleneck, hidden_size)
        # Zero both up-projection tensors so the initial residual is zero.
        for tensor in (self.up_layer.weight, self.up_layer.bias):
            nn.init.constant_(tensor, 0)

    def forward(self, x):
        hidden = F.relu(self.down_layer(x))
        return x + self.up_layer(hidden)

class SparseAdapter(nn.Module):
    """Bottleneck adapter whose residual is applied only to the top-k gated tokens.

    A linear gate scores every token; the ``k`` highest-scoring tokens of each
    sequence receive the adapter residual, the others pass through unchanged.
    The mask is hard in the forward pass and uses a Gumbel-softmax surrogate
    for the gradient (straight-through).

    Args:
        hidden_size: size of the last dimension of the input.
        bottleneck: width of the down-projection.
        topk: number of tokens to adapt, or a fraction of the sequence length
            when ``use_fraction`` is True.
        use_fraction: interpret ``topk`` as a fraction of the sequence length.
    """

    def __init__(self, hidden_size, bottleneck=64, topk=128, use_fraction=False):
        super().__init__()
        self.down_layer = nn.Linear(hidden_size, bottleneck)
        self.up_layer = nn.Linear(bottleneck, hidden_size)
        # Zero up-projection: the adapter starts as an identity mapping.
        nn.init.constant_(self.up_layer.weight, 0)
        nn.init.constant_(self.up_layer.bias, 0)
        self.gate_network = nn.Linear(hidden_size, 1)
        self.topk = topk
        self.use_fraction = use_fraction

    def sample_gumbel(self, shape, device='cpu', eps=1e-20):
        """Draw Gumbel(0, 1) noise of the given shape; ``eps`` guards both logs."""
        U = torch.rand(shape, device=device)
        return -torch.log(-torch.log(U + eps) + eps)

    def relaxed_topk_mask(self, logits: torch.Tensor, k: int, tau: float = 1.0):
        """Return a straight-through top-k mask over the last dimension.

        Args:
            logits: gate scores; a 1-D tensor is treated as a single row.
            k: number of entries to select per row (capped at the row length).
            tau: temperature of the softmax used for the gradient path.

        Returns:
            A tensor shaped like ``logits`` (a leading dimension of size one
            is squeezed away) whose forward value is 1 at each row's top-k
            positions and 0 elsewhere, with gradients flowing through the
            noisy softmax.
        """
        if logits.dim() == 1:
            logits = logits.unsqueeze(0)
        k = min(int(k), logits.size(-1))
        g = self.sample_gumbel(logits.shape, device=logits.device)
        probs = F.softmax((logits + g) / tau, dim=-1)
        topk_idx = torch.topk(logits, k=k, dim=-1).indices
        # scatter_ marks the top-k positions of EVERY row; indexing only row 0
        # left all other rows of a batched input with an all-zero mask.
        hard_mask = torch.zeros_like(logits).scatter_(-1, topk_idx, 1.0)
        # Straight-through: forward value is hard_mask, gradient is that of probs.
        mask = (hard_mask - probs).detach() + probs
        return mask.squeeze(0)

    def forward(self, x):
        """Add the adapter residual to the top-k gated tokens of each sequence."""
        B, T, H = x.shape
        gate_logits = self.gate_network(x).squeeze(-1)  # (B, T)

        if self.use_fraction:
            k = max(1, min(T, int(self.topk * T)))
        else:
            k = max(1, min(T, int(self.topk)))

        # One batched call replaces the per-sample loop; reshape restores the
        # batch dimension that relaxed_topk_mask squeezes away when B == 1.
        mask = self.relaxed_topk_mask(gate_logits, k=k).reshape(B, T, 1)

        delta = self.up_layer(F.relu(self.down_layer(x)))
        return x + delta * mask

    def l1_penalty(self):
        """Sum of the 1-norms (``torch.norm(w, 1)``) of both projection weights."""
        return torch.norm(self.down_layer.weight, 1) + torch.norm(self.up_layer.weight, 1)

def _wrap_roberta_layer_with_adapter(layer, adapter: nn.Module):
    original_forward = layer.forward
    def forward_with_adapter(self, hidden_states, *args, **kwargs):
        outputs = original_forward(hidden_states, *args, **kwargs)
        if isinstance(outputs, torch.Tensor):
            return adapter(outputs)
        elif isinstance(outputs, tuple):
            adapted = adapter(outputs[0])
            return (adapted,) + outputs[1:]
        else:
            return outputs
    layer.forward = types.MethodType(forward_with_adapter, layer)

def inject_adapters_into_roberta(model: nn.Module,
                                 adapter_type: str = "residual",
                                 n_adapter_layers: int = 4,
                                 adapter_bottleneck: int = 64,
                                 sparse_topk: int = 128,
                                 sparse_use_fraction: bool = False):
    """
    Inserts adapters into the LAST n_adapter_layers of the RoBERTa encoder.
    Leaves classifier head trainable; freezes base backbone params.

    Args:
        model: model exposing ``base_model.encoder.layer``, ``config.hidden_size``
            and optionally ``classifier``.
        adapter_type: ``"residual"`` or ``"sparse"``.
        n_adapter_layers: how many of the final encoder layers get an adapter.
        adapter_bottleneck: bottleneck width of every adapter.
        sparse_topk: ``topk`` forwarded to ``SparseAdapter``.
        sparse_use_fraction: ``use_fraction`` forwarded to ``SparseAdapter``.

    Returns:
        The same ``model`` object, modified in place; the adapters are also
        registered on it as ``model.adapter_modules``.

    Raises:
        ValueError: if ``adapter_type`` is not one of the supported names.
    """
    # Fail fast on a typo instead of silently training the residual variant.
    if adapter_type not in ("residual", "sparse"):
        raise ValueError(
            f"Unknown adapter_type {adapter_type!r}; expected 'residual' or 'sparse'"
        )

    for p in model.base_model.parameters():
        p.requires_grad = False

    if hasattr(model, "classifier"):
        for p in model.classifier.parameters():
            p.requires_grad = True

    enc_layers = model.base_model.encoder.layer
    num_layers = len(enc_layers)
    # max(0, ...) keeps the range valid when more adapters are requested than layers exist.
    target_idxs = list(range(max(0, num_layers - n_adapter_layers), num_layers))

    hidden_size = model.config.hidden_size
    adapters = nn.ModuleList()
    for i in target_idxs:
        if adapter_type == "sparse":
            adp = SparseAdapter(hidden_size, adapter_bottleneck, topk=sparse_topk, use_fraction=sparse_use_fraction)
        else:
            adp = ResidualAdapter(hidden_size, adapter_bottleneck)
        adapters.append(adp)
        _wrap_roberta_layer_with_adapter(enc_layers[i], adp)

    # Registering the ModuleList on the model makes the adapter weights part of
    # model.parameters() and of its state dict.
    model.adapter_modules = adapters
    for adapter_param in adapters.parameters():
        adapter_param.requires_grad = True

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"Adapters injected into layers {target_idxs}. Trainable params: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)")
    return model

def build_model(total_steps: int,
                adapter_type: str = "residual",
                n_adapter_layers: int = 4,
                adapter_bottleneck: int = 64,
                sparse_topk: int = 128,
                sparse_use_fraction: bool = False):
    """Load the tokenizer and a 2-label classifier for ``MODEL_NAME`` and add adapters.

    Args:
        total_steps: accepted for interface compatibility; not used here.
        adapter_type, n_adapter_layers, adapter_bottleneck, sparse_topk,
        sparse_use_fraction: forwarded unchanged to
            ``inject_adapters_into_roberta``.

    Returns:
        ``(tokenizer, model)`` where ``model`` already has adapters injected.
    """
    adapter_options = dict(
        adapter_type=adapter_type,
        n_adapter_layers=n_adapter_layers,
        adapter_bottleneck=adapter_bottleneck,
        sparse_topk=sparse_topk,
        sparse_use_fraction=sparse_use_fraction,
    )
    tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
    backbone = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
    return tok, inject_adapters_into_roberta(backbone, **adapter_options)

def compute_metrics(p: EvalPrediction):
    """Compute accuracy and F1 from an evaluation prediction object.

    The predicted class is the argmax over axis 1 of ``p.predictions``; both
    scores compare it against ``p.label_ids``.

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"``.
    """
    predicted = p.predictions.argmax(axis=1)
    reference = p.label_ids
    scores = {}
    scores["accuracy"] = accuracy_score(reference, predicted)
    scores["f1"] = f1_score(reference, predicted)
    return scores

class WeightedTrainer(Trainer):
    """Trainer variant whose loss is cross-entropy with optional per-class weights."""

    def __init__(self, *args, class_weights=None, **kwargs):
        """Forward everything to ``Trainer`` and keep ``class_weights`` on the instance."""
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model without labels and compute the weighted cross-entropy.

        ``labels`` is popped from ``inputs`` (the dict is mutated). Extra
        keyword arguments are accepted and ignored.

        Returns:
            ``loss``, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        weight = None
        if self.class_weights is not None:
            # Move the weights to wherever the logits live before building the loss.
            weight = self.class_weights.to(logits.device)
        criterion = torch.nn.CrossEntropyLoss(weight=weight)
        loss = criterion(logits, targets)

        if return_outputs:
            return (loss, outputs)
        return loss

def make_training_args(args, workers):
    """Build ``TrainingArguments`` for the run, adapting to the installed transformers.

    Args:
        args: object with ``out_dir``, ``bsz``, ``grad_accum``, ``epochs`` and
            ``lr`` attributes.
        workers: upper bound for dataloader workers (at most 2 are used).

    Returns:
        ``TrainingArguments`` with per-epoch evaluation and saving and
        best-model selection on F1. If the installed version rejects that
        configuration, a reduced configuration without those options.
    """
    import inspect

    has_cuda, bf16_ok, fp16_ok = _device_setup()
    base_args = dict(
        output_dir=args.out_dir,
        per_device_train_batch_size=args.bsz,
        per_device_eval_batch_size=args.bsz,
        gradient_accumulation_steps=args.grad_accum,
        num_train_epochs=args.epochs,
        learning_rate=args.lr,
        logging_steps=50,
        seed=SEED,
        remove_unused_columns=False,
        dataloader_num_workers=min(2, workers),
        report_to=[],
    )

    # The evaluation keyword was renamed from `evaluation_strategy` to
    # `eval_strategy`. Hard-coding the old name raises TypeError on newer
    # releases and silently drops evaluation, checkpointing and best-model
    # loading, so pick whichever name this version accepts.
    supported = set(inspect.signature(TrainingArguments.__init__).parameters)
    eval_key = "eval_strategy" if "eval_strategy" in supported else "evaluation_strategy"
    full_args = dict(
        base_args,
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        fp16=fp16_ok,
        bf16=bf16_ok,
    )
    full_args[eval_key] = "epoch"

    try:
        return TrainingArguments(**full_args)
    except TypeError as exc:
        # Last resort for versions that reject one of the keywords above.
        print(f" Using legacy-safe TrainingArguments ({exc}).")
        cpu_key = "use_cpu" if "use_cpu" in supported else "no_cuda"
        fallback_args = dict(base_args, fp16=fp16_ok, bf16=bf16_ok)
        fallback_args[cpu_key] = not has_cuda
        return TrainingArguments(**fallback_args)

def run_training(
    true_csv: str,
    false_csv: str,
    out_dir: str = "./authcred_roberta_adapters",
    epochs: int = 4,
    bsz: int = 4,
    grad_accum: int = 2,
    lr: float = 2e-4,
    adapter_type: str = "residual",
    n_adapter_layers: int = 4,
    adapter_bottleneck: int = 64,
    sparse_topk: int = 128,
    sparse_use_fraction: bool = False,
):
    """Train the adapter-augmented classifier end to end and save it to ``out_dir``.

    Steps: seed everything, load and shuffle the two CSVs, split 90/10 by
    position into train/validation, build tokenizer and model, derive
    inverse-frequency class weights from the training split, train with
    ``WeightedTrainer``, print the evaluation metrics, then save the model and
    tokenizer.

    Args:
        true_csv: CSV of rows labelled 1.
        false_csv: CSV of rows labelled 0.
        out_dir: directory for checkpoints and the final model (created if missing).
        epochs: number of training epochs.
        bsz: per-device batch size for training and evaluation.
        grad_accum: gradient accumulation steps.
        lr: learning rate.
        adapter_type, n_adapter_layers, adapter_bottleneck, sparse_topk,
        sparse_use_fraction: adapter configuration forwarded to ``build_model``.

    Raises:
        ValueError: if the data is too small to leave any training rows.
    """
    import inspect

    args = types.SimpleNamespace(
        true_csv=true_csv,
        false_csv=false_csv,
        out_dir=out_dir,
        epochs=epochs,
        bsz=bsz,
        grad_accum=grad_accum,
        lr=lr,
        adapter_type=adapter_type,
        n_adapter_layers=n_adapter_layers,
        adapter_bottleneck=adapter_bottleneck,
        sparse_topk=sparse_topk,
        sparse_use_fraction=sparse_use_fraction,
    )

    set_seed(SEED)
    os.makedirs(args.out_dir, exist_ok=True)

    df = load_data(args.true_csv, args.false_csv)
    n_train = int(0.9 * len(df))
    if n_train == 0:
        # Fail with a clear message instead of an obscure error inside the trainer.
        raise ValueError(f"Not enough rows to train on: got {len(df)}")
    # load_data already shuffled the rows, so a positional split is a random split.
    train_df = df.iloc[:n_train]
    val_df = df.iloc[n_train:]

    steps_per_epoch = math.ceil(len(train_df) / max(1, (args.bsz * args.grad_accum)))
    total_steps = max(1, steps_per_epoch * args.epochs)
    print(f"Total steps (for logging): {total_steps}")

    tok, model = build_model(
        total_steps,
        adapter_type=args.adapter_type,
        n_adapter_layers=args.n_adapter_layers,
        adapter_bottleneck=args.adapter_bottleneck,
        sparse_topk=args.sparse_topk,
        sparse_use_fraction=args.sparse_use_fraction,
    )
    train_ds, val_ds = NewsDS(train_df, tok), NewsDS(val_df, tok)

    # Balanced class weights: total / (2 * class_count), guarded against an empty class.
    pos = int((train_df["label"] == 1).sum())
    neg = int((train_df["label"] == 0).sum())
    w_pos = 0.5 * (pos + neg) / max(1, pos)
    w_neg = 0.5 * (pos + neg) / max(1, neg)
    class_w = torch.tensor([w_neg, w_pos], dtype=torch.float32)

    workers = min(4, multiprocessing.cpu_count())
    targs = make_training_args(args, workers)

    trainer_kwargs = dict(
        model=model,
        args=targs,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics,
        class_weights=class_w,
    )
    # Newer transformers take the tokenizer as `processing_class` and warn on
    # (or drop) the older `tokenizer` keyword; use whichever is accepted.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tok_key = "processing_class" if "processing_class" in trainer_params else "tokenizer"
    trainer_kwargs[tok_key] = tok

    trainer = WeightedTrainer(**trainer_kwargs)
    trainer.label_names = ["labels"]

    trainer.train()
    print("Eval:", trainer.evaluate())
    trainer.save_model(args.out_dir)
    tok.save_pretrained(args.out_dir)

def _running_in_notebook():
    try:
        from IPython import get_ipython
        ip = get_ipython()
        return ip is not None and "IPKernelApp" in ip.config
    except Exception:
        return False

def main():
    """Command-line entry point: parse the options and hand them to ``run_training``.

    Every option name matches a ``run_training`` parameter, so the parsed
    namespace is forwarded as keyword arguments.
    """
    import argparse

    option_specs = [
        ("--true_csv", dict(type=str, required=True)),
        ("--false_csv", dict(type=str, required=True)),
        ("--out_dir", dict(type=str, default="./authcred_roberta_adapters")),
        ("--epochs", dict(type=int, default=4)),
        ("--bsz", dict(type=int, default=4)),
        ("--grad_accum", dict(type=int, default=2)),
        ("--lr", dict(type=float, default=2e-4)),
        ("--adapter_type", dict(type=str, default="residual", choices=["residual","sparse"])),
        ("--n_adapter_layers", dict(type=int, default=4)),
        ("--adapter_bottleneck", dict(type=int, default=64)),
        # Float so the same flag can carry a fraction with --sparse_use_fraction.
        ("--sparse_topk", dict(type=float, default=128)),
        ("--sparse_use_fraction", dict(action="store_true")),
    ]

    parser = argparse.ArgumentParser()
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)

    cli = parser.parse_args()
    run_training(**vars(cli))

# Run the command-line entry point only when executed as a script; inside a
# notebook kernel argparse is skipped and run_training(...) is called directly.
if __name__ == "__main__" and not _running_in_notebook():
    main()


In [None]:
# Launch a 3-epoch run with the default (residual) adapter settings.
# The CSV paths are relative to the working directory; checkpoints and the
# final model are written to out_dir.
run_training(
    true_csv="True.csv",
    false_csv="Fake.csv",
    out_dir="/content/authcred_roberta",
    epochs=3,
    bsz=8,
    grad_accum=2,
    lr=2e-4,
)


Total steps (for logging): 7578


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Adapters injected into layers [8, 9, 10, 11]. Trainable params: 988,674 / 125,043,714 (0.79%)
⚠️ Using legacy-safe TrainingArguments (older transformers detected).


  super().__init__(*args, **kwargs)


Step,Training Loss
50,0.442
100,0.0557
150,0.0001
200,0.0262
250,0.0037
300,0.0034
350,0.002
400,0.0
450,0.0
500,0.0


Eval: {'eval_loss': 5.996745585434837e-06, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 141.5184, 'eval_samples_per_second': 31.727, 'eval_steps_per_second': 3.971, 'epoch': 3.0}


In [None]:
# Bundle the training output directory into one archive.
# NOTE(review): -r also packs every checkpoint-* folder (optimizer.pt,
# scheduler.pt, ...), which makes the archive much larger than the final model.
!zip -r /content/authcred_roberta.zip /content/authcred_roberta

  adding: content/authcred_roberta/ (stored 0%)
  adding: content/authcred_roberta/checkpoint-1500/ (stored 0%)
  adding: content/authcred_roberta/checkpoint-1500/tokenizer.json (deflated 82%)
  adding: content/authcred_roberta/checkpoint-1500/merges.txt (deflated 53%)
  adding: content/authcred_roberta/checkpoint-1500/rng_state.pth (deflated 26%)
  adding: content/authcred_roberta/checkpoint-1500/config.json (deflated 49%)
  adding: content/authcred_roberta/checkpoint-1500/scheduler.pt (deflated 61%)
  adding: content/authcred_roberta/checkpoint-1500/model.safetensors (deflated 41%)
  adding: content/authcred_roberta/checkpoint-1500/optimizer.pt (deflated 7%)
  adding: content/authcred_roberta/checkpoint-1500/tokenizer_config.json (deflated 75%)
  adding: content/authcred_roberta/checkpoint-1500/special_tokens_map.json (deflated 52%)
  adding: content/authcred_roberta/checkpoint-1500/trainer_state.json (deflated 74%)
  adding: content/authcred_roberta/checkpoint-1500/training_args.bin