In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
import evaluate
import torch
import numpy as np
from sklearn.metrics import (
    f1_score, precision_score, recall_score, roc_auc_score, accuracy_score, fbeta_score
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    # Binary/multiclass metrics
    accuracy = accuracy_score(labels, preds)
    macro_f1 = f1_score(labels, preds, average="macro")
    weighted_f1 = f1_score(labels, preds, average="weighted")
    weighted_precision = precision_score(labels, preds, average="weighted")
    weighted_recall = recall_score(labels, preds, average="weighted")

    # Class-specific metrics (assuming labels: 0 = No Hate, 1 = Hate)
    f1_hate = f1_score(labels, preds, pos_label=1)
    f1_no_hate = f1_score(labels, preds, pos_label=0)
    precision_hate = precision_score(labels, preds, pos_label=1)
    recall_hate = recall_score(labels, preds, pos_label=1)

    # F0.5 and F2 scores
    f05 = fbeta_score(labels, preds, average="binary", beta=0.5)
    f2 = fbeta_score(labels, preds, average="binary", beta=2)

    # AUC (only works if binary classification & probs available)
    try:
        probs = logits[:, 1]  # take score for class 1
        auc = roc_auc_score(labels, probs)
    except Exception:
        auc = float("nan")

    return {
        "accuracy": accuracy,
        "macro_f1": macro_f1,
        "f1_hate": f1_hate,
        "f1_no_hate": f1_no_hate,
        "f0.5": f05,
        "f2": f2,
        "auc": auc,
        "precision_hate": precision_hate,
        "recall_hate": recall_hate,
        "weighted_f1": weighted_f1,
        "weighted_precision": weighted_precision,
        "weighted_recall": weighted_recall,
    }

def train_deberta(aug, data_source):

    dataset = load_dataset("csv", keep_in_memory=True, data_files={
        "train": f"{aug}/{data_source}/train.csv",
        "validation": f"{aug}/{data_source}/train.csv",
    })

    # Load tokenizer and model
    checkpoint = "microsoft/deberta-v3-base"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    # model.to("cuda")
    # Tokenize
    def preprocess(batch):
        return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=256)

    encoded = dataset.map(preprocess, batched=True)

    import numpy as np
    from sklearn.metrics import (
        f1_score, precision_score, recall_score,
        roc_auc_score, accuracy_score, fbeta_score
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"./deberta_{data_source}_{aug}_output",
        learning_rate=2e-5,
        per_device_train_batch_size=48,
        gradient_accumulation_steps=1,
        num_train_epochs=10,
        weight_decay=0.01,
        logging_strategy="steps",
        logging_steps=50,
        logging_first_step=True,
        log_level="info",
        disable_tqdm=False, 
        load_best_model_at_end=True,
        save_total_limit=3,
        fp16=True,   # if you have a GPU with mixed precision
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=encoded["train"],
        processing_class=tokenizer,
        compute_metrics=compute_metrics,
    )

    try:
        trainer.train(resume_from_checkpoint=True)
    except:
        trainer.train()


In [None]:
train_deberta("tda", "corpus")

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer
import os
import re

def get_latest_checkpoint(folder_path: str) -> str | None:
    """
    Returns the path to the checkpoint with the largest number
    in the format checkpoint-* inside the given folder.
    If no checkpoints are found, returns None.
    """
    pattern = re.compile(r"^checkpoint-(\d+)$")
    checkpoints = []

    for name in os.listdir(folder_path):
        match = pattern.match(name)
        if match:
            checkpoints.append((int(match.group(1)), name))

    if not checkpoints:
        return None

    # Find max by checkpoint number
    _, latest = max(checkpoints, key=lambda x: x[0])
    return os.path.join(folder_path, latest)

def perform_deberta_inference(aug, data_source):
    # Load model from a specific checkpoint
    
    checkpoint_path = get_latest_checkpoint(f"deberta_{data_source}_{aug}_output")
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
    tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
    dataset = load_dataset("csv", data_files={"train":f"{aug}/{data_source}/train.csv","test":f"{aug}/{data_source}/test.csv"})
    # Tokenize
    def preprocess(batch):
        return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=256)
    encoded = dataset.map(preprocess, batched=True)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"./deberta_{data_source}_{aug}_output",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        gradient_accumulation_steps=1,
        per_device_eval_batch_size=32,
        num_train_epochs=10,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=50,
        load_best_model_at_end=True,
        fp16=True,   # if you have a GPU with mixed precision
    )

    # Recreate trainer with this model
    trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=encoded["test"],
        processing_class=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Runs evaluation loop with compute_metrics
    test_metrics = trainer.evaluate(eval_dataset=dataset["test"])
    return test_metrics

In [3]:
import os
import re
import gc
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
import numpy as np
from sklearn.metrics import (
    f1_score, precision_score, recall_score, roc_auc_score, accuracy_score, fbeta_score
)
import evaluate
import torch

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    # Binary/multiclass metrics
    accuracy = accuracy_score(labels, preds)
    macro_f1 = f1_score(labels, preds, average="macro")
    weighted_f1 = f1_score(labels, preds, average="weighted")
    weighted_precision = precision_score(labels, preds, average="weighted")
    weighted_recall = recall_score(labels, preds, average="weighted")

    # Class-specific metrics (assuming labels: 0 = No Hate, 1 = Hate)
    f1_hate = f1_score(labels, preds, pos_label=1)
    f1_no_hate = f1_score(labels, preds, pos_label=0)
    precision_hate = precision_score(labels, preds, pos_label=1)
    recall_hate = recall_score(labels, preds, pos_label=1)

    # F0.5 and F2 scores
    f05 = fbeta_score(labels, preds, average="binary", beta=0.5)
    f2 = fbeta_score(labels, preds, average="binary", beta=2)

    # AUC (only works if binary classification & probs available)
    try:
        probs = logits[:, 1]  # take score for class 1
        auc = roc_auc_score(labels, probs)
    except Exception:
        auc = float("nan")

    return {
        "accuracy": accuracy,
        "macro_f1": macro_f1,
        "f1_hate": f1_hate,
        "f1_no_hate": f1_no_hate,
        "f0.5": f05,
        "f2": f2,
        "auc": auc,
        "precision_hate": precision_hate,
        "recall_hate": recall_hate,
        "weighted_f1": weighted_f1,
        "weighted_precision": weighted_precision,
        "weighted_recall": weighted_recall,
    }

def get_latest_checkpoint(folder_path: str) -> str | None:
    """
    Return path to the highest-numbered 'checkpoint-*' dir in folder_path, or None if none exist.
    """
    if not os.path.isdir(folder_path):
        return None
    pattern = re.compile(r"^checkpoint-(\d+)$")
    nums_and_names = []
    for name in os.listdir(folder_path):
        m = pattern.match(name)
        if m:
            nums_and_names.append((int(m.group(1)), name))
    if not nums_and_names:
        return None
    _, latest = max(nums_and_names, key=lambda x: x[0])
    return os.path.join(folder_path, latest)

def perform_deberta_inference(aug: str, data_source: str, compute_metrics=None):
    # --- Load model (prefer the latest checkpoint if present) ---
    output_dir = f"deberta_{data_source}_{aug}_output"
    ckpt_path = get_latest_checkpoint(output_dir) or output_dir  # fall back to base dir
    tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base", use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(ckpt_path)
    if aug == "smote":
        aug = "original"
    elif aug == "pos_smote":
        aug = "pos_tagging"
    
    # --- Load data ---
    dataset = load_dataset(
        "csv",
        data_files={
            "train": f"original/{data_source}/train.csv",
            "test":  f"original/{data_source}/test.csv",
        }
    )

    # --- Tokenize (dynamic padding; keep labels) ---
    def preprocess(batch):
        return tokenizer(batch["text"], truncation=True)

    encoded = dataset.map(preprocess, batched=True)

    # If your CSV has 'label' (singular), HF Trainer handles it.
    # If it's named differently, rename here:
    # encoded = encoded.rename_column("your_label_col", "label")

    data_collator = DataCollatorWithPadding(tokenizer)

    # --- Args (note: it's evaluation_strategy) ---
    args = TrainingArguments(
        output_dir=output_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        per_device_eval_batch_size=4,
        fp16=True,
        logging_dir="./logs",
        logging_steps=50,
        # Below only matter if you train again; harmless for evaluate():
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        gradient_accumulation_steps=1,
        num_train_epochs=10,
        weight_decay=0.01,
        load_best_model_at_end=True,
    )

    # --- Trainer (wire the *tokenized* eval set, tokenizer, and collator) ---
    trainer = Trainer(
        model=model,
        args=args,
        eval_dataset=encoded["test"],     # <- tokenized split
        processing_class=tokenizer,              # <- so Trainer knows how to pad
        data_collator=data_collator,      # <- dynamic padding
        compute_metrics=compute_metrics,  # optional
    )

    # Use the default eval_dataset already set above
    test_metrics = trainer.evaluate()
    # Explicit cleanup
    del trainer, model, tokenizer
    torch.cuda.empty_cache()
    gc.collect()

    return test_metrics


In [None]:
import warnings
import json
warnings.filterwarnings('ignore') 

import os
os.environ["WANDB_MODE"] = "disabled"
result = perform_deberta_inference("tda", "corpus", compute_metrics)

In [None]:
result

In [3]:
import warnings
import json
warnings.filterwarnings('ignore') 

import os
os.environ["WANDB_MODE"] = "disabled"

for aug in ["original", "pos_tagging", "smote", "pos_smote"]:
    for dataset in ["corpus", "stormfront", "reddit_gab", "unified"]:
        result = perform_deberta_inference(aug, dataset, compute_metrics)
        with open("unweighted_results.jsonl", "a", encoding="utf-8") as f:
            f.write(json.dumps({f"{dataset}_{aug}": result}, ensure_ascii=False) + "\n")

NameError: name 'perform_deberta_inference' is not defined

In [None]:
import json

# Write dict to JSON file
with open("unweighted_results.json", "a", encoding="utf-8") as f:
    json.dump(results, f, indent=4, ensure_ascii=False)

In [2]:
import os
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer,
)
from sklearn.metrics import classification_report

# ---- same compute_metrics you already defined ----
# def compute_metrics(eval_pred): ...

def evaluate_deberta(aug: str, data_source: str, ckpt_dir: str | None = None):
    """
    Evaluate on {aug}/{data_source}/test.csv using your saved model.
    - ckpt_dir: path to the folder you saved (defaults to train output dir).
    - Writes metrics to stdout and a CSV with per-row predictions.
    """
    test_csv = f"original/{data_source}/test.csv"
    out_dir  = ckpt_dir or f"deberta_{data_source}_{aug}_output"
    assert os.path.exists(out_dir), f"Checkpoint folder not found: {out_dir}"
    assert os.path.exists(test_csv), f"Test CSV not found: {test_csv}"

    # ---- Load tokenizer & model from your saved folder ----
    tokenizer = AutoTokenizer.from_pretrained(out_dir, use_fast=True)
    tokenizer.model_max_length = 256
    model = AutoModelForSequenceClassification.from_pretrained(
        out_dir,
        torch_dtype=torch.float16 if torch.cuda.is_available() else None,
    )

    # ---- Build test dataset (keep original text to export later) ----
    df_test = pd.read_csv(test_csv, dtype={"text": "string", "label": "int64"})
    df_test = df_test.dropna(subset=["text", "label"]).reset_index(drop=True)

    def preprocess(batch):
        return tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=256,
        )

    test_ds_raw = Dataset.from_pandas(df_test, preserve_index=False)
    test_enc    = test_ds_raw.map(preprocess, batched=True, remove_columns=["text"])

    # ---- Minimal eval args (no extra logging/saving) ----
    eval_args = TrainingArguments(
        output_dir=os.path.join(out_dir, "eval_tmp"),
        per_device_eval_batch_size=256,
        dataloader_num_workers=0,
        fp16=False, bf16=True,#torch.cuda.is_available(),
        report_to=[],
        logging_strategy="no",
        save_strategy="no",
    )

    trainer = Trainer(
        model=model,
        args=eval_args,
        processing_class=tokenizer,
        compute_metrics=compute_metrics,   # <- reuses your metrics
    )

    # Option A: full prediction object (preds + metrics)
    pred_out = trainer.predict(test_enc)
    logits = pred_out.predictions
    y_true = np.array(df_test["label"].tolist())
    y_pred = np.argmax(logits, axis=-1)

    print("=== Metrics from compute_metrics ===")
    for k, v in pred_out.metrics.items():
        print(f"{k}: {v}")

    # (Optional) nice sklearn report
    print("\n=== Classification report ===")
    print(classification_report(y_true, y_pred, digits=4))

    # ---- Save per-row predictions (with text) ----
    # If you want probability of the positive class for inspection/AUC plots:
    # use softmax for well-calibrated probabilities
    probs_pos = (np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True))[:, 1]

    out_csv = os.path.join(out_dir, "test_predictions.csv")
    pd.DataFrame({
        "text": df_test["text"],
        "label": y_true,
        "pred": y_pred,
        "prob_pos": probs_pos,
    }).to_csv(out_csv, index=False, encoding="utf-8")
    print(f"\nðŸ“„ Saved per-sample predictions to: {out_csv}")

    return pred_out.metrics  # dict


In [4]:
for dataset in ["corpus", "stormfront", "reddit_gab", "unified"]:
    evaluate_deberta("tda", dataset)


`torch_dtype` is deprecated! Use `dtype` instead!


Map:   0%|          | 0/3419 [00:00<?, ? examples/s]



=== Metrics from compute_metrics ===
test_loss: 0.9114176034927368
test_accuracy: 0.6905527932143902
test_macro_f1: 0.4537909811311638
test_f1_hate: 0.09417808219178082
test_f1_no_hate: 0.8134038800705468
test_f0.5: 0.12415349887133183
test_f2: 0.07586206896551724
test_auc: 0.5017793744716822
test_precision_hate: 0.15759312320916904
test_recall_hate: 0.06715506715506715
test_weighted_f1: 0.6411178524417929
test_weighted_precision: 0.6089596189790631
test_weighted_recall: 0.6905527932143902
test_runtime: 5.4648
test_samples_per_second: 625.642
test_steps_per_second: 0.915

=== Classification report ===
              precision    recall  f1-score   support

           0     0.7511    0.8869    0.8134      2600
           1     0.1576    0.0672    0.0942       819

    accuracy                         0.6906      3419
   macro avg     0.4544    0.4770    0.4538      3419
weighted avg     0.6090    0.6906    0.6411      3419


ðŸ“„ Saved per-sample predictions to: deberta_corpus_tda_output

Map:   0%|          | 0/1095 [00:00<?, ? examples/s]



=== Metrics from compute_metrics ===
test_loss: 0.4506986439228058
test_accuracy: 0.9287671232876712
test_macro_f1: 0.7996876055386694
test_f1_hate: 0.6388888888888888
test_f1_no_hate: 0.9604863221884499
test_f0.5: 0.6845238095238095
test_f2: 0.5989583333333334
test_auc: 0.9198461538461538
test_precision_hate: 0.71875
test_recall_hate: 0.575
test_weighted_f1: 0.9252427678542514
test_weighted_precision: 0.9237216668723518
test_weighted_recall: 0.9287671232876712
test_runtime: 0.8495
test_samples_per_second: 1289.025
test_steps_per_second: 2.354

=== Classification report ===
              precision    recall  f1-score   support

           0     0.9489    0.9723    0.9605       975
           1     0.7188    0.5750    0.6389       120

    accuracy                         0.9288      1095
   macro avg     0.8338    0.7737    0.7997      1095
weighted avg     0.9237    0.9288    0.9252      1095


ðŸ“„ Saved per-sample predictions to: deberta_stormfront_tda_output/test_predictions.csv


Map:   0%|          | 0/5512 [00:00<?, ? examples/s]



=== Metrics from compute_metrics ===
test_loss: 0.4267905056476593
test_accuracy: 0.9029390420899854
test_macro_f1: 0.8962149819444716
test_f1_hate: 0.8697980043806279
test_f1_no_hate: 0.9226319595083152
test_f0.5: 0.8529832935560859
test_f2: 0.8872889771598809
test_auc: 0.9530866233353679
test_precision_hate: 0.8421300659754948
test_recall_hate: 0.8993457473578259
test_weighted_f1: 0.9035860471645717
test_weighted_precision: 0.9053606384839253
test_weighted_recall: 0.9029390420899854
test_runtime: 4.3757
test_samples_per_second: 1259.681
test_steps_per_second: 1.828

=== Classification report ===
              precision    recall  f1-score   support

           0     0.9410    0.9050    0.9226      3525
           1     0.8421    0.8993    0.8698      1987

    accuracy                         0.9029      5512
   macro avg     0.8916    0.9022    0.8962      5512
weighted avg     0.9054    0.9029    0.9036      5512


ðŸ“„ Saved per-sample predictions to: deberta_reddit_gab_tda_output

Map:   0%|          | 0/8276 [00:00<?, ? examples/s]



=== Metrics from compute_metrics ===
test_loss: 0.6262568235397339
test_accuracy: 0.862010633156114
test_macro_f1: 0.8455469833230738
test_f1_hate: 0.7951202009329028
test_f1_no_hate: 0.8959737657132447
test_f0.5: 0.7985585585585585
test_f2: 0.7917113254733833
test_auc: 0.9215170938208381
test_precision_hate: 0.8008673653776653
test_recall_hate: 0.7894549340933381
test_weighted_f1: 0.8617669077699848
test_weighted_precision: 0.8615666813116527
test_weighted_recall: 0.862010633156114
test_runtime: 6.3384
test_samples_per_second: 1305.689
test_steps_per_second: 1.735

=== Classification report ===
              precision    recall  f1-score   support

           0     0.8927    0.8993    0.8960      5469
           1     0.8009    0.7895    0.7951      2807

    accuracy                         0.8620      8276
   macro avg     0.8468    0.8444    0.8455      8276
weighted avg     0.8616    0.8620    0.8618      8276


ðŸ“„ Saved per-sample predictions to: deberta_unified_tda_output/test