In [None]:
import random # used for shuffling values
import numpy as np # used for arrays, math operations, seed control
import torch # the deep learning framework used by HuggingFace models

from datasets import load_dataset, concatenate_datasets, DatasetDict, Value # load datasets from HF Hub, merge datasets, create/train/val/test splits
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
# AutoTokenizer = Automatically loads the correct tokenizer for whatever model you use.
# AutoModelForSequenceClassification = Loads the neural network classifier head (for sarcasm detection).
# TrainingArguments = Defines the training hyperparameters
# Trainer = Training Loops
import evaluate # Just evaluate things (Giving F1 scores etc)

SEED = 42  # one seed shared by every RNG and by the dataset shuffles
# Seed Python, NumPy and PyTorch so a rerun draws the same random numbers.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Prefer the GPU whenever PyTorch can see a CUDA device, otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print("Using device:", device)
if device == "cuda":
    props = torch.cuda.get_device_properties(0)  # properties of GPU index 0
    vram_gb = props.total_memory / 1024**3  # bytes -> GiB
    print("GPU:", props.name)
    print("Total VRAM (GB):", round(vram_gb, 2))


Using device: cuda
GPU: NVIDIA GeForce RTX 4070 Laptop GPU
Total VRAM (GB): 8.0


Setting Global Parameters

In [None]:
MODEL_NAME = "roberta-base"  # checkpoint name used for both the tokenizer and the classifier
MAX_SAMPLES_PER_DATASET = 200_000  # upper bound on rows taken from any single source
MAX_LENGTH = 96  # every tokenized text is limited to this many tokens


def cap(ds, max_samples, name):
    """Limit a dataset to at most ``max_samples`` rows.

    Args:
        ds: dataset to limit; it must support ``len()`` and, when it is too
            large, ``shuffle(seed=...)`` followed by ``select(...)``.
        max_samples: maximum number of rows to keep.
        name: label used in the printed log line.

    Returns:
        ``ds`` itself when it already fits, otherwise a seeded random subset
        of exactly ``max_samples`` rows.
    """
    total = len(ds)
    if total > max_samples:
        # Too big: shuffle with the global seed, then keep the first max_samples rows.
        print(f"{name}: {total} -> capping to {max_samples}")
        return ds.shuffle(seed=SEED).select(range(max_samples))
    # Small enough already: hand the dataset back untouched.
    print(f"{name}: using {total}")
    return ds

# This function keeps only the first max_samples randomly shuffled rows from a dataset so huge datasets don’t dominate training.
# Cap is for limiting the dataset so one huge dataset cannot dominate training, limit each dataset to at most max_samples examples, in a random way
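# ds = the dataset to cap, max_samples = the maximum number of rows to keep, name = the label printed in the log line.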


Loading and Preprocessing the Datasets

In [None]:
from datasets import Value

all_datasets = []  # every cleaned split is collected here and concatenated later

# TweetEval – irony
tweet_eval = load_dataset("cardiffnlp/tweet_eval", "irony")  # loads the dataset splits


def force_int(example):
    """Return the row's label converted to a plain Python int."""
    return {"label": int(example["label"])}


# Bring every split to the shared schema (text, int64 label, source);
# mismatched feature types break concatenation.
# NOTE(review): the "test" split is pooled with the others and re-split at random
# later, so a later evaluation on tweet_eval "test" is not held-out data.
for split in ["train", "validation", "test"]:
    ds = tweet_eval[split].map(force_int)
    ds = ds.cast_column("label", Value("int64"))
    # The text column is already named "text", so no rename is needed.
    if "source" in ds.column_names:
        ds = ds.remove_columns("source")  # drop a pre-existing source column, if any
    ds = ds.add_column("source", ["tweet_eval_irony"] * len(ds))  # tag rows for tracking later
    all_datasets.append(ds)  # store the cleaned split
# Make sure its all the same schema since mismatched feature types break concatenation.

# News headlines sarcasm
# Only the "train" split is used; columns are renamed to the shared text/label schema.
news = (
    load_dataset("Heschmat/news-headlines-dataset-sarcasm-detection")["train"]
    .rename_columns({"headline": "text", "is_sarcastic": "label"})
    .map(lambda ex: {"label": int(ex["label"])})  # plain Python ints
    .cast_column("label", Value("int64"))  # same dtype as every other source
)
if "source" in news.column_names:
    news = news.remove_columns("source")
news = news.add_column("source", ["news_headlines"] * len(news))
# The recorded run logged 26,709 rows, which is below the cap, so cap() returns the data unchanged.
all_datasets.append(cap(news, MAX_SAMPLES_PER_DATASET, "news_headlines"))


news_headlines: using 26709


In [None]:
from datasets import load_dataset, Value

# Daniel2588 Sarcasm Dataset (Reddit comments)

# Peek at the column names before writing the cleaning code.
# NOTE(review): this loads "daniel2588/sarcasm", while the processing cell loads
# "daniel2588/sarcasmdata" — confirm which repository is intended.
sar_dan = load_dataset("daniel2588/sarcasm")
print(sar_dan["train"].column_names)




['label', 'comment', 'author', 'subreddit', 'score', 'ups', 'downs', 'date', 'created_utc', 'parent_comment']


In [None]:
# Daniel2588 Sarcasm Dataset (Reddit comments)

# NOTE(review): the recorded output for this repository lists only the columns
# ['label', 'text'] — verify the schema the cleaning functions below expect.
sar_dan = load_dataset("daniel2588/sarcasmdata")["train"]  # only train
print("daniel train size:", len(sar_dan))
print("daniel columns:", sar_dan.column_names)

def daniel_merge_context(example):
    """Build the unified ``text`` field for one row.

    The result is ``parent_comment [SEP] comment`` when a parent comment is
    present, otherwise just the comment. When the row has no ``comment``
    value, the row's existing ``text`` value is used instead, so a dataset
    that already ships a ``text`` column is not overwritten with empty strings.

    Args:
        example: mapping for one dataset row.

    Returns:
        ``{"text": <merged string>}``.
    """
    parent = (example.get("parent_comment") or "").strip()
    # Prefer the Reddit-style "comment" column; fall back to an existing "text" column.
    comment = (example.get("comment") or example.get("text") or "").strip()

    if parent:
        return {"text": parent + " [SEP] " + comment}
    return {"text": comment}
# Build a single 'text' field:  parent_comment [SEP] comment ,if parent_comment is empty, just use comment.
def daniel_clean_label(example):
    """Coerce the row's label to an int and clamp it into the range 0..1.

    A missing ``label`` key is treated as 0.
    """
    value = int(example.get("label", 0))
    # Values below 0 become 0 and values above 1 become 1.
    return {"label": min(max(value, 0), 1)}
# daniel_clean_label clamps the label to 0/1 (1 = sarcastic, 0 = not sarcastic).
# Create the unified 'text' field using parent_comment + comment.
# Unified text first, then labels cleaned and stored as int64.
dan_ds = (
    sar_dan
    .map(daniel_merge_context)
    .map(daniel_clean_label)
    .cast_column("label", Value("int64"))
)

# Everything except text + label (author, subreddit, score, ...) is dropped.
cols_to_keep = {"text", "label"}
cols_to_drop = [col for col in dan_ds.column_names if col not in cols_to_keep]
dan_ds = dan_ds.remove_columns(cols_to_drop)

# Tag every row with its origin so it can be tracked after concatenation.
if "source" in dan_ds.column_names:
    dan_ds = dan_ds.remove_columns("source")
source_tags = ["daniel_sarcasm"] * len(dan_ds)
dan_ds = dan_ds.add_column("source", source_tags)

# Cap the size so this source cannot dominate the others, then register it.
dan_ds = cap(dan_ds, MAX_SAMPLES_PER_DATASET, "daniel_sarcasm_train")
all_datasets.append(dan_ds)


daniel train size: 28111
daniel columns: ['label', 'text']
daniel_sarcasm_train: using 28111


In [None]:

# jokerdd0727/Sarcasm_Detection Sarcasm Dataset

# Peek at the column names before writing the cleaning code for this dataset.
# NOTE(review): this handle is spelled sar_Joker (capital J) and is not referenced
# again in the visible cells; the processing cell reloads the data as sar_joker.
sar_Joker = load_dataset("jokerdd0727/Sarcasm_Detection")  # loads available splits (e.g. 'train', maybe 'test')
print(sar_Joker["train"].column_names)

['is_sarcastic', 'headline', 'article_link']


In [None]:
# jokerdD0727/Sarcasm_Detection  Dataset

sar_joker = load_dataset("jokerdD0727/Sarcasm_Detection")["train"]
print("joker columns:", sar_joker.column_names)
print("joker size:", len(sar_joker))

# Shared schema: headline -> text, is_sarcastic -> label, label stored as int64.
sar_joker = (
    sar_joker
    .rename_columns({"headline": "text", "is_sarcastic": "label"})
    .map(lambda ex: {"label": int(ex["label"])})
    .cast_column("label", Value("int64"))
)

# Drop every column that is not part of the shared schema.
cols_to_keep = {"text", "label"}
cols_to_drop = [col for col in sar_joker.column_names if col not in cols_to_keep]
sar_joker = sar_joker.remove_columns(cols_to_drop)

# Tag rows with their origin.
if "source" in sar_joker.column_names:
    sar_joker = sar_joker.remove_columns("source")
source_tags = ["sarcasm_joker"] * len(sar_joker)
sar_joker = sar_joker.add_column("source", source_tags)

# Apply the per-source cap and register the result in the master list.
sar_joker = cap(sar_joker, MAX_SAMPLES_PER_DATASET, "sarcasm_joker")
all_datasets.append(sar_joker)

print("joker cleaned:", sar_joker)


joker columns: ['is_sarcastic', 'headline', 'article_link']
joker size: 28619
sarcasm_joker: using 28619
joker cleaned: Dataset({
    features: ['label', 'text', 'source'],
    num_rows: 28619
})


In [None]:
# marcbishara/sarcasm-on-reddit  Dataset

# Load every split; only the two SFT splits are consumed further down in this cell.
sar_marc = load_dataset("marcbishara/sarcasm-on-reddit")  # has splits
print("marc splits:", sar_marc.keys())

def marc_merge_context(example):
    """Return ``{"text": ...}`` built from the row's parent comment and comment.

    Both fields are stripped; a missing or empty parent comment yields the
    comment alone, otherwise the two are joined with " [SEP] ".
    """
    parent = (example.get("parent_comment") or "").strip()
    comment = (example.get("comment") or "").strip()
    return {"text": f"{parent} [SEP] {comment}" if parent else comment}
# Build a single 'text' field:  parent_comment [SEP] comment ,if parent_comment is empty, just use comment.
def marc_label_to_int(example):
    """Map the row's label to 0/1.

    String labels become 1 only when they equal "sarcastic" after stripping
    and lower-casing; any other label is converted with ``int()`` and clamped
    into the range 0..1.
    """
    lab = example.get("label")
    if isinstance(lab, str):
        return {"label": int(lab.strip().lower() == "sarcastic")}
    return {"label": min(1, max(0, int(lab)))}
# Convert label: 'sarcastic' = 1, any other string = 0 (numeric labels are clamped to 0/1)
# We only use the supervised SFT splits for classification
for split in ["sft_train", "sft_validation"]:
    if split not in sar_marc:
        continue  # skip a split this copy of the dataset does not provide

    source_tag = f"marc_reddit_{split}"

    # Context-aware text, 0/1 labels, int64 dtype.
    ds = (
        sar_marc[split]
        .map(marc_merge_context)
        .map(marc_label_to_int)
        .cast_column("label", Value("int64"))
    )

    # Reduce to the shared schema (text + label).
    cols_to_keep = {"text", "label"}
    cols_to_drop = [col for col in ds.column_names if col not in cols_to_keep]
    ds = ds.remove_columns(cols_to_drop)

    # Tag rows with their origin.
    if "source" in ds.column_names:
        ds = ds.remove_columns("source")
    ds = ds.add_column("source", [source_tag] * len(ds))

    # Cap so this large source does not dominate training.
    ds = cap(ds, MAX_SAMPLES_PER_DATASET, source_tag)
    all_datasets.append(ds)


marc splits: dict_keys(['holdout', 'sft_train', 'sft_validation', 'reward_train', 'reward_validation', 'ppo_train', 'ppo_validation'])
marc_reddit_sft_train: 272922 -> capping to 200000
marc_reddit_sft_validation: using 30325


In [None]:
# Shankhadeep144/sarcastic  (sarcastic vs not sarcastic)
# Keep the whole DatasetDict: every split is cleaned and appended below.
sar_shank = load_dataset("Shankhadeep144/sarcastic")


def shank_fix_labels(example):
    """Convert the string label to an int: "sarcastic" -> 1, anything else -> 0."""
    lbl = example["label"].strip().lower()
    return {"label": 1 if lbl == "sarcastic" else 0}


# The cleaning steps run per split at cell level. Placed after the `return`
# inside shank_fix_labels they would never execute and nothing would be appended.
for split in sar_shank.keys():
    ds = sar_shank[split]

    # Map text + clean label
    ds = ds.rename_columns({"headline": "text"})
    ds = ds.map(shank_fix_labels)

    # Ensure correct dtype
    ds = ds.cast_column("label", Value("int64"))

    # Remove old "source" if it exists
    if "source" in ds.column_names:
        ds = ds.remove_columns("source")

    # Add our standardized source tag
    ds = ds.add_column("source", ["shankhadeep_sarcastic"] * len(ds))

    # Append
    all_datasets.append(ds)

print("Added Shankhadeep dataset splits:", sar_shank.keys())


Added Shankhadeep dataset splits: dict_keys(['train', 'test'])


In [None]:
# salsabilahasna/Sarcasm_Dataset

sar_salsa = load_dataset("salsabilahasna/Sarcasm_Dataset")
print("salsabila splits:", sar_salsa.keys())

for split in sar_salsa.keys():   # 'train' and 'test'
    source_tag = f"salsabila_sarcasm_{split}"

    # Shared schema: "Y" becomes the int64 label (0 = not sarcastic, 1 = sarcastic).
    ds = (
        sar_salsa[split]
        .rename_columns({"text": "text", "Y": "label"})
        .map(lambda ex: {"label": int(ex["label"])})
        .cast_column("label", Value("int64"))
    )

    # Replace any existing 'source' column with our standardized tag.
    if "source" in ds.column_names:
        ds = ds.remove_columns("source")
    ds = ds.add_column("source", [source_tag] * len(ds))

    # Both splits are far below the cap (20k + 8.5k); the call is kept for consistency.
    ds = cap(ds, MAX_SAMPLES_PER_DATASET, source_tag)

    all_datasets.append(ds)

print("Added salsabilahasna/Sarcasm_Dataset")


salsabila splits: dict_keys(['train', 'test'])
salsabila_sarcasm_train: using 20033
salsabila_sarcasm_test: using 8586
Added salsabilahasna/Sarcasm_Dataset


In [None]:
# Sanity check before concatenation: every entry should expose text/label/source
# with matching dtypes (the recorded output shows the column order can differ).
print("Loaded datasets:", len(all_datasets))
for ds in all_datasets:
    print(ds.features)

Loaded datasets: 12
{'text': Value('string'), 'label': Value('int64'), 'source': Value('string')}
{'text': Value('string'), 'label': Value('int64'), 'source': Value('string')}
{'text': Value('string'), 'label': Value('int64'), 'source': Value('string')}
{'text': Value('string'), 'label': Value('int64'), 'source': Value('string')}
{'label': Value('int64'), 'text': Value('string'), 'source': Value('string')}
{'label': Value('int64'), 'text': Value('string'), 'source': Value('string')}
{'label': Value('int64'), 'text': Value('string'), 'source': Value('string')}
{'label': Value('int64'), 'text': Value('string'), 'source': Value('string')}
{'text': Value('string'), 'label': Value('int64'), 'source': Value('string')}
{'text': Value('string'), 'label': Value('int64'), 'source': Value('string')}
{'label': Value('int64'), 'text': Value('string'), 'source': Value('string')}
{'label': Value('int64'), 'text': Value('string'), 'source': Value('string')}


In [None]:
combined = concatenate_datasets(all_datasets)
print("Total examples:", len(combined))


def normalize_label(example):
    """Return the row's label as a plain Python int."""
    return {"label": int(example["label"])}


combined = combined.map(normalize_label)

# Deduplicate on the text before splitting. Several sources appear to overlap
# (the recorded run shows 28,619 rows for the joker set and 20,033 + 8,586 = 28,619
# for the salsabila splits), and a random split would otherwise place copies of
# the same text in train and in validation/test, inflating the scores.
# Texts are compared case- and whitespace-insensitively; the first occurrence wins.
first_row_by_text = {}
for row_idx, text in enumerate(combined["text"]):
    text_key = " ".join((text or "").split()).lower()
    first_row_by_text.setdefault(text_key, row_idx)
unique_rows = list(first_row_by_text.values())  # insertion order == ascending row index
print("Duplicate texts removed:", len(combined) - len(unique_rows))
combined = combined.select(unique_rows)

# 80% train, then the remaining 20% is halved into validation and test.
tmp = combined.train_test_split(test_size=0.2, seed=SEED)
val_test = tmp["test"].train_test_split(test_size=0.5, seed=SEED)

dataset_dict = DatasetDict(
    train=tmp["train"],
    validation=val_test["train"],
    test=val_test["test"],
)

# The three splits must partition the deduplicated data exactly.
assert sum(len(part) for part in dataset_dict.values()) == len(combined)

print(dataset_dict)
print("Train size:", len(dataset_dict["train"]))
print("Val size:", len(dataset_dict["validation"]))
print("Test size:", len(dataset_dict["test"]))



Total examples: 375603


Map:   0%|          | 0/375603 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'source'],
        num_rows: 300482
    })
    validation: Dataset({
        features: ['text', 'label', 'source'],
        num_rows: 37560
    })
    test: Dataset({
        features: ['text', 'label', 'source'],
        num_rows: 37561
    })
})
Train size: 300482
Val size: 37560
Test size: 37561


In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def preprocess(examples):
    """Tokenize a batch of rows, truncating and padding every text to MAX_LENGTH tokens."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=MAX_LENGTH)


# batched=True passes whole batches to preprocess; the raw string columns are
# removed so only the label and the tokenizer outputs remain.
encoded = dataset_dict.map(preprocess, batched=True, remove_columns=["text", "source"])

encoded


Map:   0%|          | 0/300482 [00:00<?, ? examples/s]

Map:   0%|          | 0/37560 [00:00<?, ? examples/s]

Map:   0%|          | 0/37561 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 300482
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 37560
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 37561
    })
})

In [None]:
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from a ``(logits, labels)`` pair.

    The predicted class of each row is the argmax over the last logits axis.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    accuracy = accuracy_metric.compute(predictions=preds, references=labels)
    macro_f1 = f1_metric.compute(predictions=preds, references=labels, average="macro")
    return {"accuracy": accuracy["accuracy"], "f1": macro_f1["f1"]}


In [None]:
# Pretrained encoder with a freshly initialized 2-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
)

training_args = TrainingArguments(
    output_dir="./ultimate_sarcasm_roberta",
    # Evaluate and checkpoint on the same step schedule so the best checkpoint
    # can be restored at the end of training.
    eval_strategy="steps",
    eval_steps=2000,
    save_strategy="steps",
    save_steps=2000,
    save_total_limit=2,
    learning_rate=2e-5,
    # Effective train batch size per device: 8 * 2 accumulation steps = 16.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_ratio=0.05,
    logging_steps=100,
    fp16=True,
    report_to=["none"],
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # the "f1" key returned by compute_metrics
    greater_is_better=True,
    seed=SEED,  # make the training seed explicit and tied to the notebook seed
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded["train"],
    eval_dataset=encoded["validation"],
    # `tokenizer=` is deprecated for Trainer.__init__; `processing_class` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [None]:
# Run fine-tuning. With load_best_model_at_end=True and metric_for_best_model="f1",
# the checkpoint with the best validation F1 is reloaded when training finishes.
trainer.train()


Step,Training Loss,Validation Loss,Accuracy,F1
2000,0.5547,0.487302,0.740335,0.7402
4000,0.5019,0.481403,0.754047,0.752466
6000,0.5031,0.451706,0.771353,0.770452
8000,0.4736,0.429259,0.776118,0.775951
10000,0.4461,0.42159,0.785596,0.784956
12000,0.4649,0.411535,0.790495,0.788902
14000,0.4224,0.424752,0.783733,0.783694
16000,0.4406,0.40442,0.79574,0.793423
18000,0.4065,0.407644,0.800053,0.798423
20000,0.3726,0.402783,0.802689,0.802686


TrainOutput(global_step=56343, training_loss=0.3774070675934224, metrics={'train_runtime': 12413.8086, 'train_samples_per_second': 72.616, 'train_steps_per_second': 4.539, 'total_flos': 4.447132657688448e+16, 'train_loss': 0.3774070675934224, 'epoch': 3.0})

In [None]:
# Score the model on the test split produced by the random split, then persist it.
print("Evaluating on test set...")
metrics = trainer.evaluate(encoded["test"])
print("Test metrics:", metrics)

# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
save_path = "D:/hf_models/ultimate_sarcasm_detector_v2"
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)  # stored in the same folder as the model

print("Model & tokenizer saved to:", save_path)


Evaluating on test set...


Test metrics: {'eval_loss': 0.4203392565250397, 'eval_accuracy': 0.8128111605122335, 'eval_f1': 0.812504113675636, 'eval_runtime': 86.1941, 'eval_samples_per_second': 435.772, 'eval_steps_per_second': 54.482, 'epoch': 3.0}
Model & tokenizer saved to: D:/hf_models/ultimate_sarcasm_detector_v2


In [None]:
label_map = {0: "NOT SARCASTIC", 1: "SARCASTIC"}


def predict_sarcasm(text: str, context: str | None = None):
    """Classify ``text`` and print the prediction with both class probabilities.

    Args:
        text: the sentence or reply to classify.
        context: optional parent message. When given, it is joined to ``text``
            as ``context [SEP] text``, the same layout the Reddit training rows
            use, so the model sees inference inputs in the format it was
            trained on.

    Returns:
        None; the result is printed.
    """
    context = (context or "").strip()
    if context:
        full_text = context + " [SEP] " + text.strip()
    else:
        full_text = text

    # Put the inputs on whatever device the model currently lives on.
    inputs = tokenizer(
        full_text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
    ).to(model.device)

    model.eval()  # disable dropout for inference
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1)[0].cpu().numpy()

    pred_id = int(probs.argmax())
    print("Text:", text)
    print("Prediction:", label_map[pred_id])
    print(f"P(Not Sarcastic): {probs[0]:.3f}")
    print(f"P(Sarcastic):     {probs[1]:.3f}")

# Example:
predict_sarcasm("Wow, my code crashing 10 times in a row is exactly not what I needed today.")


Text: Wow, my code crashing 10 times in a row is exactly not what I needed today.
Prediction: SARCASTIC
P(Not Sarcastic): 0.025
P(Sarcastic):     0.975


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

model_path = r"D:\hf_models\ultimate_sarcasm_detector_v2"

# Reload the saved tokenizer and model from disk and switch the model to inference mode.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

label_map = {0: "NOT_SARCASTIC", 1: "SARCASTIC"}


def predict(text):
    """Classify ``text`` and return the predicted label with both class probabilities.

    Returns:
        dict with the keys ``text``, ``prediction``, ``prob_not_sarcastic``
        and ``prob_sarcastic``.
    """
    encoded_text = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    # Move every tensor to the device the model was placed on.
    model_inputs = {name: tensor.to(device) for name, tensor in encoded_text.items()}

    with torch.no_grad():
        logits = model(**model_inputs).logits
        probs = F.softmax(logits, dim=-1)[0]

    pred_id = int(torch.argmax(probs))
    result = {
        "text": text,
        "prediction": label_map[pred_id],
        "prob_not_sarcastic": float(probs[0]),
        "prob_sarcastic": float(probs[1]),
    }
    return result

# Example
print(predict("This is very bad"))


{'text': 'This is very bad', 'prediction': 'NOT_SARCASTIC', 'prob_not_sarcastic': 0.8270643949508667, 'prob_sarcastic': 0.1729355752468109}


In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, Value
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): absolute, machine-specific path to the saved fine-tuned model.
model_path = r"D:\hf_models\ultimate_sarcasm_detector_v2"


# 1. LOAD MODEL
# The tokenizer and the model are read from the same folder; the model is moved to `device`.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
print("Model loaded!")


# 2. CHOOSE WHICH TRAINING DATASET TO TEST ON

DATASET = "Sar"  # one of: tweet_eval, news, marc, Shank, Sar, DrDavis

# Every branch must leave `ds` holding a dataset with a string "text" column
# and an integer "label" column (1 = sarcastic, 0 = not sarcastic).
if DATASET == "tweet_eval":
    # Columns are already named text/label, so nothing needs renaming.
    ds = load_dataset("cardiffnlp/tweet_eval", "irony")["test"]
    LABEL_MAP = {0: 0, 1: 1}

elif DATASET == "news":
    ds = load_dataset("Heschmat/news-headlines-dataset-sarcasm-detection")["train"]
    ds = ds.rename_columns({"headline": "text", "is_sarcastic": "label"})
    LABEL_MAP = {0: 0, 1: 1}

elif DATASET == "marc":
    ds = load_dataset("marcbishara/sarcasm-on-reddit")["sft_validation"]

    def marc_merge_context(example):
        """Join parent_comment and comment with " [SEP] " (comment alone if no parent)."""
        parent = (example.get("parent_comment") or "").strip()
        comment = (example.get("comment") or "").strip()
        if parent:
            text = parent + " [SEP] " + comment
        else:
            text = comment
        return {"text": text}

    def marc_label_to_int(example):
        """Map "sarcastic" to 1 and other strings to 0; clamp numeric labels to 0..1."""
        lab = example.get("label")
        if isinstance(lab, str):
            lab_str = lab.strip().lower()
            y = 1 if lab_str == "sarcastic" else 0
        else:
            y = int(lab)
            if y < 0: y = 0
            if y > 1: y = 1
        return {"label": y}

    ds = ds.map(marc_merge_context)
    ds = ds.map(marc_label_to_int)
    ds = ds.cast_column("label", Value("int64"))
    cols_to_keep = {"text", "label"}
    cols_to_drop = [c for c in ds.column_names if c not in cols_to_keep]
    ds = ds.remove_columns(cols_to_drop)

elif DATASET == "Shank":
    ds = load_dataset("Shankhadeep144/sarcastic")["test"]

    def shank_fix_labels(example):
        """Convert the string label to an int: "sarcastic" -> 1, anything else -> 0."""
        lbl = example["label"].strip().lower()
        return {"label": 1 if lbl == "sarcastic" else 0}

    ds = ds.rename_columns({"headline": "text"})
    ds = ds.map(shank_fix_labels)
    ds = ds.cast_column("label", Value("int64"))

elif DATASET == "Sar":
    # Load straight into `ds`: assigning the split to another name would leave
    # `ds` pointing at whatever dataset an earlier cell happened to leave behind.
    ds = load_dataset("salsabilahasna/Sarcasm_Dataset")["test"]
    # This dataset stores its label in the "Y" column (the training cell renames Y -> label).
    ds = ds.rename_columns({"Y": "label"})
    ds = ds.map(lambda ex: {"label": int(ex["label"])})
    ds = ds.cast_column("label", Value("int64"))

elif DATASET == "DrDavis":
    ds = load_dataset("DrDavis/sarcasm-english")["train"]

    def david_fix_labels(example):
        """Convert the string label to an int: "sarcastic" -> 1, anything else -> 0."""
        lbl = example["label"].strip().lower()
        return {"label": 1 if lbl == "sarcastic" else 0}

    ds = ds.rename_columns({"text": "text"})
    ds = ds.map(david_fix_labels)
    ds = ds.cast_column("label", Value("int64"))


else:
    raise ValueError("Unknown dataset name!")

print(f"Dataset loaded: {DATASET} | Rows: {len(ds)}")



# 3. TOKENIZE

def tokenize(batch):
    """Tokenize a batch of texts, truncated and padded to exactly 96 tokens."""
    texts = batch["text"]
    return tokenizer(texts, max_length=96, padding="max_length", truncation=True)


encoded = ds.map(tokenize, batched=True)
# Expose only the model inputs and the label, as torch tensors.
encoded.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


# 4. RUN PREDICTIONS
model.eval()  # inference mode: dropout off
all_preds = []
all_labels = []

loader = torch.utils.data.DataLoader(encoded, batch_size=16)

# Gradients are not needed for evaluation.
with torch.no_grad():
    for batch in loader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)

        logits = model(ids, mask).logits
        # Predicted class = index of the largest logit in each row.
        all_preds.extend(logits.argmax(dim=-1).cpu().numpy())
        all_labels.extend(batch["label"].cpu().numpy())

all_preds = np.array(all_preds)
all_labels = np.array(all_labels)


# 5. CONFUSION MATRIX & REPORT

print("\n===== CLASSIFICATION REPORT =====")
report = classification_report(all_labels, all_preds, target_names=["NOT_SARCASM", "SARCASM"])
print(report)

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(all_labels, all_preds)
row_names = ["TRUE_NOT_SARC", "TRUE_SARC"]
col_names = ["PRED_NOT_SARC", "PRED_SARC"]
df_cm = pd.DataFrame(cm, index=row_names, columns=col_names)

print("\n===== CONFUSION MATRIX =====")
print(df_cm)


Model loaded!


Map:   0%|          | 0/5724 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5724 [00:00<?, ? examples/s]

Dataset loaded: Sar | Rows: 5724


Map:   0%|          | 0/5724 [00:00<?, ? examples/s]


===== CLASSIFICATION REPORT =====
              precision    recall  f1-score   support

 NOT_SARCASM       0.65      0.46      0.54      3058
     SARCASM       0.54      0.71      0.61      2666

    accuracy                           0.58      5724
   macro avg       0.59      0.59      0.58      5724
weighted avg       0.60      0.58      0.57      5724


===== CONFUSION MATRIX =====
               PRED_NOT_SARC  PRED_SARC
TRUE_NOT_SARC           1409       1649
TRUE_SARC                761       1905
