In [8]:
!pip install transformers datasets evaluate accelerate --quiet

import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
import evaluate
import torch
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import os

import os, ssl

# SECURITY NOTE(review): everything down to the `ssl` monkey-patch is meant to
# switch off TLS certificate verification for this process. That removes
# protection against tampered downloads. Keep it only on networks where
# verification genuinely cannot work (e.g. an intercepting proxy), and prefer
# pointing the CA-bundle variables at the proxy's certificate instead.
os.environ["HF_HUB_DISABLE_SSL_VERIFICATION"] = "1"
os.environ["CURL_CA_BUNDLE"] = ""
os.environ["SSL_CERT_FILE"] = ""
os.environ["REQUESTS_CA_BUNDLE"] = ""

# Makes the standard library build unverified HTTPS contexts by default.
ssl._create_default_https_context = ssl._create_unverified_context

# The tokenizers library warns when the process forks after its thread pool
# has been used (shell commands run from a notebook fork the kernel). Setting
# the variable explicitly is what the warning itself recommends; `setdefault`
# keeps any value the user has already chosen.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# RoBERTa is a built-in transformers architecture, so no repository-supplied
# code is needed. `trust_remote_code=True` was dropped because it would allow
# downloaded code to execute, which is especially risky with TLS checks off.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [9]:
# Read the three pre-split partitions; each file is JSON Lines
# (one record per line), hence `lines=True`.
train_df, val_df, test_df = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (
        "data/PStance/processed/train.jsonl",
        "data/PStance/processed/val.jsonl",
        "data/PStance/processed/test.jsonl",
    )
)

# Show which columns the training split provides.
print(train_df.columns)


Index(['text', 'stance', 'input_ids', 'attention_mask'], dtype='object')


In [10]:
# Load the cleaned CSV export and preview its first rows.
cleaned_csv_path = "data/PStance/processed/cleaned_pstance.csv"
df = pd.read_csv(cleaned_csv_path)
df.head()


Unnamed: 0,text,Stance,clean_tweet,clean_target
0,"i endorse bernie for tons of reasons, but this...",FAVOR,"i endorse bernie for tons of reasons, but this...",bernie sanders
1,a big problem wbernie left is not only preoccu...,AGAINST,a big problem wbernie left is not only preoccu...,bernie sanders
2,this poll is not reflecting anything: age was ...,AGAINST,this poll is not reflecting anything: age was ...,bernie sanders
3,so proud how is shedding light on who is truly...,FAVOR,so proud how is shedding light on who is truly...,bernie sanders
4,"according to media bias fact checker, you have...",FAVOR,"according to media bias fact checker, you have...",bernie sanders


In [11]:
# Stance name <-> integer class id lookups.
label2id = {"favor": 0, "against": 1, "none": 2}
id2label = {v: k for k, v in label2id.items()}


def _encode_stance(frame, split_name):
    """Return a copy of ``frame`` with an integer ``labels`` column.

    The ``stance`` column is lower-cased and mapped through ``label2id``.
    Rows whose stance has no entry in ``label2id`` are dropped, and the
    number of dropped rows is printed so the loss is not silent.

    Args:
        frame: DataFrame containing a ``stance`` column.
        split_name: Name used only in the printed message.

    Returns:
        A new DataFrame; the input frame is left unmodified.
    """
    codes = frame["stance"].str.lower().map(label2id)
    n_unmapped = int(codes.isna().sum())
    if n_unmapped:
        print(f"[{split_name}] dropping {n_unmapped} row(s) with an unmapped stance")
    return (
        frame.assign(labels=codes)
        .dropna(subset=["labels"])
        .astype({"labels": int})
    )


# Map stance → numeric. Rebinding each name (instead of looping with a
# variable called `df` and mutating in place) keeps the notebook-level `df`
# from the previous cell intact and makes this cell safe to re-run.
train_df = _encode_stance(train_df, "train")
val_df = _encode_stance(val_df, "val")
test_df = _encode_stance(test_df, "test")


In [12]:
from datasets import Dataset

# Wrap each pandas split in a Dataset object.
train_ds, valid_ds, test_ds = (
    Dataset.from_pandas(split_frame)
    for split_frame in (train_df, val_df, test_df)
)

# Names of the columns to retain for training.
cols_to_keep = ["input_ids", "attention_mask", "labels"]


In [13]:
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Detect text column: prefer "text", then "tweet". As a last resort take the
# first column that is not a label, a target, or pre-tokenized output, so the
# fallback can never pick e.g. "input_ids" as the raw text.
_NON_TEXT_COLUMNS = {"labels", "stance", "target", "input_ids", "attention_mask"}

if "text" in train_df.columns:
    text_col = "text"
elif "tweet" in train_df.columns:
    text_col = "tweet"
else:
    text_col = next(
        (c for c in train_df.columns if c not in _NON_TEXT_COLUMNS),
        None,
    )
    if text_col is None:
        # Fail with an actionable message instead of an IndexError.
        raise ValueError(
            "Could not find a text column in train_df; "
            f"available columns: {list(train_df.columns)}"
        )

# When a "target" column exists, the tokenizer input combines it with the text.
use_target = "target" in train_df.columns

def tokenize_function(batch):
    """Tokenize one batch of examples.

    Reads the notebook-level ``tokenizer``, ``text_col`` and ``use_target``.

    When ``use_target`` is true, the target and the text are passed to the
    tokenizer as a (text, text_pair) sequence pair so that the tokenizer
    inserts its own separator tokens. Joining them by hand with a literal
    "[SEP]" only works for tokenizers whose separator is spelled that way;
    for others the marker would be treated as ordinary text.

    Args:
        batch: Mapping from column name to a list of values (the shape
            produced by ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for the batch, truncated to the
        tokenizer's maximum length.
    """
    if use_target:
        return tokenizer(
            text=batch["target"],
            text_pair=batch[text_col],
            truncation=True,
        )
    return tokenizer(batch[text_col], truncation=True)

# Tokenize every split batch-wise with the same function.
train_ds, valid_ds, test_ds = (
    split.map(tokenize_function, batched=True)
    for split in (train_ds, valid_ds, test_ds)
)


Map: 100%|██████████| 15101/15101 [00:01<00:00, 9507.85 examples/s] 
Map: 100%|██████████| 3236/3236 [00:00<00:00, 10674.07 examples/s]
Map: 100%|██████████| 3237/3237 [00:00<00:00, 5249.20 examples/s]


In [14]:
# Keep needed columns
cols_to_keep = ["input_ids", "attention_mask", "labels"]


def _drop_extra_columns(dataset):
    """Return ``dataset`` without any column that is not in ``cols_to_keep``."""
    extras = [name for name in dataset.column_names if name not in cols_to_keep]
    return dataset.remove_columns(extras)


train_ds = _drop_extra_columns(train_ds)
valid_ds = _drop_extra_columns(valid_ds)
test_ds = _drop_extra_columns(test_ds)

# Collator built from the tokenizer; passed to the Trainer later on.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [15]:
# Build the classifier from the pretrained checkpoint. The label maps and the
# number of classes come from the `label2id` / `id2label` dictionaries defined
# in the label-mapping cell, so the model config cannot drift from the ids
# actually written into the `labels` column.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

# Metric objects used by compute_metrics.
accuracy, f1 = (evaluate.load(metric_id) for metric_id in ("accuracy", "f1"))

def compute_metrics(eval_pred):
    """Score a batch of predictions.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            row is the index of its largest logit along axis 1.

    Returns:
        Dict with ``"accuracy"`` and ``"f1"`` (macro-averaged), computed by
        the notebook-level ``accuracy`` and ``f1`` metric objects.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    accuracy_result = accuracy.compute(predictions=predicted, references=gold)
    f1_result = f1.compute(predictions=predicted, references=gold, average="macro")

    return {
        "accuracy": accuracy_result["accuracy"],
        "f1": f1_result["f1"],
    }


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Fine-tuning configuration. Evaluation and checkpointing share the same step
# interval so that the best checkpoint can be restored at the end of training.
training_args = TrainingArguments(
    output_dir="./roberta_results",

    do_train=True,
    do_eval=True,

    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    num_train_epochs=3,
    weight_decay=0.01,

    # NEW TRANSFORMERS API (4.57.3)
    eval_strategy="steps",
    save_strategy="steps",
    logging_strategy="steps",

    logging_steps=100,
    save_steps=500,
    eval_steps=500,

    load_best_model_at_end=True,
    # Must name a key returned by compute_metrics ("accuracy" or "f1").
    # The previous value "f1_macro" matched neither, so selecting the best
    # checkpoint failed on a missing metric.
    metric_for_best_model="f1",
    greater_is_better=True,

    report_to="none"
)


In [17]:
# Assemble the Trainer and start fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    # `processing_class` replaces the deprecated `tokenizer` argument in the
    # transformers release this notebook targets (the one that also uses
    # `eval_strategy`); passing `tokenizer=` emits a FutureWarning there.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()

  trainer = Trainer(


: 

In [None]:
# Run inference on the test split once. `predict` returns the metrics together
# with the raw predictions, so a separate `evaluate` pass over the same data is
# not needed. `metric_key_prefix="eval"` keeps the metric names prefixed with
# "eval_", as `evaluate` would report them.
preds = trainer.predict(test_ds, metric_key_prefix="eval")
test_results = preds.metrics
print("Test results:", test_results)

# Confusion matrix over the three class ids.
y_true = preds.label_ids
y_pred = preds.predictions.argmax(axis=1)
cm = confusion_matrix(y_true, y_pred, labels=[0,1,2])
disp = ConfusionMatrixDisplay(cm, display_labels=["favor","against","none"])
disp.plot(values_format="d")
plt.title("RoBERTa — Confusion Matrix (P-Stance)")
plt.show()


In [None]:
# RUN THIS ONLY ONE TIME
# Predict on the test split and persist gold labels and predicted class ids.
# `pred_output` is also read by the classification-report cell below.
pred_output = trainer.predict(test_ds)

for npy_name, values in (
    ("y_true.npy", pred_output.label_ids),
    ("y_pred.npy", np.argmax(pred_output.predictions, axis=1)),
):
    np.save(npy_name, values)

print("Saved predictions.")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
# Gold labels and predicted class ids from the stored prediction output.
y_true = pred_output.label_ids
y_pred = np.argmax(pred_output.predictions, axis=1)

# ============================
# CLASSIFICATION REPORT VISUAL
# ============================

# Display names, in class-id order (0, 1, 2).
classes = ["favor","against","none"]

# Per-class report as a nested dict; zero_division=0 reports 0 for a class
# with no samples instead of warning.
report = classification_report(
    y_true,
    y_pred,
    labels=[0,1,2],
    target_names=classes,
    output_dict=True,
    zero_division=0
)

# One list per metric, each ordered like `classes`.
f1_scores, precision, recall = (
    [report[class_name][metric_key] for class_name in classes]
    for metric_key in ("f1-score", "precision", "recall")
)
