# RoBERTa Finetuning
In this notebook we will:

1. Load the cleaned users.csv and posts.csv generated by the previous notebook.
2. Build a user-level document = bio/description + tweets.
3. Split users into **train** / **validation** / **test** (stratified by label)
4. Fine-Tune **RoBERTa-base** on GPU.
5. Evaluate (accuracy, macro F1, AUROC)
6. Save the model and metrics.

# Section 1: Imports and Configurations

In [None]:
#---------------------------------------------------------------------------------#
# HuggingFace Cache Location                                                      #
#---------------------------------------------------------------------------------#
# By default, HuggingFace downloads pretrained models into the user directory     #
# (e.g., ~/.cache/huggingface/). To make the project fully reproducible and       #
# avoid polluting the user's global cache, we redirect HF_HOME to a local         #
# folder inside the project.                                                      #
#                                                                                 #
# If you prefer a different cache directory, simply modify HF_CACHE below.        #
# If the folder does not exist yet, HuggingFace will create it automatically.     #
#---------------------------------------------------------------------------------#
import os
from pathlib import Path

# Project root is the parent of the current working directory, made absolute.
ROOT = Path.cwd().parent.resolve()

# Local HuggingFace cache folder; HF_HOME is exported before the
# transformers import further down in this cell.
HF_CACHE = ROOT / "hf_cache"
os.environ.update({"HF_HOME": str(HF_CACHE)})

import json
import random
import time
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

import torch
from torch.utils.data import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

import matplotlib.pyplot as plt
import seaborn as sns

# --- Paths and Configurations --- #
# Input CSVs are read from DATA_DIR; every artefact of this notebook is
# written below OUTPUT_DIR, which is created here if missing.
DATA_DIR   = ROOT / "data/twibot22/processed"
OUTPUT_DIR = ROOT / "outputs/roberta_twibot22"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Model and hyper-parameters --- #
MODEL_NAME     = "roberta-base"
SEED           = 42
MAX_SEQ_LENGTH = 256
BATCH_SIZE     = 4
NUM_EPOCHS     = 4
LEARNING_RATE  = 2e-5

# --- Seeding every RNG used in the notebook --- #
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- Device selection (the CUDA generators are seeded only when a GPU exists) --- #
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    device = torch.device("cuda")
    print(f"Using cuda on {torch.cuda.get_device_name()}")
else:
    device = torch.device("cpu")
    print("Using cpu")

def space():
    """Print a 100-dash horizontal rule surrounded by blank lines."""
    rule = "-" * 100
    print(f"\n {rule} \n")

# Section 2: Loading cleaned Twibot-22 data

In [None]:
# Locations of the cleaned CSVs inside DATA_DIR.
users_path, posts_path = DATA_DIR / "users.csv", DATA_DIR / "posts.csv"

# One frame per file.
users_df = pd.read_csv(str(users_path))
posts_df = pd.read_csv(str(posts_path))

for frame_name, frame in (("Users", users_df), ("Posts", posts_df)):
    print(f"{frame_name} shape:", frame.shape)

space()

print("User label distribution (0: Human, 1: Bot):")
print(users_df['label'].value_counts())

# Quick visual sanity check of the first rows of each frame.
display(users_df.head())
display(posts_df.head())

# Section 3: Building user-level documents
We create a single text per user: bio/description + concatenated tweets.

In [None]:
# --- Aggregating tweets per user (concatenated text) --- #
# Missing texts are dropped before joining: astype(str) alone would turn a
# NaN into the literal token "nan" inside the user document.
posts_agg = (
    posts_df.groupby("id")["text"]
    .apply(lambda x: " ".join(x.dropna().astype(str)))
    .reset_index()
    .rename(columns={"text": "tweets_text"})
)

# --- Merging into users --- #
# Left join keeps users that have no posts; their tweets_text becomes "".
data_df = users_df.merge(posts_agg, on="id", how="left")
data_df["tweets_text"] = data_df["tweets_text"].fillna("")

# --- Explicit Bio text --- #
# astype(str) keeps the string concatenation below from failing on any
# non-string description value (same coercion as applied to the tweets).
data_df["bio_text"] = data_df["description"].fillna("").astype(str)

# Cap the raw tweet text per user (in characters) before building the document.
MAX_TWEET_CHARS = 5000
data_df["tweets_text"] = data_df["tweets_text"].str.slice(0, MAX_TWEET_CHARS)

# --- Full text = description + tweets --- #
data_df["full_text"] = (
    "Bio: " + data_df["bio_text"] + " Posts: " + data_df["tweets_text"]
).str.strip()

# --- Length stats --- #
data_df["char_length"] = data_df["full_text"].apply(len)
print("Doc length summary (chars):")
display(data_df["char_length"].describe())

space()

# --- Filtering out users with very little text --- #
# Note: char_length is measured on full_text, so it includes the
# "Bio: " / " Posts: " markers added above.
min_char = 50
data_df = data_df[data_df["char_length"] >= min_char].reset_index(drop=True)
print("Users after min_char filter:", len(data_df))
print(data_df["label"].value_counts())

In [None]:
# Split the filtered users by numeric label (0 -> humans, 1 -> bots).
is_bot = data_df["label_num"] == 1
is_human = data_df["label_num"] == 0
humans = data_df[is_human]
bots = data_df[is_bot]

# Down-sample both classes to the size of the smaller one.
n = min(len(humans), len(bots))
humans_bal = humans.sample(n, random_state=SEED)
bots_bal = bots.sample(n, random_state=SEED)

# Stack the two balanced halves, then shuffle the rows and renumber the index.
stacked = pd.concat([humans_bal, bots_bal])
shuffled = stacked.sample(frac=1, random_state=SEED)
data_bal = shuffled.reset_index(drop=True)

print("Users after balancing:", len(data_bal))
print(data_bal["label"].value_counts())

# Section 4: Train / Val / Test split (user-level, stratified)

In [None]:
# Stage 1: hold out 20% of the balanced users as the test set.
train_val_df, test_df = train_test_split(
    data_bal,
    stratify=data_bal["label_num"],
    test_size=0.2,
    random_state=SEED,
)

# Stage 2: hold out 20% of the remaining users as the validation set.
train_df, val_df = train_test_split(
    train_val_df,
    stratify=train_val_df["label_num"],
    test_size=0.2,
    random_state=SEED,
)

splits = {"Train": train_df, "Val": val_df, "Test": test_df}

# Sizes first ...
for split_name, split_df in splits.items():
    print(f"{split_name} users:", len(split_df))

# ... then the label distribution of every split, each preceded by a rule.
for split_name, split_df in splits.items():
    space()
    print(f"\n{split_name} label dist:")
    print(split_df["label"].value_counts())

# Section 5: Preparing tokenizer and dataset class

In [None]:
# Tokenizer matching the checkpoint named by MODEL_NAME; shared by all datasets below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class UserTextDataset(Dataset):
    """Map-style dataset that tokenizes one user document per item.

    Args:
        df: frame providing a ``full_text`` column (the document) and a
            ``label_num`` column (cast to int and used as the target).
        tokenizer: callable invoked on each text with truncation enabled,
            padding disabled and ``return_tensors="pt"``.
        max_len: maximum sequence length passed to the tokenizer.
    """

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len   = max_len
        # Plain Python lists so that items are looked up by position.
        self.texts     = df["full_text"].tolist()
        self.labels    = df["label_num"].astype(int).tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the unpadded encoding of document ``idx`` plus its label."""
        document = str(self.texts[idx])
        target   = self.labels[idx]
        encoded  = self.tokenizer(
            document,
            max_length=self.max_len,
            truncation=True,
            padding=False,
            return_tensors="pt"
        )
        # The tokenizer output carries a leading batch axis of size 1;
        # it is removed here so each field is a 1-D tensor.
        sample = {}
        for field, tensor in encoded.items():
            sample[field] = tensor.squeeze(0)
        sample["labels"] = torch.tensor(target, dtype=torch.long)
        return sample

# One dataset per split, all sharing the tokenizer and the length cap.
train_dataset, val_dataset, test_dataset = [
    UserTextDataset(split_df, tokenizer, MAX_SEQ_LENGTH)
    for split_df in (train_df, val_df, test_df)
]

print("Train dataset size:", len(train_dataset))
print("Val dataset size:",   len(val_dataset))
print("Test dataset size:",  len(test_dataset))

# Section 6: Loading RoBERTa model

In [None]:
# Two output classes: label 0 and label 1.
num_labels = 2

# Pretrained encoder with a sequence-classification head sized to num_labels.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

# Move the weights to the device selected in Section 1.
model.to(device)
print("Model loaded on:", device)

# Section 7: Training Arguments and Trainer
We use **HuggingFace's Trainer** to simplify the fine-tuning process.

In [None]:
# The dataset returns unpadded encodings (padding=False), so padding is
# applied per batch by this collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir=str(OUTPUT_DIR / "checkpoints"),
    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    # Reload the checkpoint with the best validation loss when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    logging_steps=50,
    save_total_limit=2,
    # Mixed precision only when a GPU is present.
    fp16=torch.cuda.is_available(),
    # Tie the Trainer's own seeding to the notebook-wide SEED; without this
    # the Trainer falls back to its default seed and ignores a changed SEED.
    seed=SEED,
    report_to=[]
)

def compute_metrics(eval_pred):
    """Compute accuracy and macro F1 from an evaluation prediction pair.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` along axis 1.

    Returns:
        dict with the keys ``"accuracy"`` and ``"macro_f1"``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)
    summary = classification_report(
        labels, predicted, output_dict=True, zero_division=0
    )
    return {
        "accuracy": summary["accuracy"],
        "macro_f1": summary["macro avg"]["f1-score"],
    }

# Wire the model, data, batching and metrics into a single Trainer.
trainer = Trainer(
    # what is trained, and how
    model=model,
    args=training_args,
    # data: training split for optimisation, validation split for evaluation
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # batching and tokenizer
    data_collator=data_collator,
    processing_class=tokenizer,
    # metrics reported at every evaluation
    compute_metrics=compute_metrics,
)

# Section 8: Fine-tuning RoBERTa

In [None]:
%%time

# The fine-tuned model and its tokenizer are saved side by side in one folder.
model_dir = str(OUTPUT_DIR / "roberta_twibot22_model")

print("Starting to train (might take a bit)...")
train_result = trainer.train()

trainer.save_model(model_dir)
tokenizer.save_pretrained(model_dir)

print("Training finished.")

# Section 9: Evaluation on validation and test sets

In [None]:
# Raw model outputs and gold labels for the validation split.
val_preds = trainer.predict(val_dataset)
val_logits, val_labels = val_preds.predictions, val_preds.label_ids

# Softmax over the class axis; column 1 is kept as the score for label 1.
val_class_probs = torch.softmax(torch.tensor(val_logits), dim=-1)
val_probs = val_class_probs.numpy()[:, 1]

print("Validation labels shape:", val_labels.shape)
print("Validation probs shape:", val_probs.shape)

In [None]:
# Candidate decision thresholds: 17 evenly spaced values from 0.1 to 0.9.
thresholds = np.linspace(0.1, 0.9, 17)

# Pass 1: score every threshold on the validation split.
results = []
for cutoff in thresholds:
    predicted = (val_probs >= cutoff).astype(int)
    summary = classification_report(
        val_labels, predicted, output_dict=True, zero_division=0
    )
    f1_macro = summary["macro avg"]["f1-score"]
    recall_bot = summary["1"]["recall"]
    results.append((cutoff, f1_macro, recall_bot))
    print(f"thr={cutoff:.2f} | macro F1={f1_macro:.3f} | bot recall={recall_bot:.3f}")

# Pass 2: keep the first threshold that reaches the highest macro F1
# (strict comparison, so ties go to the lower threshold).
best_thr, best_macro_f1 = 0.5, -1
for cutoff, f1_macro, _ in results:
    if f1_macro > best_macro_f1:
        best_thr, best_macro_f1 = cutoff, f1_macro

print("\nBest threshold on validation:")
print(f"thr={best_thr:.2f} with macro F1={best_macro_f1:.3f}")

In [None]:
# --- Evaluating on validation set --- #
print("Validation set evaluation:")
val_metrics = trainer.evaluate(val_dataset)
print(val_metrics)

# --- Predicting on test set --- #
test_preds = trainer.predict(test_dataset)
test_logits = test_preds.predictions
test_labels = test_preds.label_ids
# Softmax over the class axis; column 1 is kept as the score for label 1.
test_probs = torch.softmax(torch.tensor(test_logits), dim=-1).numpy()[:, 1]

# Apply the threshold that was selected on the validation split.
thr = best_thr
test_pred_labels_thr = (test_probs >= thr).astype(int)

print("Test set classification report (threshold):")
print(classification_report(test_labels, test_pred_labels_thr, digits=3, zero_division=0))

# AUROC is computed from the scores, not from the thresholded labels.
# Only ValueError (e.g. a single class in test_labels) is treated as
# "metric unavailable"; any other error is a bug and should surface.
try:
    auroc = roc_auc_score(test_labels, test_probs)
    print("Test AUROC:", auroc)
except ValueError as e:
    auroc = None
    print("Could not compute AUROC:", e)

# labels=[0, 1] pins the matrix to 2x2 so it always lines up with the
# ["human", "bot"] tick labels used when plotting.
cm = confusion_matrix(test_labels, test_pred_labels_thr, labels=[0, 1])

# Section 10: Confusion matrix and save metrics

In [None]:
# --- Confusion matrix heatmap (rows: true class, columns: predicted class) --- #
plt.figure(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
           xticklabels=["human", "bot"],
           yticklabels=["human","bot"])
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("RoBERTa Twibot-22 - Confusion Matrix (Test)")
plt.tight_layout()
# Save before show() so the written file contains the rendered figure.
plt.savefig(str(OUTPUT_DIR / "confusion_matrix_test.png"))
plt.show()

# --- Saving metrics to JSON/text for the REPORT
# (json is imported in the notebook's top import cell.)
metrics_out = {
    "val_metrics": val_metrics,
    # The test report depends on the validation-selected threshold, so it is
    # stored alongside the report to keep the numbers reproducible.
    "decision_threshold": float(best_thr),
    "test_report": classification_report(test_labels, test_pred_labels_thr, digits=3, zero_division=0, output_dict=True),
    # AUROC is only stored when both classes are present in the test labels.
    "test_auroc": float(roc_auc_score(test_labels, test_probs)) if len(np.unique(test_labels)) == 2 else None
}
with open(str(OUTPUT_DIR / "metrics.json"), "w") as f:
    json.dump(metrics_out, f, indent=2)

print("Metrics saved to:", OUTPUT_DIR / "metrics.json")