In [None]:
import numpy as np
import torch
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          DataCollatorWithPadding, Trainer, TrainingArguments, AutoConfig)
from peft import LoraConfig, TaskType, get_peft_model
import re
from datasets import load_dataset, concatenate_datasets, Features, Value
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from torch import nn


def set_seed(seed: int = 8):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's built-in ``random`` module, NumPy's global generator, the
    torch CPU generator and the generators of all CUDA devices.

    Args:
        seed: Integer seed applied to each generator.
    """
    # Function-local import keeps this definition self-contained; the stdlib
    # `random` module is not imported at the top of the notebook.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(8)

# Prefer the GPU but fall back to the CPU so the notebook still runs on a
# machine without CUDA. Later cells move tensors and models with
# `.to(device)` and enable fp16 only when this value is "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"

# Experiment switches read by the later cells.
model = "roberta"  # "finbert" or "roberta"
strategy = "FT"   # "LoRA" or "FT"

In [None]:
def clean_text(text):
    """Strip URLs, @mentions, #hashtags and digit runs, then tidy whitespace.

    Args:
        text: Raw string to clean.

    Returns:
        The cleaned string with whitespace runs collapsed to single spaces
        and no leading or trailing whitespace.
    """
    # Applied in this order: URLs go first so that any '@', '#' or digits
    # inside a URL are removed together with it.
    removal_patterns = (
        r'http\S+',  # URLs
        r'@\w+',     # mentions
        r'#\w+',     # hashtags
        r'\d+',      # numbers
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    # Collapse whitespace runs, then trim both ends.
    return re.sub(r'\s+', ' ', text).strip()

# Build the auxiliary pool: the train split of every listed tweet_eval config,
# with its own label column dropped, the text cleaned, and every row relabeled 3.
feats = Features({"text": Value("string"), "label": Value("int64")})
tweet_eval_configs = [
    'emoji', 'emotion', 'hate', 'irony', 'offensive', 'sentiment',
    'stance_abortion', 'stance_atheism', 'stance_climate',
    'stance_feminist', 'stance_hillary',
]


def _clean_and_mark_other(row):
    """Return the cleaned text of a row together with the constant label 3."""
    return {"text": clean_text(row["text"]), "label": 3}


aux_parts = []
for config_name in tweet_eval_configs:
    part = load_dataset("cardiffnlp/tweet_eval", config_name)["train"]
    # Cast to a common schema so all parts can be concatenated.
    part = part.remove_columns("label").map(_clean_and_mark_other).cast(feats)
    aux_parts.append(part)
ds_ax_clean = concatenate_datasets(aux_parts)


# Keep at most 70000 shuffled auxiliary rows.
n_aux_rows = min(70000, len(ds_ax_clean))
sampled_70000 = ds_ax_clean.shuffle(seed=8).select(range(n_aux_rows))

# Financial tweets: clean the text, keep the dataset's own labels.
ds_fin = load_dataset("zeroshot/twitter-financial-news-sentiment")
ds_fin = ds_fin.map(lambda x: {"text": clean_text(x["text"])})

# Pool both financial splits with the auxiliary sample; the notebook re-splits this pool itself.
fin_parts = [ds_fin[split_name] for split_name in ("train", "validation")]
s2_full_ds = concatenate_datasets(fin_parts + [sampled_70000])



# Outer split: 90% train(+val) / 10% test.
split_test = s2_full_ds.train_test_split(train_size=0.90, seed=8)
s2_train_full_ds, s2_test_ds = split_test["train"], split_test["test"]

# Inner split of the 90% portion: 88% train / 12% validation.
split_val = s2_train_full_ds.train_test_split(train_size=0.88, seed=8)
s2_train_ds, s2_val_ds = split_val["train"], split_val["test"]


def _to_stage1_label(ex):
    """Collapse labels 0, 1 and 2 into class 0; any other label becomes class 1."""
    return {"label": 0 if ex["label"] in (0, 1, 2) else 1}


# Stage-1 (binary) views of every stage-2 dataset.
s1_full_ds = s2_full_ds.map(_to_stage1_label)
s1_train_full_ds = s2_train_full_ds.map(_to_stage1_label)
s1_train_ds = s2_train_ds.map(_to_stage1_label)
s1_val_ds = s2_val_ds.map(_to_stage1_label)
s1_test_ds = s2_test_ds.map(_to_stage1_label)




In [None]:
# Map the short `model` switch to its Hugging Face checkpoint name.
_CHECKPOINTS = {
    "finbert": "ProsusAI/finbert",
    "roberta": "roberta-base",
}
if model not in _CHECKPOINTS:
    # Fail here with a clear message instead of a NameError on MODEL_NAME below.
    raise ValueError(
        f"Unknown model {model!r}; expected one of {sorted(_CHECKPOINTS)}"
    )
MODEL_NAME = _CHECKPOINTS[model]
OUTDIR = "out/exp1"  # NOTE(review): not referenced by the cells visible here
MAX_LEN = 128  # token limit passed to the tokenizer; longer inputs are truncated

tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def preprocess(batch):
    """Tokenize the ``text`` entries of a batch, truncating to ``MAX_LEN`` tokens.

    Padding is not applied here; it is left to the data collator.
    """
    texts = batch["text"]
    return tok(texts, truncation=True, max_length=MAX_LEN)

# Tokenize every stage-1 split (same order as listed) and drop the raw text column.
s1_train_tok, s1_val_tok, s1_test_tok, s1_train_full_tok = (
    ds.map(preprocess, batched=True, remove_columns=["text"])
    for ds in (s1_train_ds, s1_val_ds, s1_test_ds, s1_train_full_ds)
)

# Only these two splits are switched to torch tensors.
for split in (s1_test_tok, s1_train_full_tok):
    split.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Pads each batch dynamically at collation time.
collator = DataCollatorWithPadding(tokenizer=tok)

from collections import Counter

# Stage 1 is binary: class 0 vs class 1.
NUM_LABELS = 2
counts = Counter(s1_train_full_ds["label"])

# Inverse-frequency class weights, normalized so that their mean is 1.
total = sum(counts[c] for c in range(NUM_LABELS))
# max(..., 1) guards against division by zero for a class with no examples.
inv_freq = [total / max(counts[i], 1) for i in range(NUM_LABELS)]
weights = torch.tensor(inv_freq, dtype=torch.float32)
weights = weights / weights.mean()
# Derive the printed label ids from NUM_LABELS so the message matches the
# actual number of classes.
label_ids = ",".join(str(i) for i in range(NUM_LABELS))
print(f"class weights ({label_ids}):", weights.tolist())
# NOTE(review): the trainer cell below is given `w`, not `weights`.
weights = weights.to(device)


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from a (logits, labels) pair.

    Args:
        eval_pred: Two-item object unpacked as ``(logits, labels)``.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"macro_f1"``.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit.
    predicted = scores.argmax(axis=-1)
    metrics = {}
    metrics["accuracy"] = accuracy_score(gold, predicted)
    metrics["macro_f1"] = f1_score(gold, predicted, average="macro")
    return metrics




In [None]:
# Inverse-frequency class weights for the stage-1 loss, normalized to mean 1.
counts = np.bincount(s1_train_full_ds["label"], minlength=NUM_LABELS)
_inv_freq_s1 = counts.sum() / counts.clip(min=1)  # clip avoids division by zero
w = torch.tensor(_inv_freq_s1 / _inv_freq_s1.mean(), dtype=torch.float32)

class CustomTrainer(Trainer):
    """``Trainer`` whose loss is cross-entropy with optional per-class weights."""

    def __init__(self, class_weights=None, *args, **kwargs):
        """Forward all other arguments to ``Trainer`` and keep the class weights.

        Args:
            class_weights: Optional tensor of per-class loss weights; ``None``
                means an unweighted loss.
        """
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the cross-entropy loss (and the model outputs if requested).

        Extra keyword arguments are accepted and ignored.
        """
        # Labels are taken out of `inputs` so the model is called without them.
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Move the weights to the logits' device; None selects the unweighted loss.
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)

        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        if return_outputs:
            return loss, outputs
        return loss



# Optimisation hyperparameters.
BATCH_SIZE = 16
EPOCHS = 5
WEIGHT_DECAY = 0.01

# LoRA hyperparameters (only used when strategy == "LoRA").
LORA_R = 8
LORA_ALPHA = 32
LORA_DROPOUT = 0.10

# Load the sequence-classification model with a NUM_LABELS-way head.
if model == "finbert":
    config = AutoConfig.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

    base = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        config=config,
        # Allows loading when the checkpoint's head size differs from NUM_LABELS.
        ignore_mismatched_sizes=True,
        # NOTE(review): re-downloads the checkpoint on every run — confirm this is intended.
        force_download=True,
    )
elif model == "roberta":
    base = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_LABELS,
    )
else:
    # Without this branch `base` would be undefined (or a stale object from
    # an earlier run of the cell) for an unsupported `model` value.
    raise ValueError(f"Unknown model {model!r}; expected 'finbert' or 'roberta'")

# Optionally wrap the model with LoRA adapters.
if strategy == "LoRA":
    lora_cfg = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=LORA_R, lora_alpha=LORA_ALPHA, lora_dropout=LORA_DROPOUT,
        target_modules=["query", "key", "value", "dense"]
    )
    base = get_peft_model(base, lora_cfg)
    base.print_trainable_parameters()

import torch
base.to(device)

# Hyperparams (tweak if needed)
# Arguments shared by both fine-tuning strategies.
_shared_training_kwargs = dict(
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=max(32, BATCH_SIZE),
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    logging_steps=50,
    seed=8,
    fp16=(device == "cuda"),  # mixed precision only when running on CUDA
    report_to="none",
)

if strategy == "LoRA":
    LR = 1e-4
    training_args = TrainingArguments(
        output_dir="out/twitter-finance",
        learning_rate=LR,
        **_shared_training_kwargs,
    )
elif strategy == "FT":
    LR = 2e-5
    training_args = TrainingArguments(
        output_dir="out/twitter-finance_fullft",
        learning_rate=LR,
        warmup_ratio=0.1,
        greater_is_better=True,
        **_shared_training_kwargs,
    )
else:
    # Without this branch `training_args` would be undefined (or stale from
    # an earlier run of the cell) for an unsupported `strategy` value.
    raise ValueError(f"Unknown strategy {strategy!r}; expected 'LoRA' or 'FT'")

# Stage-1 trainer: fits on the full stage-1 training set with the weighted
# loss; no evaluation set is attached during training.
trainer = CustomTrainer(
    class_weights=w,
    model=base,
    args=training_args,
    train_dataset=s1_train_full_tok,
    eval_dataset=None,
    tokenizer=tok,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

trainer.train()
print("global_step:", trainer.state.global_step)

# Score the held-out test split; the bare name displays the metrics dict.
test_metrics = trainer.evaluate(eval_dataset=s1_test_tok)
test_metrics


In [None]:
# Stage-1 predictions on the test split.
pred = trainer.predict(s1_test_tok)
logits, y_true = pred.predictions, pred.label_ids

# The predicted class is the index of the largest logit.
y_pred = logits.argmax(axis=-1)


In [None]:
# Stage 2 trains only on rows whose label is 0, 1 or 2.
s2_train_full_ds = s2_train_full_ds.filter(lambda ex: ex["label"] in (0,1,2))

# Attach the stage-1 prediction and a positional id to every test row
# (the test rows keep their original stage-2 labels).
s1_test_ds_preds = (
    s2_test_ds
    .add_column("s1_pred", y_pred.tolist())
    .add_column("idx", list(range(len(s2_test_ds))))
)

# Only rows that stage 1 predicted as class 0 are passed on to stage 2.
s2_test_ds_preds = s1_test_ds_preds.filter(lambda ex: ex["s1_pred"] == 0)
s2_test_ds_new = s2_test_ds_preds.remove_columns(["s1_pred", "idx"])

In [None]:
def preprocess(batch):
    """Tokenize the ``text`` entries of a batch, truncating to ``MAX_LEN`` tokens."""
    # NOTE(review): same body as the `preprocess` defined in the stage-1
    # tokenization cell; this definition shadows it.
    texts = batch["text"]
    return tok(texts, truncation=True, max_length=MAX_LEN)


# Tokenize the stage-2 test and training sets (in that order) and drop the raw text.
s2_test_tok, s2_train_full_tok = (
    ds.map(preprocess, batched=True, remove_columns=["text"])
    for ds in (s2_test_ds_new, s2_train_full_ds)
)

# Expose only the model inputs and the label, as torch tensors.
for split in (s2_test_tok, s2_train_full_tok):
    split.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Pads each batch dynamically at collation time.
collator = DataCollatorWithPadding(tokenizer=tok)

# Stage 2 is a three-way problem over labels 0, 1 and 2.
NUM_LABELS = 3
counts = Counter(s2_train_full_ds["label"])

# Inverse-frequency class weights, normalized so that their mean is 1.
total = sum(counts[label] for label in range(NUM_LABELS))
# max(..., 1) guards against division by zero for a class with no examples.
inv_freq = [total / max(counts[label], 1) for label in range(NUM_LABELS)]
weights = torch.tensor(inv_freq, dtype=torch.float32)
weights = weights.div(weights.mean())
print("class weights (0,1,2):", weights.tolist())
weights = weights.to(device)


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from a (logits, labels) pair.

    Returns a dict with the keys ``"accuracy"`` and ``"macro_f1"``.
    """
    # NOTE(review): same body as the `compute_metrics` defined in the stage-1
    # cell; this definition shadows it.
    scores, gold = eval_pred
    predicted = scores.argmax(axis=-1)
    accuracy = accuracy_score(gold, predicted)
    macro_f1_score = f1_score(gold, predicted, average="macro")
    return {"accuracy": accuracy, "macro_f1": macro_f1_score}

In [None]:
# Inverse-frequency class weights for the stage-2 loss, normalized to mean 1.
counts = np.bincount(s2_train_full_ds["label"], minlength=NUM_LABELS)
_inv_freq_s2 = counts.sum() / counts.clip(min=1)  # clip avoids division by zero
w = torch.tensor(_inv_freq_s2 / _inv_freq_s2.mean(), dtype=torch.float32)

class CustomTrainer(Trainer):
    """``Trainer`` subclass that uses a (optionally class-weighted) cross-entropy loss.

    NOTE(review): this repeats the ``CustomTrainer`` defined in the stage-1
    training cell and shadows it.
    """

    def __init__(self, class_weights=None, *args, **kwargs):
        """Store ``class_weights`` and pass everything else on to ``Trainer``."""
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute cross-entropy over the logits; extra kwargs are ignored.

        Returns ``(loss, outputs)`` when ``return_outputs`` is true, otherwise
        just ``loss``.
        """
        # The model is called without labels; the loss is computed here.
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        if self.class_weights is not None:
            # Keep the weights on the same device as the logits.
            loss_fct = nn.CrossEntropyLoss(weight=self.class_weights.to(logits.device))
        else:
            loss_fct = nn.CrossEntropyLoss()

        flat_logits = logits.view(-1, logits.size(-1))
        flat_labels = labels.view(-1)
        loss = loss_fct(flat_logits, flat_labels)
        return (loss, outputs) if return_outputs else loss



# Optimisation hyperparameters.
BATCH_SIZE = 16
EPOCHS = 5
WEIGHT_DECAY = 0.01

# LoRA hyperparameters (only used when strategy == "LoRA").
LORA_R = 8
LORA_ALPHA = 32
LORA_DROPOUT = 0.10

# Load a fresh sequence-classification model with a NUM_LABELS-way head.
if model == "finbert":
    config = AutoConfig.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

    base = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        config=config,
        # Allows loading when the checkpoint's head size differs from NUM_LABELS.
        ignore_mismatched_sizes=True,
        # NOTE(review): re-downloads the checkpoint on every run — confirm this is intended.
        force_download=True,
    )
elif model == "roberta":
    base = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_LABELS,
    )
else:
    # Without this branch `base` would silently remain the previously built
    # model for an unsupported `model` value.
    raise ValueError(f"Unknown model {model!r}; expected 'finbert' or 'roberta'")

# Optionally wrap the model with LoRA adapters.
if strategy == "LoRA":
    lora_cfg = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=LORA_R, lora_alpha=LORA_ALPHA, lora_dropout=LORA_DROPOUT,
        target_modules=["query", "key", "value", "dense"]
    )
    base = get_peft_model(base, lora_cfg)
    base.print_trainable_parameters()


base.to(device)

# Arguments shared by both fine-tuning strategies.
_shared_training_kwargs = dict(
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=max(32, BATCH_SIZE),
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    logging_steps=50,
    seed=8,
    fp16=(device == "cuda"),  # mixed precision only when running on CUDA
    report_to="none",
)

# Stage 2 writes to its own output directories. The stage-1 training cell
# uses "out/twitter-finance" / "out/twitter-finance_fullft"; reusing those
# paths here would let stage-2 checkpoints overwrite stage-1 checkpoints
# that share a step number.
if strategy == "LoRA":
    LR = 1e-4
    training_args = TrainingArguments(
        output_dir="out/twitter-finance_stage2",
        learning_rate=LR,
        **_shared_training_kwargs,
    )
elif strategy == "FT":
    LR = 2e-5
    training_args = TrainingArguments(
        output_dir="out/twitter-finance_fullft_stage2",
        learning_rate=LR,
        warmup_ratio=0.1,
        greater_is_better=True,
        **_shared_training_kwargs,
    )
else:
    # Without this branch `training_args` would silently remain the object
    # built for the previous stage for an unsupported `strategy` value.
    raise ValueError(f"Unknown strategy {strategy!r}; expected 'LoRA' or 'FT'")

# Stage-2 trainer: fits on the filtered stage-2 training set with the
# weighted loss; no evaluation set is attached during training.
trainer = CustomTrainer(
    class_weights=w,
    model=base,
    args=training_args,
    train_dataset=s2_train_full_tok,
    eval_dataset=None,
    tokenizer=tok,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

trainer.train()
print("global_step:", trainer.state.global_step)





In [None]:
# Predict without labels. These rows were selected by the stage-1 prediction,
# not by their true label, so presumably some still carry label 3, which the
# three-class stage-2 loss could not score — TODO confirm.
pred_s2_test_tok = s2_test_tok.remove_columns(["label"])

In [None]:
# Stage-2 predictions for the rows routed on by stage 1.
pred = trainer.predict(pred_s2_test_tok)
# NOTE(review): the label column was removed from the input, so `label_ids`
# is presumably None here; `y_true` is reassigned before the final metrics.
logits, y_true = pred.predictions, pred.label_ids

# The predicted class is the index of the largest logit.
y_pred = logits.argmax(axis=-1)


In [None]:
# Attach the stage-2 predictions to the routed rows (which still carry `idx`).
stage2_predictions = y_pred.tolist()
s2_test_ds_preds = s2_test_ds_preds.add_column("s2_pred", stage2_predictions)

In [None]:
def _stage1_other_to_label3(ex):
    """Rewrite a stage-1 prediction of 1 as label 3; leave any other value as is."""
    return {"s1_pred": 3 if ex["s1_pred"] == 1 else ex["s1_pred"]}


# Express stage-1 predictions in the four-label space.
s1_fixed = s1_test_ds_preds.map(_stage1_other_to_label3)

# Row id -> stage-2 prediction, for the rows that went through stage 2.
s2_lookup = {
    row_id: s2_label
    for row_id, s2_label in zip(s2_test_ds_preds["idx"], s2_test_ds_preds["s2_pred"])
}


def replace_with_s2(batch):
    """Overwrite ``s1_pred`` with the stage-2 prediction where one exists.

    Args:
        batch: Batched rows providing the ``idx`` and ``s1_pred`` columns.

    Returns:
        Dict with a new ``s1_pred`` list: the ``s2_lookup`` value for rows whose
        ``idx`` is in the lookup, otherwise the existing ``s1_pred``; all as ints.
    """
    merged = [
        int(s2_lookup.get(row_id, stage1_label))
        for row_id, stage1_label in zip(batch["idx"], batch["s1_pred"])
    ]
    return {"s1_pred": merged}

s1_merged = s1_fixed.map(replace_with_s2, batched=True)

In [None]:
# Copy the merged two-stage prediction into its own `final_pred` column.
merged_predictions = s1_merged["s1_pred"]
final = s1_merged.add_column("final_pred", merged_predictions)

In [None]:
# Gold labels and final pipeline predictions as integer arrays.
y_true = np.array(list(map(int, final["label"])), dtype=int)
y_pred = np.array(list(map(int, final["final_pred"])), dtype=int)

# Metrics of the whole two-stage pipeline on the test split.
acc = accuracy_score(y_true, y_pred)
macro_f1 = f1_score(y_true, y_pred, average="macro")

In [None]:
# Accuracy followed by macro-F1 of the two-stage pipeline.
print(acc, macro_f1, sep=" ")

In [None]:
# Confusion matrix over the label ids 0..3.
all_labels = list(range(4))
cm = confusion_matrix(y_true, y_pred, labels=all_labels)
print(cm)