In [3]:
!pip install -q transformers datasets accelerate evaluate scikit-learn




[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import transformers
print(transformers.__version__)


4.57.1


In [5]:
from google.colab import files
uploaded = files.upload()


Saving heldout_pair_data.jsonlist to heldout_pair_data.jsonlist


In [59]:
import json, pandas as pd, re

def clean_text(t):
    t = re.sub(r"http\S+|\[.*?\]\(.*?\)", "", t)
    t = re.sub(r"\\n", " ", t)
    return re.sub(r"\s+", " ", t).strip()

path = "heldout_pair_data.jsonlist"
data = []

with open(path, "r", encoding="utf-8") as f:
    for line in f:
        obj = json.loads(line)
        for grp, label in [("positive", 1), ("negative", 0)]:
            if grp in obj and "comments" in obj[grp]:
                for c in obj[grp]["comments"]:
                    txt = clean_text(c.get("body", ""))

                    if len(txt.split()) > 15:
                        data.append({"text": txt, "label": label})

df = pd.DataFrame(data)
print("Total samples:", len(df))
print(df["label"].value_counts(normalize=True))
df.head()


Total samples: 2127
label
1    0.532205
0    0.467795
Name: proportion, dtype: float64


Unnamed: 0,text,label
0,"In short, the reality is: It adds up. Both of ...",1
1,I agree with /u/huadpe that simply the act of ...,0
2,"It's not about monetary or practical impact, n...",1
3,"I think I see where you are coming from, espec...",0
4,"If you want cold and forests, I'd suggest Cana...",1


In [60]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["label"], random_state=42)

dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "validation": Dataset.from_pandas(val_df),
    "test": Dataset.from_pandas(test_df)
})

print(dataset)


DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 1701
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 213
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 213
    })
})


In [61]:
from transformers import AutoTokenizer

model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=512)

tokenized = dataset.map(tokenize, batched=True)
tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])


Map:   0%|          | 0/1701 [00:00<?, ? examples/s]

Map:   0%|          | 0/213 [00:00<?, ? examples/s]

Map:   0%|          | 0/213 [00:00<?, ? examples/s]

In [62]:
from transformers import AutoModelForSequenceClassification
import torch, numpy as np

class FocalLoss(torch.nn.Module):
    def __init__(self, alpha=0.75, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.ce = torch.nn.CrossEntropyLoss(reduction='none')
    def forward(self, logits, targets):
        ce_loss = self.ce(logits, targets)
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * ((1 - pt) ** self.gamma) * ce_loss
        return focal_loss.mean()

class WeightedModel(torch.nn.Module):
    def __init__(self, model_name, num_labels):
        super().__init__()
        self.base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        self.loss_fn = FocalLoss(alpha=0.75, gamma=2.0)
    def forward(self, input_ids=None, attention_mask=None, labels=None):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)
        return {"loss": loss, "logits": logits}

model = WeightedModel("distilroberta-base", num_labels=2).to("cuda")



config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [63]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average="binary")
    acc = accuracy_score(p.label_ids, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}


In [66]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
import os

os.environ["WANDB_DISABLED"] = "true"
use_fp16 = torch.cuda.is_available()

args = TrainingArguments(
    output_dir="./persuasion_focal_large",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=8,
    weight_decay=0.05,
    warmup_ratio=0.1,
    fp16=use_fp16,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=2,
    logging_steps=50,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)


  trainer = Trainer(


In [67]:
trainer.train()

print("\n✅ Validation Results:")
print(trainer.evaluate(tokenized["validation"]))

print("\n✅ Test Results:")
print(trainer.evaluate(tokenized["test"]))


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1071,0.180001,0.558685,0.576923,0.657895,0.614754
2,0.1097,0.374307,0.600939,0.643564,0.570175,0.604651
3,0.0772,0.596543,0.610329,0.647619,0.596491,0.621005
4,0.0568,0.97742,0.558685,0.727273,0.280702,0.405063
5,0.0334,0.901074,0.58216,0.654321,0.464912,0.54359
6,0.0034,0.978607,0.591549,0.631068,0.570175,0.599078



✅ Validation Results:


{'eval_loss': 0.5965434908866882, 'eval_accuracy': 0.6103286384976526, 'eval_precision': 0.6476190476190476, 'eval_recall': 0.5964912280701754, 'eval_f1': 0.6210045662100456, 'eval_runtime': 0.9934, 'eval_samples_per_second': 214.419, 'eval_steps_per_second': 54.36, 'epoch': 6.0}

✅ Test Results:
{'eval_loss': 0.6431962251663208, 'eval_accuracy': 0.5492957746478874, 'eval_precision': 0.5765765765765766, 'eval_recall': 0.5663716814159292, 'eval_f1': 0.5714285714285714, 'eval_runtime': 0.9418, 'eval_samples_per_second': 226.171, 'eval_steps_per_second': 57.339, 'epoch': 6.0}
