In [1]:
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# The classification head is randomly initialised when the model is loaded,
# so seed NumPy and PyTorch up front to make a fresh "Restart & Run All"
# repeatable.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device.upper()}")

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Device: CUDA


In [2]:
# Identifiers handed to the dataset and tokenizer loaders below.
DATASET_ID = "holistic-ai/EMGSD"
CHECKPOINT = "bert-base-uncased"

# Load the dataset and the tokenizer for the chosen checkpoint.
dataset = load_dataset(DATASET_ID)
tokenizer = BertTokenizer.from_pretrained(CHECKPOINT)

def preprocess_function(examples):
    """Tokenize a batch of texts and attach binary labels.

    Args:
        examples: Batch mapping with a "text" entry (passed to the tokenizer)
            and a "label" entry (iterated to build the targets).

    Returns:
        The tokenizer output with an added "labels" list: 1 for string labels
        that start with "stereotype" or equal "related", 0 for everything
        else (including non-string labels).
    """
    def is_positive(raw_label):
        # Only string labels can be positive; any other type maps to 0.
        if not isinstance(raw_label, str):
            return False
        return raw_label.startswith("stereotype") or raw_label == "related"

    # Truncate to 128 tokens; no padding is applied at this stage.
    encoded = tokenizer(examples["text"], truncation=True, max_length=128)
    encoded["labels"] = [
        1 if is_positive(raw_label) else 0 for raw_label in examples["label"]
    ]
    return encoded

# Drop every column of the raw train split so that only the fields returned
# by preprocess_function are kept.
original_columns = dataset["train"].column_names
tokenized_datasets = dataset.map(
    preprocess_function, batched=True, remove_columns=original_columns
)



Map:   0%|          | 0/45760 [00:00<?, ? examples/s]

Map:   0%|          | 0/11441 [00:00<?, ? examples/s]

In [3]:
# Two-class sequence classifier built on the pretrained checkpoint.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model = model.to(device)

# Freeze the BERT backbone to prevent overfitting: every parameter under
# `model.bert` is excluded from gradient computation.
for backbone_param in model.bert.parameters():
    backbone_param.requires_grad = False

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is array-like with
            the class scores on the last axis; if it is a tuple of outputs,
            its first element is used as the logits. ``labels`` holds the
            reference class ids.

    Returns:
        dict with keys ``'accuracy'`` and ``'f1_macro'``.
    """
    logits, labels = eval_pred
    # Predictions may arrive as a tuple of arrays when the model returns more
    # than one output; the logits are the first element.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)

    # Only F1 is reported, so precision, recall and support are discarded.
    # zero_division=0 keeps the score at 0 for a class that is never
    # predicted (the value the default setting yields) without emitting a
    # warning on every evaluation.
    _, _, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, predictions)

    return {
        'accuracy': acc,
        'f1_macro': f1
    }

In [5]:
# Mixed precision is requested only when running on the GPU.
use_fp16 = device == "cuda"

# Optimiser settings.
optimisation_kwargs = dict(
    learning_rate=1e-4,
    num_train_epochs=10,
    weight_decay=0.01,
)

# 8 samples per device per step, with gradients accumulated over 4 steps.
batching_kwargs = dict(
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=8,
)

# Evaluate and checkpoint once per epoch; reload the checkpoint with the
# lowest eval_loss when training ends, keeping at most 2 checkpoints on disk.
selection_kwargs = dict(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_total_limit=2,
)

training_args = TrainingArguments(
    output_dir="./hearts_bert_baseline_frozen",
    fp16=use_fp16,
    logging_steps=50,
    report_to="none",
    **optimisation_kwargs,
    **batching_kwargs,
    **selection_kwargs,
)

In [6]:
# Collator that pads each batch using the tokenizer.
padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Early stopping with a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# NOTE(review): the "test" split serves as the evaluation set, so early
# stopping and best-checkpoint selection are driven by it; confirm that this
# is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=padding_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.6383,0.620668,0.660082,0.424285
2,0.6197,0.610024,0.661568,0.42604
3,0.611,0.608287,0.679486,0.537961
4,0.6114,0.600866,0.678437,0.520655
5,0.599,0.596377,0.672581,0.482579
6,0.6174,0.595942,0.682108,0.541486


KeyboardInterrupt: 

In [None]:
# Evaluate the trainer's current model on its evaluation set and report the
# headline numbers.
final_metrics = trainer.evaluate()
final_macro_f1 = final_metrics['eval_f1_macro']
final_val_loss = final_metrics['eval_loss']
print(f"Final Macro F1: {final_macro_f1:.4f}")
print(f"Final Validation Loss: {final_val_loss:.4f}")

# Persist the evaluated model.
trainer.save_model("./final_baseline_model")