<a href="https://colab.research.google.com/github/Abdulrasheed1729/polar-semeval-2026/blob/main/task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install git+https://github.com/Abdulrasheed1729/polar-semeval-2026.git --quiet

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
import pandas as pd

from sklearn.metrics import recall_score, precision_score, f1_score
import numpy as np
from polar_semeval.semeval_dataset import SemEvalDataset, LanguageType

import torch

from sklearn.metrics import f1_score

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    RobertaForSequenceClassification
)
from torch.utils.data import Dataset

In [4]:
import wandb

# Initialise Weights & Biases in "disabled" mode so this notebook does not log runs.
wandb.init(mode="disabled")

In [5]:
from sklearn.model_selection import train_test_split

# Labelled data for subtask 2, ENG language key.
dataset = SemEvalDataset(lang_key=LanguageType.ENG, subtask=2)

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical across reruns.
train, val = train_test_split(
    dataset.dataframe,
    random_state=42,
    test_size=0.2,
)

In [6]:
class PolarizationDataset(torch.utils.data.Dataset):
  """Map-style dataset that tokenizes one text per item and attaches its label.

  Args:
    texts: Indexable collection of raw input strings.
    labels: Indexable collection aligned with ``texts``; each entry must be
      something ``torch.tensor`` accepts (a scalar or a per-class sequence).
    tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
      padding=False, max_length=..., return_tensors='pt')`` that returns a
      mapping of tensors carrying a leading batch axis of size 1.
    max_length: Truncation limit forwarded to the tokenizer.
    dtype: Torch dtype of the returned label tensor.
  """

  def __init__(self, texts, labels, tokenizer, max_length=128, dtype=torch.long):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_length = max_length
    self.dtype = dtype

  def __len__(self):
    """Number of examples, taken from ``texts``."""
    return len(self.texts)

  def __getitem__(self, idx):
    """Return the tokenizer outputs for item ``idx`` plus a ``'labels'`` tensor."""
    text = self.texts[idx]
    label = self.labels[idx]
    # Items are left unpadded here; batching is expected to pad them.
    encoding = self.tokenizer(
      text,
      truncation=True,
      padding=False,
      max_length=self.max_length,
      return_tensors='pt',
    )

    # Drop only the leading batch axis. A bare squeeze() would also collapse a
    # one-token sequence into a 0-d tensor, which cannot be padded into a batch.
    item = {key: encoding[key].squeeze(0) for key in encoding.keys()}
    item['labels'] = torch.tensor(label, dtype=self.dtype)
    return item

## Model Training

In [28]:
# Load the tokenizer for the checkpoint being fine-tuned.
# MODEL_NAME is reused further down when the classification model is loaded.
MODEL_NAME = 'FacebookAI/roberta-large'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [29]:
# Label columns, in the order the model's five outputs will follow.
target_columns = ['gender/sexual', 'political', 'religious', 'racial/ethnic', 'other']

# Build both splits the same way: raw texts plus float multi-hot label rows.
train_dataset, val_dataset = (
    PolarizationDataset(
        frame['text'].tolist(),
        frame[target_columns].values.tolist(),
        tokenizer,
        dtype=torch.float,
    )
    for frame in (train, val)
)


In [30]:
# Sequence-classification model with five outputs, configured for the
# multi-label problem type (one independent decision per label).
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    problem_type="multi_label_classification",
    num_labels=5,
)

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
def compute_metrics_multilabel(p):
    """Compute macro-averaged F1 for multi-label predictions.

    Args:
        p: Object exposing ``predictions`` (a NumPy array of raw logits) and
            ``label_ids`` (the reference labels, passed straight to
            ``f1_score``).

    Returns:
        ``{'f1_macro': <float>}``.
    """
    # Sigmoid turns each logit into an independent per-label probability.
    probs = torch.sigmoid(torch.from_numpy(p.predictions))
    # A label is predicted when its probability exceeds 0.5.
    preds = (probs > 0.5).int().numpy()
    # zero_division=0 gives the same score as the default (0 for a label with
    # no positive truth or prediction) but without the undefined-metric
    # warning, which otherwise fires when predicting on all-zero placeholder
    # labels.
    return {
        'f1_macro': f1_score(p.label_ids, preds, average='macro', zero_division=0)
    }

In [32]:
# Training configuration for the baseline run.
training_args = TrainingArguments(
    output_dir="./",
    num_train_epochs=5,
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,  # higher F1 is better
    logging_steps=100,
    warmup_ratio=0.1,
    weight_decay=0.01,
    disable_tqdm=False,
    save_total_limit=2,  # cap the number of checkpoints kept on disk
    # Do not attach any logging integration. Without this the Trainer still
    # hooks into Weights & Biases even though it was initialised as disabled,
    # which produced a ConfigError message during training.
    report_to="none",
)

In [33]:
# Dataset items are tokenized with padding=False, so batching relies on this
# collator to pad them.
padding_collator = DataCollatorWithPadding(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=padding_collator,
    compute_metrics=compute_metrics_multilabel,
)

# Fine-tune the model.
trainer.train()

A ConfigError was raised whilst setting the number of model parameters in Weights & Biases config.


Epoch,Training Loss,Validation Loss,F1 Macro
1,0.3764,0.219226,0.143158
2,0.1826,0.183437,0.273189
3,0.1538,0.213291,0.382524
4,0.0875,0.233038,0.383352
5,0.0651,0.255287,0.388691


TrainOutput(global_step=810, training_loss=0.1613226377669676, metrics={'train_runtime': 832.2154, 'train_samples_per_second': 15.483, 'train_steps_per_second': 0.973, 'total_flos': 1025122149322542.0, 'train_loss': 0.1613226377669676, 'epoch': 5.0})

In [37]:
# Score the validation split with the trainer's current model.
eval_results = trainer.evaluate()
macro_f1 = eval_results['eval_f1_macro']
print(f"Macro F1 score on validation set for Subtask 2: {macro_f1}")

Macro F1 score on validation set for Subtask 2: 0.3886905204291248


In [38]:

# Dev split of subtask 2; only its 'text' and 'id' columns are used here.
unlabeled_dataset_df = SemEvalDataset(subtask=2, lang_key=LanguageType.ENG, split='dev')

texts, ids = (
    unlabeled_dataset_df.dataframe[column].tolist()
    for column in ('text', 'id')
)

# All-zero placeholder labels so the dataset class can be reused for inference.
labels = np.zeros((len(texts), 5))

unlabeled_dataset = PolarizationDataset(texts, labels, tokenizer, dtype=torch.float)

In [39]:
from scipy.special import expit

# Order of the label columns the model was trained on (the column selection
# used to build train_dataset / val_dataset): output i is the logit for
# training_label_order[i].
training_label_order = ['gender/sexual', 'political', 'religious', 'racial/ethnic', 'other']
# Column order written to the predictions file.
label_names = ['political', 'racial/ethnic', 'religious', 'gender/sexual', 'other']

print("Making predictions...")
predictions = trainer.predict(unlabeled_dataset)

# Multi-label decision: independent sigmoid per label, thresholded at 0.5.
pred_probs = expit(predictions.predictions)
pred_binary = (pred_probs > 0.5).astype(int)

# Name the columns by the training order first, then reorder for output.
# Assigning positions 0..4 straight to label_names would write the
# gender/sexual output under 'political', the political output under
# 'racial/ethnic', and the racial/ethnic output under 'gender/sexual'.
results_df = pd.DataFrame(pred_binary, columns=training_label_order)[label_names]
results_df.insert(0, 'id', ids)

# Save to CSV
results_df.to_csv('predictions.csv', index=False)

print(f"✓ Predictions saved to predictions.csv")
print(f"Total predictions: {len(results_df)}")
print(f"\nFirst few predictions:")
print(results_df.head(10))

Making predictions...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


✓ Predictions saved to predictions.csv
Total predictions: 160

First few predictions:
                                     id  political  racial/ethnic  religious  \
0  eng_f66ca14d60851371f9720aaf4ccd9b58          0              0          0   
1  eng_3a489aa7fed9726aa8d3d4fe74c57efb          0              0          0   
2  eng_95770ff547ea5e48b0be00f385986483          0              0          0   
3  eng_2048ae6f9aa261c48e6d777bcc5b38bf          0              0          0   
4  eng_07781aa88e61e7c0a996abd1e5ea3a20          0              0          0   
5  eng_153d96f9dc27f0602c927223404d94b5          0              0          0   
6  eng_4ab5a4cc5c87d0af9cf4b80c301647bf          0              0          0   
7  eng_e75a95ba52930d6d72d503ab9469eb29          0              0          0   
8  eng_eb8fab668668f9959cafdecbfc0f081a          0              0          0   
9  eng_702724dc168d600e788d775c8e651f36          0              0          0   

   gender/sexual  other  
0      

## Novel Solution

In [None]:
!pip install git+https://github.com/Abdulrasheed1729/polar-semeval-2026.git --quiet
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split
from scipy.special import expit

from polar_semeval.semeval_dataset import SemEvalDataset, LanguageType

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)

import wandb
# Initialise Weights & Biases in "disabled" mode; the TrainingArguments further
# down additionally pass report_to="none".
wandb.init(mode="disabled")

# ============================================================================
# KEY CHANGES FOR BETTER GENERALIZATION
# ============================================================================

# CHANGE 1: switch to a smaller checkpoint
# roberta-base: 125M params instead of 355M (roberta-large)
MODEL_NAME = 'roberta-base'
# Other checkpoints to try:
#   'distilroberta-base'          - even smaller: 82M params
#   'microsoft/deberta-v3-base'   - often better generalization

# CHANGE 2: stratify the train/validation split across all label columns
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

dataset = SemEvalDataset(subtask=2, lang_key=LanguageType.ENG)

label_columns = ['gender/sexual', 'political', 'religious', 'racial/ethnic', 'other']
X = dataset.dataframe['text'].values
y = dataset.dataframe[label_columns].values

# One 80/20 shuffle split, seeded for repeatability.
msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(msss.split(X, y))

# Materialise both partitions with a fresh 0..n-1 index.
train, val = (
    dataset.dataframe.iloc[rows].reset_index(drop=True)
    for rows in (train_idx, val_idx)
)

# Report split sizes, then per-label positive counts for each split.
for heading, frame in (("Training", train), ("Validation", val)):
    print(f"{heading} samples: {len(frame)}")
for split_label, frame in (("train", train), ("val", val)):
    print(f"Label distribution in {split_label}:\n{frame[label_columns].sum()}")

# ============================================================================
# Dataset class remains the same
# ============================================================================

class PolarizationDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per item and attaches its label.

    Args:
        texts: Indexable collection of raw input strings.
        labels: Indexable collection aligned with ``texts``; each entry must be
            something ``torch.tensor`` accepts (a scalar or a per-class
            sequence).
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding=False, max_length=..., return_tensors='pt')`` that returns
            a mapping of tensors carrying a leading batch axis of size 1.
        max_length: Truncation limit forwarded to the tokenizer.
        dtype: Torch dtype of the returned label tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128, dtype=torch.long):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.dtype = dtype

    def __len__(self):
        """Number of examples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for item ``idx`` plus a ``'labels'`` tensor."""
        text = self.texts[idx]
        label = self.labels[idx]
        # Items are left unpadded here; batching is expected to pad them.
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors='pt'
        )
        # Drop only the leading batch axis. A bare squeeze() would also collapse
        # a one-token sequence into a 0-d tensor, which cannot be padded into a
        # batch.
        item = {key: encoding[key].squeeze(0) for key in encoding.keys()}
        item['labels'] = torch.tensor(label, dtype=self.dtype)
        return item

# ============================================================================
# CHANGE 3: Configure model with more regularization
# ============================================================================

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Start from the checkpoint's own config and override it before the weights
# are loaded.
config = AutoConfig.from_pretrained(MODEL_NAME)
# Five outputs, multi-label problem type.
config.num_labels = 5
config.problem_type = "multi_label_classification"
# Heavier dropout than the 0.1 default, as extra regularization.
config.hidden_dropout_prob = 0.3
config.attention_probs_dropout_prob = 0.3

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)

# Build both splits the same way: raw texts plus float multi-hot label rows.
train_dataset, val_dataset = (
    PolarizationDataset(
        frame['text'].tolist(),
        frame[label_columns].values.tolist(),
        tokenizer,
        dtype=torch.float,
    )
    for frame in (train, val)
)

# ============================================================================
# Metrics function
# ============================================================================

def compute_metrics_multilabel(p):
    """Compute macro- and micro-averaged F1 for multi-label predictions.

    Args:
        p: Object exposing ``predictions`` (a NumPy array of raw logits) and
            ``label_ids`` (the reference labels, passed straight to
            ``f1_score``).

    Returns:
        ``{'f1_macro': <float>, 'f1_micro': <float>}``.
    """
    # Sigmoid turns each logit into an independent per-label probability;
    # a label is predicted when that probability exceeds 0.5.
    probs = torch.sigmoid(torch.from_numpy(p.predictions))
    preds = (probs > 0.5).int().numpy()

    # zero_division=0 gives the same scores as the default (0 for a label with
    # no positive truth or prediction) but without the undefined-metric
    # warning, which otherwise fires when predicting on all-zero placeholder
    # labels.
    f1_macro = f1_score(p.label_ids, preds, average='macro', zero_division=0)
    f1_micro = f1_score(p.label_ids, preds, average='micro', zero_division=0)

    return {
        'f1_macro': f1_macro,
        'f1_micro': f1_micro
    }

# ============================================================================
# CHANGE 4: More conservative training arguments
# ============================================================================

training_args = TrainingArguments(
    # Run bookkeeping
    output_dir="./checkpoints",
    seed=42,
    report_to="none",  # no external experiment tracker
    disable_tqdm=False,

    # Optimisation: fewer epochs (3, was 5) and a lower peak LR (2e-5, was 3e-5)
    num_train_epochs=3,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    # Evaluate, checkpoint and log on the same 50-step cadence
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    logging_strategy="steps",
    logging_steps=50,

    # Keep at most two checkpoints and reload the one with the best macro F1
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
)

# ============================================================================
# CHANGE 5: Add early stopping
# ============================================================================

# Stop once 3 consecutive evaluations fail to improve the tracked metric by
# at least 0.0001.
early_stopping = EarlyStoppingCallback(
    early_stopping_threshold=0.0001,
    early_stopping_patience=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    callbacks=[early_stopping],
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics_multilabel,
)

# ============================================================================
# Training
# ============================================================================

# 50-character horizontal rule used to frame the console sections below.
rule = "=" * 50

print("\n" + rule)
print("Starting training with improved configuration")
print(rule + "\n")

trainer.train()

# ============================================================================
# Evaluation
# ============================================================================

eval_results = trainer.evaluate()
macro_f1 = eval_results['eval_f1_macro']
micro_f1 = eval_results['eval_f1_micro']

print("\n" + rule)
print(f"VALIDATION RESULTS:")
print(rule)
print(f"Macro F1: {macro_f1:.4f}")
print(f"Micro F1: {micro_f1:.4f}")

# ============================================================================
# CHANGE 6: Analyze predictions on validation set
# ============================================================================

print(f"\n{'='*50}")
print("VALIDATION SET ANALYSIS")
print(f"{'='*50}")

# Same decision rule as the metrics function: sigmoid, then a 0.5 threshold.
val_predictions = trainer.predict(val_dataset)
val_probs = expit(val_predictions.predictions)
val_preds = (val_probs > 0.5).astype(int)

# val_dataset.labels is a plain list of lists (built with .values.tolist()),
# so it cannot be sliced with [:, i]; convert it to a 2-D array once.
val_true = np.asarray(val_dataset.labels).astype(int)

# Per-class F1 scores
for i, label in enumerate(label_columns):
    # zero_division=0 scores a class with no positives as 0 without a warning.
    class_f1 = f1_score(val_true[:, i], val_preds[:, i], zero_division=0)
    print(f"{label}: F1 = {class_f1:.4f}")

# ============================================================================
# Predictions on dev set
# ============================================================================

# Dev split of subtask 2; only its 'text' and 'id' columns are used here.
unlabeled_dataset_df = SemEvalDataset(subtask=2, lang_key=LanguageType.ENG, split='dev')

texts, ids = (
    unlabeled_dataset_df.dataframe[column].tolist()
    for column in ('text', 'id')
)
# All-zero placeholder labels so the dataset class can be reused for inference.
labels = np.zeros((len(texts), 5))

unlabeled_dataset = PolarizationDataset(texts, labels, tokenizer, dtype=torch.float)

rule = "=" * 50
print("\n" + rule)
print("Making predictions on dev set...")
print(rule + "\n")

# Sigmoid per label, then threshold at 0.5.
predictions = trainer.predict(unlabeled_dataset)
pred_probs = expit(predictions.predictions)
pred_binary = (pred_probs > 0.5).astype(int)

# Output column -> position of that label among the model outputs, listed in
# the order the columns are written to the file.
output_position = {
    'political': 1,
    'racial/ethnic': 3,
    'religious': 2,
    'gender/sexual': 0,
    'other': 4,
}
results_df = pd.DataFrame({
    'id': ids,
    **{name: pred_binary[:, position] for name, position in output_position.items()},
})

results_df.to_csv('predictions.csv', index=False)

print(f"✓ Predictions saved to predictions.csv")
print(f"Total predictions: {len(results_df)}")
print(f"\nPrediction statistics:")
print(results_df[label_columns].sum())
print(f"\nFirst few predictions:")
print(results_df.head(10))