Download pretrained model.

In [None]:
!pip install --upgrade "accelerate>=0.26.0"

In [None]:
!pip install transformers

In [1]:
# Disable wandb to prevent interactive prompts
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "bert-base-multilingual-cased"
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Loading model and tokenizer...


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unpack training data.

In [None]:
!unzip subtask1.zip -d /content/

Load training data. (from POLAR@SEM-EVAL2026 https://polar-semeval.github.io/tasks.html)

In [6]:
import glob
import os
from datasets import load_dataset

# Check current directory
print(f"Current directory: {os.getcwd()}")

# Collect all CSV paths for train and dev
train_files = glob.glob("train/*.csv")
dev_files = glob.glob("dev/*.csv")

# Load them into a DatasetDict
dataset = load_dataset(
    "csv",
    data_files={"train": train_files, "test": dev_files},
    column_names=["id", "text", "label"]  # 'polarization' mapped to 'label' for Trainer compatibility
)

Current directory: /content


Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/22 [00:00<?, ?files/s]

Downloading data:   0%|          | 0/22 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Tokenize data.

In [7]:
import glob
import torch
from datasets import load_dataset
from transformers import AutoTokenizer

# 1. Setup Tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# 2. Identify Files - ONLY use train (dev has no labels!)
train_files = glob.glob("train/*.csv")

# 3. Load CSVs with proper header handling
raw_datasets = load_dataset(
    "csv",
    data_files={"train": train_files},
    # Don't override column_names - let it read the header
)

# Rename 'polarization' to 'label' for Trainer compatibility
raw_datasets = raw_datasets.rename_column("polarization", "label")

# 4. ROBUST PREPROCESSING
def clean_and_tokenize(examples):
    clean_labels = []
    for val in examples["label"]:
        try:
            clean_labels.append(int(str(val).strip()))
        except (ValueError, TypeError):
            clean_labels.append(0)

    tokenized = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

    tokenized["labels"] = clean_labels
    return tokenized

# 5. Apply preprocessing
tokenized_datasets = raw_datasets.map(
    clean_and_tokenize,
    batched=True,
    remove_columns=raw_datasets["train"].column_names
)

# 6. Split training data: 90% train, 10% eval (since dev has no labels)
split_datasets = tokenized_datasets["train"].train_test_split(test_size=0.1, seed=42)
shuffled_datasets = {
    "train": split_datasets["train"].shuffle(seed=42),
    "test": split_datasets["test"]  # This is our holdout for evaluation
}

print(f"Training samples: {len(shuffled_datasets['train'])}")
print(f"Evaluation samples: {len(shuffled_datasets['test'])}")
print("Sample labels:", shuffled_datasets["train"]["labels"][:20])

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/22 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/73681 [00:00<?, ? examples/s]

Training samples: 66312
Evaluation samples: 7369
Sample labels: [0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1]


Define evaluation metrics.

In [8]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # mBERT outputs logits; we take the argmax to get the 0 or 1 prediction
    predictions = np.argmax(logits, axis=-1)

    # 'macro' average treats both classes (0 and 1) as equally important
    # regardless of how many samples each has in your CSVs.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='macro'
    )
    acc = accuracy_score(labels, predictions)

    return {
        'accuracy': acc,
        'f1_macro': f1,
        'precision_macro': precision,
        'recall_macro': recall
    }

Test baseline performance. (on **GPU**!)

In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification
import pandas as pd
import glob
import os
from tqdm import tqdm

# Load model with classification head
model = AutoModelForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)

# Set up arguments for prediction
pred_args = TrainingArguments(
    output_dir="./baseline",
    per_device_eval_batch_size=32,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=pred_args,
)

# Create baseline output directory
os.makedirs("./baseline", exist_ok=True)

# Get all dev files (these have ids but no labels - for submission)
dev_files = glob.glob("dev/*.csv")

print(f"Generating baseline predictions for {len(dev_files)} languages...")

for dev_file in tqdm(dev_files):
    # Extract language code from filename (e.g., "dev/eng.csv" -> "eng")
    lang_code = os.path.basename(dev_file).replace('.csv', '')
    
    # Load dev data
    dev_df = pd.read_csv(dev_file)
    
    # Tokenize the text
    dev_encodings = tokenizer(
        dev_df['text'].tolist(),
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    
    # Create a simple dataset
    class SimpleDataset:
        def __init__(self, encodings):
            self.encodings = encodings
        
        def __getitem__(self, idx):
            return {key: val[idx] for key, val in self.encodings.items()}
        
        def __len__(self):
            return len(self.encodings['input_ids'])
    
    dev_dataset = SimpleDataset(dev_encodings)
    
    # Get predictions
    predictions = trainer.predict(dev_dataset)

    # DEBUG: Check the raw logits
    print(f"\nDEBUG - First 5 samples for {lang_code}:")
    print("Raw logits (class 0, class 1):")
    print(predictions.predictions[:5])
    print("\nArgmax predictions:")
    print(predictions.predictions.argmax(axis=-1)[:5])

    pred_labels = predictions.predictions.argmax(axis=-1)
    
    # Create submission dataframe
    submission_df = pd.DataFrame({
        'id': dev_df['id'],
        'polarization': pred_labels
    })
    
    # Save in submission format
    output_path = f"./baseline/pred_{lang_code}.csv"
    submission_df.to_csv(output_path, index=False)
    
    print(f"  ✓ {lang_code}: {len(submission_df)} predictions -> {output_path}")

print(f"\n✓ All baseline predictions saved to ./baseline/")
print(f"  Files: pred_{{lang_code}}.csv for each language")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Generating baseline predictions for 22 languages...


  0%|          | 0/22 [00:00<?, ?it/s]

  5%|▍         | 1/22 [00:00<00:08,  2.43it/s]

  ✓ rus: 167 predictions -> ./baseline/pred_rus.csv


  9%|▉         | 2/22 [00:00<00:10,  1.98it/s]

  ✓ khm: 332 predictions -> ./baseline/pred_khm.csv


 14%|█▎        | 3/22 [00:01<00:06,  2.81it/s]

  ✓ pan: 100 predictions -> ./baseline/pred_pan.csv


 18%|█▊        | 4/22 [00:01<00:05,  3.05it/s]

  ✓ deu: 159 predictions -> ./baseline/pred_deu.csv


 23%|██▎       | 5/22 [00:01<00:05,  3.19it/s]

  ✓ spa: 165 predictions -> ./baseline/pred_spa.csv


 27%|██▋       | 6/22 [00:02<00:05,  3.14it/s]

  ✓ hau: 182 predictions -> ./baseline/pred_hau.csv


 32%|███▏      | 7/22 [00:02<00:04,  3.64it/s]

  ✓ nep: 100 predictions -> ./baseline/pred_nep.csv


 36%|███▋      | 8/22 [00:02<00:03,  3.69it/s]

  ✓ mya: 144 predictions -> ./baseline/pred_mya.csv


 41%|████      | 9/22 [00:02<00:03,  3.97it/s]

  ✓ tel: 118 predictions -> ./baseline/pred_tel.csv


 45%|████▌     | 10/22 [00:03<00:03,  3.48it/s]

  ✓ zho: 214 predictions -> ./baseline/pred_zho.csv


 50%|█████     | 11/22 [00:03<00:03,  3.44it/s]

  ✓ arb: 169 predictions -> ./baseline/pred_arb.csv


 55%|█████▍    | 12/22 [00:03<00:02,  3.35it/s]

  ✓ urd: 177 predictions -> ./baseline/pred_urd.csv


 59%|█████▉    | 13/22 [00:03<00:02,  3.54it/s]

  ✓ hin: 137 predictions -> ./baseline/pred_hin.csv


 64%|██████▎   | 14/22 [00:04<00:02,  3.86it/s]

  ✓ tur: 115 predictions -> ./baseline/pred_tur.csv


 68%|██████▊   | 15/22 [00:04<00:01,  3.69it/s]

  ✓ ita: 166 predictions -> ./baseline/pred_ita.csv


 73%|███████▎  | 16/22 [00:04<00:01,  3.55it/s]

  ✓ ben: 166 predictions -> ./baseline/pred_ben.csv


 77%|███████▋  | 17/22 [00:04<00:01,  3.86it/s]

  ✓ ori: 118 predictions -> ./baseline/pred_ori.csv


 82%|████████▏ | 18/22 [00:05<00:01,  3.81it/s]

  ✓ eng: 160 predictions -> ./baseline/pred_eng.csv


 86%|████████▋ | 19/22 [00:05<00:01,  2.76it/s]

  ✓ swa: 349 predictions -> ./baseline/pred_swa.csv


 91%|█████████ | 20/22 [00:06<00:00,  2.92it/s]

  ✓ fas: 164 predictions -> ./baseline/pred_fas.csv


 95%|█████████▌| 21/22 [00:06<00:00,  3.06it/s]

  ✓ amh: 166 predictions -> ./baseline/pred_amh.csv


100%|██████████| 22/22 [00:06<00:00,  3.32it/s]

  ✓ pol: 119 predictions -> ./baseline/pred_pol.csv

✓ All baseline predictions saved to ./baseline/
  Files: pred_{lang_code}.csv for each language





Train the classification head.

In [10]:
import gc
import torch

# Delete any existing model/trainer from previous cells
for var_name in ['model', 'trainer', 'baseline_results']:
    if var_name in dir():
        exec(f"del {var_name}")

# Force garbage collection
gc.collect()

# Clear all CUDA memory
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.synchronize()
    
    # Show current memory state
    allocated = torch.cuda.memory_allocated() / 1e9
    reserved = torch.cuda.memory_reserved() / 1e9
    total = torch.cuda.get_device_properties(0).total_memory / 1e9
    
    print(f"GPU Memory after cleanup:")
    print(f"  Allocated: {allocated:.2f} GB")
    print(f"  Reserved:  {reserved:.2f} GB")
    print(f"  Total:     {total:.2f} GB")
    print(f"  Free:      {total - reserved:.2f} GB")

GPU Memory after cleanup:
  Allocated: 0.01 GB
  Reserved:  0.02 GB
  Total:     15.83 GB
  Free:      15.81 GB


In [11]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification
import torch
import os

# Memory optimization environment variable
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Clear CUDA cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

# Load fresh model with classification head
model = AutoModelForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)

# Training arguments - SPEED OPTIMIZED for T4 GPU with clean memory
training_args = TrainingArguments(
    output_dir="./mbert_polarization",
    
    # Training settings - MAXIMIZED for speed
    num_train_epochs=3,
    per_device_train_batch_size=16,   # Increased from 4
    per_device_eval_batch_size=32,    # Increased from 8
    gradient_accumulation_steps=2,    # Reduced: effective batch = 16 * 2 = 32
    
    # Learning rate schedule
    learning_rate=2e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    
    # Evaluation & saving - less frequent = faster
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    
    # Logging - less frequent
    logging_steps=200,
    report_to="none",
    
    # T4 SPEED OPTIMIZATIONS
    fp16=True,                        # Tensor Cores = 2-3x faster
    bf16=False,                       # T4 doesn't support bf16
    dataloader_pin_memory=True,       # Faster CPU->GPU transfer
    dataloader_num_workers=4,         # More parallel data loading
    optim="adamw_torch_fused",        # Fused optimizer (faster)
    torch_compile=False,              # Skip compilation overhead
    
    # No gradient checkpointing = faster (uses more memory but we have it now)
    gradient_checkpointing=False,
    max_grad_norm=1.0,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=shuffled_datasets["train"],
    eval_dataset=shuffled_datasets["test"],
    compute_metrics=compute_metrics,
)

# Train!
print("Starting training (T4 GPU - SPEED optimized)...")
print(f"Using device: {trainer.args.device}")
print(f"Batch size: {training_args.per_device_train_batch_size} x {training_args.gradient_accumulation_steps} = {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
trainer.train()

# Final evaluation
final_results = trainer.evaluate()
print(f"\n=== Training Complete ===")
print(f"Final Macro F1: {final_results['eval_f1_macro']:.4f}")
print(f"Final Accuracy: {final_results['eval_accuracy']:.4f}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training (T4 GPU - SPEED optimized)...
Using device: cuda:0
Batch size: 16 x 2 = 32




Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
import torch

print("=== GPU Check ===")
print(f"PyTorch version: {torch.__version__}")

# CUDA (NVIDIA GPU)
print(f"\nCUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# MPS (Apple Silicon)
print(f"\nMPS available: {torch.backends.mps.is_available()}")
if torch.backends.mps.is_available():
    print("MPS device: Apple Silicon GPU")

# What will be used
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"\n→ Training will use: {device.upper()}")