In [None]:
# Colab only: mount Google Drive at /content/drive; the data-loading cells read
# their CSV files from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip -q install datasets sentencepiece scikit-learn accelerate --upgrade

import os, math, random, platform
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from datasets import Dataset
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score, accuracy_score

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    get_linear_schedule_with_warmup,
)

def set_seed(seed=42):
    """Seed every random number generator used by this notebook.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy, and PyTorch
            (CPU generator and all CUDA devices).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Fix all RNG seeds before any data shuffling or model initialisation happens.
set_seed(42)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

import transformers, inspect

# Log interpreter / library versions so the run can be reproduced later.
for caption, version in (
    ("Python:", platform.python_version()),
    ("Transformers:", transformers.__version__),
):
    print(caption, version)

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/511.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m501.8/511.6 kB[0m [31m21.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m132.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 22.0.0 which is incompatible.
pylibcudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "

In [None]:
import pandas as pd
import glob

train_dir = "/content/drive/MyDrive/subtask1/train/"   # <--- UPDATE PATH

# glob returns files in arbitrary order; sort so the concatenated row order (and
# therefore the seeded train/validation split) is the same on every run.
train_files = sorted(glob.glob(os.path.join(train_dir, "*.csv")))
if not train_files:
    raise FileNotFoundError(f"No CSV files found in {train_dir!r}")

train_dfs = []
for file in train_files:
    df = pd.read_csv(file)

    # Language code = file name without directory and extension, cut at the first
    # underscore (e.g. ".../train/eng.csv" -> "eng"). Splitting the full path on
    # "." stored the whole directory path in this column.
    file_stem = os.path.splitext(os.path.basename(file))[0]
    df["language"] = file_stem.split("_")[0]

    train_dfs.append(df)

# One frame holding every language, with a fresh 0..N-1 index.
train_full = pd.concat(train_dfs, ignore_index=True)

print("Loaded Training Languages:", train_full["language"].unique())
print("Training Shape:", train_full.shape)


Loaded Training Languages: ['/content/drive/MyDrive/subtask1/train/urd'
 '/content/drive/MyDrive/subtask1/train/spa'
 '/content/drive/MyDrive/subtask1/train/nep'
 '/content/drive/MyDrive/subtask1/train/tur'
 '/content/drive/MyDrive/subtask1/train/amh'
 '/content/drive/MyDrive/subtask1/train/fas'
 '/content/drive/MyDrive/subtask1/train/arb'
 '/content/drive/MyDrive/subtask1/train/hin'
 '/content/drive/MyDrive/subtask1/train/zho'
 '/content/drive/MyDrive/subtask1/train/deu'
 '/content/drive/MyDrive/subtask1/train/hau'
 '/content/drive/MyDrive/subtask1/train/ita'
 '/content/drive/MyDrive/subtask1/train/eng']
Training Shape: (40395, 4)


In [None]:
test_dir = "/content/drive/MyDrive/subtask1/dev/"   # <--- UPDATE PATH

# glob returns files in arbitrary order; sort so test rows (and any predictions
# aligned with them) come out in the same order on every run.
test_files = sorted(glob.glob(os.path.join(test_dir, "*.csv")))
if not test_files:
    raise FileNotFoundError(f"No CSV files found in {test_dir!r}")

test_dfs = []
for file in test_files:
    df = pd.read_csv(file)

    # Language code = file name without directory and extension, cut at the first
    # underscore (e.g. ".../dev/eng.csv" -> "eng"). Splitting the full path on "."
    # stored the whole directory path in this column.
    file_stem = os.path.splitext(os.path.basename(file))[0]
    df["language"] = file_stem.split("_")[0]

    test_dfs.append(df)

# One frame holding every language, with a fresh 0..N-1 index.
test_df = pd.concat(test_dfs, ignore_index=True)

print("Loaded Test Languages:", test_df["language"].unique())
print("Test Shape:", test_df.shape)

Loaded Test Languages: ['/content/drive/MyDrive/subtask1/dev/nep'
 '/content/drive/MyDrive/subtask1/dev/ita'
 '/content/drive/MyDrive/subtask1/dev/arb'
 '/content/drive/MyDrive/subtask1/dev/hau'
 '/content/drive/MyDrive/subtask1/dev/spa'
 '/content/drive/MyDrive/subtask1/dev/hin'
 '/content/drive/MyDrive/subtask1/dev/tur'
 '/content/drive/MyDrive/subtask1/dev/urd'
 '/content/drive/MyDrive/subtask1/dev/deu'
 '/content/drive/MyDrive/subtask1/dev/amh'
 '/content/drive/MyDrive/subtask1/dev/zho'
 '/content/drive/MyDrive/subtask1/dev/fas'
 '/content/drive/MyDrive/subtask1/dev/eng']
Test Shape: (2012, 4)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the training data for validation. Stratifying on the
# (polarization, language) pair keeps both the label balance and the language
# mix the same in the two splits.
stratify_columns = train_full[["polarization", "language"]]
train_df, val_df = train_test_split(
    train_full,
    stratify=stratify_columns,
    test_size=0.15,
    random_state=42,
)

for split_caption, split_frame in (("Train:", train_df), ("Validation:", val_df)):
    print(split_caption, split_frame.shape)

Train: (34335, 4)
Validation: (6060, 4)


In [None]:
# Maximum number of tokens per example; longer texts are truncated by
# tokenize_function.
max_length = 256

# Pretrained checkpoint used for both the tokenizer and the classifier.
model_name = "xlm-roberta-base"   # ok for 12GB VRAM
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

def tokenize_function(batch):
    """Tokenize the ``text`` field of a batch with the module-level tokenizer.

    Sequences are truncated to ``max_length`` tokens and deliberately left
    unpadded here; padding is applied later, per batch, by the data collator.

    Args:
        batch: Mapping with a ``"text"`` entry holding the raw strings.

    Returns:
        The tokenizer output for ``batch["text"]``.
    """
    encode_options = {
        "truncation": True,
        "padding": False,
        "max_length": max_length,
    }
    return tokenizer(batch["text"], **encode_options)

In [None]:
# Build HF datasets
train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
val_ds   = Dataset.from_pandas(val_df.reset_index(drop=True))
test_ds  = Dataset.from_pandas(test_df.reset_index(drop=True))

# Tokenize
train_ds = train_ds.map(tokenize_function, batched=True)
val_ds   = val_ds.map(tokenize_function, batched=True)
test_ds  = test_ds.map(tokenize_function, batched=True)

# Keep only numeric/tensor-friendly cols for the loaders
label_column = "polarization"
model_input_columns = ["input_ids", "attention_mask"]
keep_train = model_input_columns + [label_column]
keep_val   = model_input_columns + [label_column]

# For test: keep the label column only when it exists AND is fully populated.
# The label column in this data is "polarization" (the old check looked for
# "label", which never exists, so test labels were always dropped). A column of
# missing values cannot be used as labels, so it is dropped as well.
test_has_labels = (
    label_column in test_df.columns
    and bool(test_df[label_column].notna().all())
)
if test_has_labels:
    keep_test = model_input_columns + [label_column]
else:
    keep_test = list(model_input_columns)

train_ds_clean = train_ds.remove_columns([c for c in train_ds.column_names if c not in keep_train])
val_ds_clean   = val_ds.remove_columns([c for c in val_ds.column_names if c not in keep_val])
test_ds_clean  = test_ds.remove_columns([c for c in test_ds.column_names if c not in keep_test])

# Pads each batch to the longest sequence in that batch (tokenization left the
# examples unpadded).
data_collator = DataCollatorWithPadding(tokenizer)

Map:   0%|          | 0/34335 [00:00<?, ? examples/s]

Map:   0%|          | 0/6060 [00:00<?, ? examples/s]

Map:   0%|          | 0/2012 [00:00<?, ? examples/s]

In [None]:
# Hyperparameters (declare BEFORE creating loaders)

# --- Batching ---
# Each optimizer update sees per_device_train_batch_size * grad_accumulation
# = 32 training examples.
per_device_train_batch_size = 16
per_device_eval_batch_size  = 64
grad_accumulation = 2

# --- Optimisation ---
learning_rate = 3e-5
weight_decay  = 0.02
max_grad_norm = 1.0    # gradient-norm clipping threshold

# --- Schedule ---
num_epochs   = 6
warmup_ratio = 0.06    # fraction of all optimizer updates spent in linear warmup

In [None]:
# Options shared by all three loaders: dynamic per-batch padding through the
# collator and pinned host memory.
shared_loader_options = dict(collate_fn=data_collator, pin_memory=True)

# Only the training loader shuffles; validation/test keep dataset order so that
# predictions line up with the source rows.
train_loader = DataLoader(
    train_ds_clean,
    batch_size=per_device_train_batch_size,
    shuffle=True,
    **shared_loader_options,
)

val_loader = DataLoader(
    val_ds_clean,
    batch_size=per_device_eval_batch_size,
    shuffle=False,
    **shared_loader_options,
)

test_loader = DataLoader(
    test_ds_clean,
    batch_size=per_device_eval_batch_size,
    shuffle=False,
    **shared_loader_options,
)

In [None]:
# Derive the label set and "balanced" per-class loss weights from the training
# split only.
train_label_series = train_df["polarization"]

classes = np.array(sorted(train_label_series.unique()), dtype=int)
num_labels = len(classes)

balanced_weights = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=train_label_series.values,
)
# Stored as a float tensor on the training device so it can be passed to the loss.
class_weights = torch.tensor(balanced_weights, dtype=torch.float, device=device)

print("Classes:", classes)
print("Class weights:", class_weights)

Classes: [0 1]
Class weights: tensor([1.0419, 0.9613], device='cuda:0')


In [None]:
from transformers import AutoModelForSequenceClassification

# Sequence classifier on top of the pretrained encoder; the classification head
# is newly initialised and trained below.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    problem_type="single_label_classification"
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, betas=(0.9,0.98), weight_decay=weight_decay)

# steps calc AFTER loaders exist
# One optimizer update per `grad_accumulation` batches, rounded up.
updates_per_epoch = math.ceil(len(train_loader) / grad_accumulation)
total_training_steps = updates_per_epoch * num_epochs
warmup_steps = int(warmup_ratio * total_training_steps)

# Linear warmup for `warmup_steps` updates, then linear decay to zero.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_training_steps
)

# torch.cuda.amp.GradScaler is deprecated (it raised a FutureWarning in this
# notebook); torch.amp.GradScaler("cuda", ...) is the replacement. Loss scaling
# is a no-op when CUDA is unavailable.
scaler = torch.amp.GradScaler("cuda", enabled=torch.cuda.is_available())
# Class-weighted cross-entropy.
ce_loss = nn.CrossEntropyLoss(weight=class_weights)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())


In [None]:
@torch.no_grad()
def evaluate(model, data_loader, expect_labels=True, label_key="polarization"):
    """Run inference over a loader and optionally score the predictions.

    The model is switched to eval mode and left in it; callers that continue
    training must call ``model.train()`` again.

    Args:
        model: Classifier whose output exposes ``.logits``.
        data_loader: Iterable of batches (dicts of tensors) that contain
            ``input_ids`` and ``attention_mask``.
        expect_labels: When True, gold labels are collected from each batch that
            has ``label_key`` and metrics are computed.
        label_key: Name of the batch entry holding the gold labels.

    Returns:
        ``(macro_f1, accuracy, predictions, labels)``. ``macro_f1``, ``accuracy``
        and ``labels`` are ``None`` when labels were not requested or none were
        found in the batches. ``predictions`` is a 1-D NumPy array of argmax
        class ids (empty when the loader yields no batches).
    """
    model.eval()
    pred_chunks, label_chunks = [], []
    for batch in data_loader:
        # Forward only the model inputs; labels stay on the CPU for scoring.
        inputs = {k: v.to(device) for k, v in batch.items() if k in ("input_ids", "attention_mask")}
        logits = model(**inputs).logits
        pred_chunks.append(torch.argmax(logits, dim=-1).cpu().numpy())
        if expect_labels and label_key in batch:
            label_chunks.append(batch[label_key].cpu().numpy())

    # np.concatenate raises on an empty list, so handle an empty loader explicitly.
    if not pred_chunks:
        return None, None, np.empty(0, dtype=np.int64), None

    all_preds = np.concatenate(pred_chunks)
    if expect_labels and len(label_chunks) > 0:
        all_labels = np.concatenate(label_chunks)
        f1m = f1_score(all_labels, all_preds, average="macro")
        acc = accuracy_score(all_labels, all_preds)
        return f1m, acc, all_preds, all_labels
    return None, None, all_preds, None


def per_language_f1(labels_all: np.ndarray, preds_all: np.ndarray, langs_all: np.ndarray):
    """Compute macro-F1 separately for every language.

    Args:
        labels_all: Gold labels, one per example.
        preds_all: Predicted labels, aligned with ``labels_all``.
        langs_all: Language identifier of each example, aligned with the above.

    Returns:
        Dict mapping each distinct language (in sorted order) to the macro-F1
        computed over that language's examples only.
    """
    f1_by_language = {}
    for language in sorted(np.unique(langs_all)):
        # Boolean mask selecting this language's rows; never empty because the
        # language was taken from langs_all itself.
        in_language = langs_all == language
        f1_by_language[language] = f1_score(
            labels_all[in_language],
            preds_all[in_language],
            average="macro",
        )
    return f1_by_language

In [None]:
# Sanity check: columns before and after stripping the non-tensor fields.
for dataset_view in (val_ds, val_ds_clean):
    print(dataset_view.column_names)

['id', 'text', 'polarization', 'language', 'input_ids', 'attention_mask']
['polarization', 'input_ids', 'attention_mask']


In [None]:
save_dir = "./results_multilingual_best"
os.makedirs(save_dir, exist_ok=True)

patience = 3              # epochs without validation improvement before stopping
best_f1 = -1.0
epochs_no_improve = 0

use_amp = torch.cuda.is_available()   # mixed precision only when running on CUDA
num_batches = len(train_loader)

for epoch in range(1, num_epochs+1):
    model.train()   # evaluate() leaves the model in eval mode
    running_loss = 0.0
    optimizer.zero_grad(set_to_none=True)

    pbar = tqdm(enumerate(train_loader), total=num_batches, desc=f"Epoch {epoch}/{num_epochs}")
    for step, batch in pbar:

        # The label column in this dataset is "polarization".
        labels = batch["polarization"].to(device)

        inputs = {
            k: v.to(device)
            for k, v in batch.items()
            if k in ("input_ids", "attention_mask")
        }

        # torch.cuda.amp.autocast is deprecated; torch.amp.autocast is the replacement.
        with torch.amp.autocast(device_type=device.type, enabled=use_amp):
            outputs = model(**inputs)
            # Divide so gradients accumulated over the window average, not sum.
            loss = ce_loss(outputs.logits, labels) / grad_accumulation

        scaler.scale(loss).backward()
        # Undo the accumulation scaling so the progress bar shows the true
        # per-batch loss.
        running_loss += loss.item() * grad_accumulation

        # Step at the end of every accumulation window, and also on the last
        # batch of the epoch so a trailing partial window is not discarded.
        # This matches the ceil() used to size the scheduler.
        # A partial window is still divided by grad_accumulation above, so its
        # update is slightly smaller than a full one.
        is_last_batch = (step + 1) == num_batches
        if (step + 1) % grad_accumulation == 0 or is_last_batch:
            # Unscale before clipping so the threshold applies to real gradient norms.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
            scheduler.step()

        pbar.set_postfix({"loss": f"{running_loss / (step+1):.4f}"})

    # Validation after each epoch
    val_f1, val_acc, _, _ = evaluate(model, val_loader, expect_labels=True)
    print(f"\nEpoch {epoch}: Val Macro-F1={val_f1:.4f} | Val Acc={val_acc:.4f}")

    # Early stopping + best checkpoint
    if val_f1 is not None and val_f1 > best_f1:
        best_f1 = val_f1
        epochs_no_improve = 0
        model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)
        print(f"💾 Saved new best model to {save_dir} (F1={best_f1:.4f})")
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f"⏹ Early stopping at epoch {epoch} (best F1={best_f1:.4f})")
            break

print("Best Val Macro-F1:", best_f1)

Epoch 1/6:   0%|          | 0/2146 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):



Epoch 1: Val Macro-F1=0.7823 | Val Acc=0.7848
💾 Saved new best model to ./results_multilingual_best (F1=0.7823)


Epoch 2/6:   0%|          | 0/2146 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):



Epoch 2: Val Macro-F1=0.7846 | Val Acc=0.7847
💾 Saved new best model to ./results_multilingual_best (F1=0.7846)


Epoch 3/6:   0%|          | 0/2146 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):



Epoch 3: Val Macro-F1=0.7876 | Val Acc=0.7876
💾 Saved new best model to ./results_multilingual_best (F1=0.7876)


Epoch 4/6:   0%|          | 0/2146 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):



Epoch 4: Val Macro-F1=0.7835 | Val Acc=0.7840


Epoch 5/6:   0%|          | 0/2146 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):



Epoch 5: Val Macro-F1=0.7846 | Val Acc=0.7847


Epoch 6/6:   0%|          | 0/2146 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):



Epoch 6: Val Macro-F1=0.7846 | Val Acc=0.7847
⏹ Early stopping at epoch 6 (best F1=0.7876)
Best Val Macro-F1: 0.7876220910472915


In [None]:
# Reload best checkpoint
best_model = AutoModelForSequenceClassification.from_pretrained(save_dir).to(device)

# Validation
val_f1, val_acc, _, _ = evaluate(best_model, val_loader, expect_labels=True)
print(f"Best model - Val Macro-F1={val_f1:.4f} | Val Acc={val_acc:.4f}")

# Test overall (expects labels if present)
# Labels live in the "polarization" column, which is also the key evaluate()
# reads. Checking for "label" was always False, so test metrics were never
# computed.
expect_labels_test = "polarization" in test_ds_clean.column_names
test_f1, test_acc, test_preds, test_labels = evaluate(best_model, test_loader, expect_labels=expect_labels_test)

if expect_labels_test:
    print(f"Test Macro-F1={test_f1:.4f} | Test Acc={test_acc:.4f}")

    # Per-language using pandas test_df (kept original order; the test loader
    # does not shuffle, so predictions align row-for-row with test_df)
    langs_test = np.array(test_df["language"])
    per_lang = per_language_f1(test_labels, test_preds, langs_test)
    print("Per-language F1:", per_lang)
    print("Average across languages:", np.mean(list(per_lang.values())))
else:
    print("Test predictions computed (no labels present).")

Best model - Val Macro-F1=0.7876 | Val Acc=0.7876
Test predictions computed (no labels present).
