In [1]:
# Install gdown if needed
# Replace the ID below with your actual file ID from the Drive link
# (The ID is the long string of random characters in the URL)
file_id = '1xQvIBwYFTqJRfU77LSqIrRJlsnpdMQGv'
url = f'https://drive.google.com/uc?id={file_id}'
output = 'dev_phase.zip'

!gdown {url} -O {output}

!unzip {output}

# Delete __MACOSX directory (if exists) and the dev_phase.zip file (cleanup)
import os
import shutil

if os.path.exists("__MACOSX"):
    shutil.rmtree("__MACOSX")

if os.path.exists("dev_phase.zip"):
    os.remove("dev_phase.zip")

Downloading...
From: https://drive.google.com/uc?id=1xQvIBwYFTqJRfU77LSqIrRJlsnpdMQGv
To: /content/dev_phase.zip
100% 12.0M/12.0M [00:00<00:00, 44.2MB/s]
Archive:  dev_phase.zip
   creating: dev_phase/
   creating: dev_phase/subtask2/
   creating: dev_phase/subtask3/
   creating: dev_phase/subtask1/
   creating: dev_phase/subtask2/train/
   creating: dev_phase/subtask2/dev/
   creating: dev_phase/subtask3/train/
   creating: dev_phase/subtask3/dev/
   creating: dev_phase/subtask1/train/
   creating: dev_phase/subtask1/dev/
  inflating: dev_phase/subtask2/train/arb.csv  
  inflating: dev_phase/subtask2/train/fas.csv  
  inflating: dev_phase/subtask2/train/eng_augmented.csv  
  inflating: __MACOSX/dev_phase/subtask2/train/._eng_augmented.csv  
  inflating: dev_phase/subtask2/train/zho.csv  
  inflating: dev_phase/subtask2/train/nep.csv  
  inflating: dev_phase/subtask2/train/spa.csv  
  inflating: dev_phase/subtask2/train/ita.csv  
  inflating: dev_phase/subtask2/train/urd.csv  
  inflat

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from transformers import AutoModel
# from logs import log

In [3]:
import wandb

# Start wandb in disabled mode so this run records nothing.
wandb.init(mode="disabled")

# ---- Experiment configuration ----
NUM_TYPES = 5             # number of subtask-2 outputs (second head / label slice)
NUM_MANIFESTATIONS = 6    # number of subtask-3 outputs (third head / label slice)
datasets_merge = True     # True: merge the three subtask frames into one multi-label dataset
lang = "arb"              # language code used in the data file names
trial_id = "0000NG3"      # names the ./results/<trial_id> output directory
# Candidate checkpoints; the last entry is the one used below.
model_names = [
    'bert-base-uncased',
    "UBC-NLP/MARBERTv2",
    "microsoft/deberta-v3-base",
    "FacebookAI/xlm-roberta-large",
    "0ssamaak0/roberta-base-LEGO_emotions",
    "FacebookAI/roberta-base",
    "cardiffnlp/twitter-roberta-base",
    "cardiffnlp/roberta-base-emotion",
    "UBC-NLP/ARBERTv2",
]
model_name = model_names[-1]

In [4]:
# Read the augmented training CSV of every subtask, plus the subtask-1 dev CSV,
# for the configured language.
train_1 = pd.read_csv(f"./dev_phase/subtask1/train/{lang}_augmented.csv")
train_2 = pd.read_csv(f"./dev_phase/subtask2/train/{lang}_augmented.csv")
train_3 = pd.read_csv(f"./dev_phase/subtask3/train/{lang}_augmented.csv")
dev_df = pd.read_csv(f"./dev_phase/subtask1/dev/{lang}.csv")

In [5]:
import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

import torch

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from torch.utils.data import Dataset
from tqdm.auto import tqdm

In [6]:
class PolarizationDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per item and attaches its labels.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of per-example labels, parallel to ``texts``.
            Each entry is converted to a float tensor.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding=False, max_length=..., return_tensors='pt')``. It is expected
            to return a mapping of tensors with a leading batch dimension of size 1.
        max_length: Maximum sequence length forwarded to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors plus a float ``labels`` tensor."""
        text = self.texts[idx]
        label = self.labels[idx]
        # No padding here: the data collator is expected to pad each batch dynamically.
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors='pt',
        )

        # Drop only the leading batch dimension. A bare ``squeeze()`` would also
        # collapse a length-1 sequence into a 0-d tensor, which cannot be padded.
        item = {key: encoding[key].squeeze(0) for key in encoding.keys()}
        item['labels'] = torch.tensor(label, dtype=torch.float)
        return item

In [7]:
from sklearn.model_selection import train_test_split
# Fetch the tokenizer for the selected checkpoint; force_download=True re-downloads
# the files instead of using a cached copy.
tokenizer = AutoTokenizer.from_pretrained(model_name, force_download=True)

# Per-subtask dataset holders; they are only filled when datasets_merge is False.
train_datasets, val_datasets = [], []

# Label columns are whatever a frame has besides the 'id' and 'text' columns.
def get_label_columns(df):
    """Return the column names of ``df`` other than 'id' and 'text', in frame order."""
    label_names = []
    for name in df.columns:
        if name in ('id', 'text'):
            continue
        label_names.append(name)
    return label_names

# Build the train/validation datasets from a single 80/20 split.
if datasets_merge:
    # Outer-join the three subtask frames on ('id', 'text') so each row carries the
    # labels of every subtask. Suffixes only apply to label names that collide.
    merged = train_1.merge(train_2, on=['id', 'text'], how='outer', suffixes=('_1', '_2'))
    merged = merged.merge(train_3, on=['id', 'text'], how='outer', suffixes=('', '_3'))
    # The split must be sized from the frame that is actually indexed below: an outer
    # join can return more rows than train_1 when the files do not list identical
    # (id, text) pairs, and indices sized from train_1 would silently drop the surplus.
    # NOTE(review): rows missing from one of the files would carry NaN labels after
    # the outer join — verify the three files cover the same examples.
    n_samples = len(merged)
else:
    n_samples = len(train_1)

# Split indices once and reuse them everywhere so all datasets share the same split.
indices = np.arange(n_samples)
train_indices, val_indices = train_test_split(
    indices,
    test_size=0.2,
    random_state=42
)

if datasets_merge:
    # Get label columns: all columns excluding 'id' and 'text'.
    merged_label_columns = get_label_columns(merged)
    texts = merged['text'].tolist()
    labels = merged[merged_label_columns].values.tolist()
    texts_train = [texts[i] for i in train_indices]
    texts_val = [texts[i] for i in val_indices]
    labels_train = [labels[i] for i in train_indices]
    labels_val = [labels[i] for i in val_indices]
    train_dataset = PolarizationDataset(texts_train, labels_train, tokenizer)
    val_dataset = PolarizationDataset(texts_val, labels_val, tokenizer)
else:
    # Apply the same positional split to each of the three subtask frames.
    for train in [train_1, train_2, train_3]:
        current_label_columns = get_label_columns(train)
        texts = train['text'].tolist()

        texts_train = [texts[i] for i in train_indices]
        texts_val = [texts[i] for i in val_indices]

        if current_label_columns:
            labels = train[current_label_columns].values.tolist()
            labels_train = [labels[i] for i in train_indices]
            labels_val = [labels[i] for i in val_indices]
        else:
            # Frame has no label columns: use one empty label list per example.
            labels_train = [[] for _ in texts_train]
            labels_val = [[] for _ in texts_val]

        train_datasets.append(PolarizationDataset(texts_train, labels_train, tokenizer))
        val_datasets.append(PolarizationDataset(texts_val, labels_val, tokenizer))

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/485 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/485 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/485 [00:00<?, ?B/s]

In [8]:
def get_pos_weights(labels_matrix):
    """Return per-column negative/positive count ratios as a float tensor.

    Args:
        labels_matrix: List of lists or numpy array of 0/1 labels, one row per example.

    Returns:
        1-D float tensor with one weight per label column, e.g. 10 positives and
        90 negatives give roughly 9.0.
    """
    label_array = np.array(labels_matrix)
    positives = label_array.sum(axis=0)
    negatives = len(label_array) - positives
    # The 1e-5 in the denominator avoids a division by zero for columns with no positives.
    ratio = negatives / (positives + 1e-5)
    return torch.tensor(ratio, dtype=torch.float)

# Constant 5x positive-class weights, one entry per subtask-2 / subtask-3 label.
# NOTE: both names are reassigned from data-derived weights before the model is built.
pos_weight_2 = torch.full((NUM_TYPES,), 5.0)
pos_weight_3 = torch.full((NUM_MANIFESTATIONS,), 5.0)

In [9]:
class SharedMTLModel(nn.Module):
    """One shared encoder feeding three linear heads, one per subtask.

    The heads produce 1, ``num_types`` and ``num_manifestations`` logits, which are
    concatenated in that order along the last dimension.

    Args:
        model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
        num_types: Output width of the second head.
        num_manifestations: Output width of the third head.
        pos_weight_2: Optional ``pos_weight`` tensor for the second head's loss.
        pos_weight_3: Optional ``pos_weight`` tensor for the third head's loss.
    """

    def __init__(self, model_name, num_types, num_manifestations, pos_weight_2=None, pos_weight_3=None):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.num_types = num_types
        self.num_manifestations = num_manifestations

        self.dropout = nn.Dropout(0.2)

        width = self.encoder.config.hidden_size
        self.head1 = nn.Linear(width, 1)
        self.head2 = nn.Linear(width, num_types)
        self.head3 = nn.Linear(width, num_manifestations)

        # A missing weight is stored as an empty buffer; forward() treats an empty
        # buffer as "use the unweighted loss".
        for buffer_name, weight in (("pos_weight_2", pos_weight_2), ("pos_weight_3", pos_weight_3)):
            self.register_buffer(buffer_name, torch.tensor([]) if weight is None else weight)

    def _bce(self, head_logits, target, pos_weight, device):
        """BCE-with-logits loss, weighted by ``pos_weight`` when it is non-empty."""
        if pos_weight is not None and pos_weight.numel() > 0:
            criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight.to(device))
        else:
            criterion = nn.BCEWithLogitsLoss()
        return criterion(head_logits, target)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Run the encoder and all heads; also compute the loss when labels are given.

        Returns:
            Dict with ``"logits"`` (the concatenated head outputs) and ``"loss"``
            (mean of the three per-head BCE losses, or ``None`` without labels).
        """
        encoded = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Pool by taking the hidden state at sequence position 0.
        pooled = self.dropout(encoded.last_hidden_state[:, 0, :])

        logits1 = self.head1(pooled)
        logits2 = self.head2(pooled)
        logits3 = self.head3(pooled)
        logits = torch.cat([logits1, logits2, logits3], dim=-1)

        if labels is None:
            return {"loss": None, "logits": logits}

        # Label layout mirrors the logits: [subtask 1 | num_types columns | the rest].
        targets = labels.float()
        boundary = 1 + self.num_types
        device = logits1.device

        loss1 = self._bce(logits1, targets[:, :1], None, device)
        loss2 = self._bce(logits2, targets[:, 1:boundary], self.pos_weight_2, device)
        loss3 = self._bce(logits3, targets[:, boundary:], self.pos_weight_3, device)

        loss = (loss1 + loss2 + loss3) / 3.0
        return {"loss": loss, "logits": logits}


In [10]:
def compute_pos_weights(df, label_cols):
    """Compute a negative/positive ratio for each label column of ``df``.

    Args:
        df: DataFrame holding the label columns.
        label_cols: Column names (list or Index) selecting the label columns.

    Returns:
        1-D float tensor with one weight per selected column.
    """
    label_values = df[label_cols].values
    positives = label_values.sum(axis=0)
    # Negatives are counted as the entries exactly equal to 0.
    negatives = (label_values == 0).sum(axis=0)
    # The 1e-5 in the denominator avoids a division by zero for columns with no positives.
    ratio = negatives / (positives + 1e-5)
    return torch.tensor(ratio, dtype=torch.float)

# Data-derived positive-class weights for subtasks 2 and 3. The label columns are
# taken positionally: everything after the first two columns of each frame.
pos_weight_2, pos_weight_3 = (
    compute_pos_weights(frame, frame.columns[2:]) for frame in (train_2, train_3)
)

# Instantiate the shared multi-task model with those weights.
model = SharedMTLModel(
    model_name,
    num_types=NUM_TYPES,
    num_manifestations=NUM_MANIFESTATIONS,
    pos_weight_2=pos_weight_2,
    pos_weight_3=pos_weight_3,
)

config.json:   0%|          | 0.00/753 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/654M [00:00<?, ?B/s]

In [11]:
def compute_metrics(eval_pred):
    """Compute macro-F1 for each subtask from concatenated multi-task logits.

    Logits are passed through a sigmoid and thresholded at 0.5. Subtask 2 and 3
    predictions are then multiplied by the subtask-1 prediction, so a row predicted
    0 for subtask 1 has all of its subtask 2/3 predictions forced to 0.

    Args:
        eval_pred: Object exposing ``predictions`` (logits, or a tuple whose first
            element is the logits) and ``label_ids``.

    Returns:
        Dict with the keys ``subtask_1/f1_macro``, ``subtask_2/f1_macro`` and
        ``subtask_3/f1_macro``.
    """
    raw_logits = eval_pred.predictions
    gold = eval_pred.label_ids

    if isinstance(raw_logits, tuple):
        raw_logits = raw_logits[0]

    hard_preds = (1 / (1 + np.exp(-raw_logits)) >= 0.5).astype(int)
    gold = gold.astype(int)

    # Column layout: [subtask 1 | NUM_TYPES columns | remaining columns].
    type_cols = slice(1, 1 + NUM_TYPES)
    manifestation_cols = slice(1 + NUM_TYPES, None)

    binary_pred = hard_preds[:, 0]
    # Shape (N, 1) so the gate broadcasts across the multi-label blocks.
    gate = binary_pred[:, None]

    per_task = (
        ("subtask_1/f1_macro", gold[:, 0], binary_pred),
        ("subtask_2/f1_macro", gold[:, type_cols], hard_preds[:, type_cols] * gate),
        ("subtask_3/f1_macro", gold[:, manifestation_cols], hard_preds[:, manifestation_cols] * gate),
    )
    return {
        metric_name: f1_score(y_true, y_pred, average="macro", zero_division=0)
        for metric_name, y_true, y_pred in per_task
    }

from transformers import EarlyStoppingCallback

# Training configuration. Checkpoints are written and evaluated once per epoch so
# the best one (by subtask-1 macro-F1) can be restored at the end of training.
training_args = TrainingArguments(
    output_dir=f"./results/{trial_id}",
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=15,              # upper bound; early stopping may end the run sooner
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    # Evaluation / checkpointing
    eval_strategy="epoch",
    save_strategy="epoch",            # a checkpoint per epoch is needed to reload the best
    save_total_limit=2,               # cap the number of checkpoints kept on disk
    load_best_model_at_end=True,
    metric_for_best_model="eval_subtask_1/f1_macro",
    # Logging
    logging_steps=50,
)

In [12]:
# Assemble the Trainer: model, data, metrics, dynamic padding and early stopping
# (training stops once the tracked metric fails to improve for 3 evaluations).
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorWithPadding(tokenizer),
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Fine-tune.
trainer.train()

# Score the validation split and report the macro-F1 of each subtask.
eval_results = trainer.evaluate()
print(
    "Validation Results:",
    *(
        f"\nsubtask_{k} f1_macro: {eval_results[f'eval_subtask_{k}/f1_macro']:.4f}"
        for k in (1, 2, 3)
    ),
)

Epoch,Training Loss,Validation Loss,Subtask 1/f1 Macro,Subtask 2/f1 Macro,Subtask 3/f1 Macro
1,0.8015,0.593499,0.816061,0.581012,0.585893
2,0.5085,0.556582,0.842954,0.610778,0.614589
3,0.3629,0.657979,0.836043,0.628343,0.6238
4,0.2712,0.677746,0.848204,0.64596,0.631834
5,0.2248,0.717257,0.84438,0.682468,0.646173
6,0.183,0.745745,0.85185,0.659754,0.644457
7,0.162,0.784631,0.849525,0.672327,0.66784
8,0.1322,0.77189,0.852097,0.681239,0.665268
9,0.1088,0.827781,0.858341,0.691743,0.672751
10,0.0948,0.829893,0.859869,0.695349,0.673462


Validation Results: 
subtask_1 f1_macro: 0.8599 
subtask_2 f1_macro: 0.6953 
subtask_3 f1_macro: 0.6735


In [13]:
# MARBERTv2 RESULTS
# NOTE(review): this cell repeats the previous training cell and passes the same
# `model` object, so running both in sequence would continue training that model
# rather than start from the pretrained checkpoint — confirm that is intended.
# NOTE(review): the visible config selects model_names[-1] ("UBC-NLP/ARBERTv2");
# the MARBERTv2 header presumably refers to a different configuration — verify.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorWithPadding(tokenizer),
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Stop once the tracked metric has not improved for 3 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Fine-tune.
trainer.train()

# Score the validation split and report the macro-F1 of each subtask.
eval_results = trainer.evaluate()
print(
    "Validation Results:",
    *(
        "\nsubtask_%d f1_macro: %.4f" % (k, eval_results["eval_subtask_%d/f1_macro" % k])
        for k in (1, 2, 3)
    ),
)

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

# Log Metrics

In [None]:
# Log the experiment results - each subtask separately.
# NOTE(review): `log` is not defined in the visible notebook (the `from logs import log`
# import in the imports cell is commented out) — make sure it is in scope before
# running this cell.
import json

# Metadata describing this run.
# NOTE(review): "approach" says "MTL_no_gate" while the metric and prediction code
# gate subtasks 2/3 on the subtask-1 prediction — confirm the label is still accurate.
experiment_metadata = {
    "approach": "MTL_no_gate",
    f"model_{lang}": model_name,
    "learning_rate": training_args.learning_rate,
    "num_train_epochs": training_args.num_train_epochs,
    "per_device_train_batch_size": training_args.per_device_train_batch_size,
    "per_device_eval_batch_size": training_args.per_device_eval_batch_size,
    "num_types": NUM_TYPES,
    "num_manifestations": NUM_MANIFESTATIONS,
    "datasets_merge": datasets_merge,
    "posweight": "True"
}

# Only log eval_f1_macro for each subtask.
subtask_1_results = {
    "eval_f1_macro": eval_results.get("eval_subtask_1/f1_macro"),
}
subtask_2_results = {
    "eval_f1_macro": eval_results.get("eval_subtask_2/f1_macro"),
}
subtask_3_results = {
    "eval_f1_macro": eval_results.get("eval_subtask_3/f1_macro"),
}

# Load the metadata already stored for this trial (if any) so that keys written by
# earlier runs, e.g. model_<lang> for other languages, are preserved.
existing_metadata = {}
try:
    with open("logs.json", "r", encoding="utf-8") as f:
        logs = json.load(f)
    if isinstance(logs, dict):
        logs = [logs]
    for trial in logs:
        if trial.get("trial_id") == trial_id and "metadata" in trial:
            existing_metadata = trial["metadata"].copy()
            break
except (FileNotFoundError, json.JSONDecodeError):
    # No readable previous log: start from empty metadata.
    pass

# Overlay this run's metadata on the stored metadata. Updating from the whole dict
# (instead of a hand-copied key list) keeps every key, including "posweight", in sync.
merged_metadata = dict(existing_metadata)
merged_metadata.update(experiment_metadata)

# Metadata is passed with subtask_1 only; subtask_2 and subtask_3 reuse the same
# trial_id and pass metadata=None to avoid overwriting it.
for subtask_name, subtask_results, subtask_metadata in (
    ("subtask_1", subtask_1_results, merged_metadata),
    ("subtask_2", subtask_2_results, None),
    ("subtask_3", subtask_3_results, None),
):
    log(
        subtask_name=subtask_name,
        language=lang,
        eval_results=subtask_results,
        metadata=subtask_metadata,
        trial_id=trial_id
    )

print(f"\n✓ Experiment results logged to logs.json (trial_id: {trial_id})")
print(f"  - subtask_1: {lang}")
print(f"  - subtask_2: {lang}")
print(f"  - subtask_3: {lang}")


✓ Experiment results logged to logs.json (trial_id: 0000NG3)
  - subtask_1: eng
  - subtask_2: eng
  - subtask_3: eng


# Predict on the dev set

In [14]:
import os

# Load the dev split of every subtask for the configured language.
dev_1 = pd.read_csv(f"./dev_phase/subtask1/dev/{lang}.csv")
dev_2 = pd.read_csv(f"./dev_phase/subtask2/dev/{lang}.csv")
dev_3 = pd.read_csv(f"./dev_phase/subtask3/dev/{lang}.csv")

# Predictions are computed once from the subtask-1 texts and then written next to the
# ids of all three dev files, so the files must list the same ids in the same order.
# Fail loudly instead of saving silently misaligned predictions.
for dev_name, dev_frame in (("subtask2", dev_2), ("subtask3", dev_3)):
    if dev_frame['id'].tolist() != dev_1['id'].tolist():
        raise ValueError(
            f"{dev_name} dev ids do not match subtask1 dev ids; "
            "predictions would be assigned to the wrong rows."
        )

# Create dataset from dev 1 texts.
dev_texts = dev_1['text'].tolist()
# Dummy all-zero labels; the width follows the config (1 + types + manifestations)
# so it always matches the label layout the model slices.
num_labels = 1 + NUM_TYPES + NUM_MANIFESTATIONS
dev_dataset = PolarizationDataset(dev_texts, [[0] * num_labels for _ in dev_texts], tokenizer)

# Predict: sigmoid over the logits, then threshold at 0.5.
predictions = trainer.predict(dev_dataset)
logits = predictions.predictions
if isinstance(logits, tuple):
    logits = logits[0]
probs = 1 / (1 + np.exp(-logits))
preds = (probs >= 0.5).astype(int)

# Extract predictions for Subtask 1.
polarization_preds = preds[:, 0]

# --- LOGICAL GATING START ---
# Mask of shape (N, 1) built from the Subtask 1 predictions.
mask = polarization_preds[:, None]

# If polarization is 0, the Subtask 2/3 predictions become 0 regardless of model output.
types_preds = preds[:, 1:1+NUM_TYPES] * mask
manifestations_preds = preds[:, 1+NUM_TYPES:] * mask
# --- LOGICAL GATING END ---

# Create output DataFrames: the id column plus one column per predicted label.
# Label column names are taken from the dev files themselves.
output_1 = dev_1[['id']].copy()
output_1['polarization'] = polarization_preds

output_2 = dev_2[['id']].copy()
type_cols = [col for col in dev_2.columns if col not in ['id', 'text']]
for i, col in enumerate(type_cols):
    output_2[col] = types_preds[:, i]

output_3 = dev_3[['id']].copy()
manifest_cols = [col for col in dev_3.columns if col not in ['id', 'text']]
for i, col in enumerate(manifest_cols):
    output_3[col] = manifestations_preds[:, i]

# Save each frame to ./results/<trial_id>/subtask_<k>/pred_<lang>.csv
# (makedirs also creates the parent trial directory).
for subtask_dir, subtask_output in (
    ("subtask_1", output_1),
    ("subtask_2", output_2),
    ("subtask_3", output_3),
):
    os.makedirs(f"./results/{trial_id}/{subtask_dir}", exist_ok=True)
    subtask_output.to_csv(f"./results/{trial_id}/{subtask_dir}/pred_{lang}.csv", index=False)

print(f"Predictions saved for all 3 dev sets with Logical Gating applied.")

Predictions saved for all 3 dev sets with Logical Gating applied.


In [18]:
# Print the subtask-3 predictions as comma-separated rows (no header line).
output = output_3
for position in range(len(output)):
    row = output.iloc[position]
    print(",".join(str(row[col]) for col in output.columns))


arb_67be47e5216d7bee41e17484e619f4e6,1,1,0,0,0,0
arb_272322e5b265e177613d685e5619e402,0,1,1,0,0,0
arb_d1ec38dd0ec5d7a4fe28ef8317fc96c1,0,1,0,1,0,0
arb_fad75310b17c124d98ebc514189ec033,1,1,1,1,1,0
arb_95caf70cec5bf00c94c35cf7af2a0ab5,1,1,0,1,0,0
arb_ac108c1ecf5071892c61abd253847b15,0,1,0,1,0,0
arb_adaaa6d482119e65ce337ee224674e70,1,1,0,1,1,0
arb_2794b08cac6cc9394a68c51cfc436243,0,1,0,0,0,0
arb_19dd96c989323c9e950a2c3ab9c285be,0,1,0,0,0,0
arb_f2bd638d9d9fc7a617130ff2b198b562,1,1,0,0,0,1
arb_f992bf7776b854d4f7f8475aebf80f49,1,1,0,1,1,0
arb_0b5ac70e86926f5e84cad94028864a37,0,1,0,0,0,0
arb_8ababf95f952e2425c2df1033192dac0,0,0,0,0,0,0
arb_06cd19aac6cc52e394a22d7d1dd58efc,0,0,0,0,0,0
arb_12eeeb8d2fa2d04be2ed9830d5f36ce9,0,0,0,0,0,0
arb_5bc23bacf9a161cd0f99719c70681a81,1,1,0,1,1,0
arb_9ee7c931ab1ecd655533042d8301f6bb,0,0,0,0,0,0
arb_bb7c40559f3a7ca1ecdd7dd7c136198f,1,1,0,1,0,0
arb_5d394c0cce56675e2fc36a0590b47ed7,0,0,0,0,0,0
arb_0704305e8313650e672563a2d073384f,0,0,0,0,0,0
arb_e56b759d14fd7050