In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from transformers import AutoModel
from logging import log

In [2]:
import wandb

# Disable wandb logging for this script
wandb.init(mode="disabled")

# CONFIG
NUM_TYPES = 5
NUM_MANIFESTATIONS = 6
datasets_merge = True

In [3]:
train_df_1 = pd.read_csv("./dev_phase/subtask1/train/arb.csv")
train_df_2 = pd.read_csv("./dev_phase/subtask2/train/arb.csv")
train_df_3 = pd.read_csv("./dev_phase/subtask3/train/arb.csv")
dev_df = pd.read_csv("./dev_phase/subtask1/dev/arb.csv")

In [4]:
import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

import torch

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from torch.utils.data import Dataset
from tqdm.auto import tqdm

In [5]:
train_1 = pd.read_csv('./dev_phase/subtask1/train/eng.csv')
train_2 = pd.read_csv('./dev_phase/subtask2/train/eng.csv')
train_3 = pd.read_csv('./dev_phase/subtask3/train/eng.csv')

In [6]:
class PolarizationDataset(torch.utils.data.Dataset):
  def __init__(self,texts,labels,tokenizer,max_length =128):
    self.texts=texts
    self.labels=labels
    self.tokenizer= tokenizer
    self.max_length = max_length # Store max_length

  def __len__(self):
    return len(self.texts)

  def __getitem__(self,idx):
    text=self.texts[idx]
    label=self.labels[idx]
    encoding=self.tokenizer(text,truncation=True,padding=False,max_length=self.max_length,return_tensors='pt')

    # Ensure consistent tensor conversion for all items
    item = {key: encoding[key].squeeze() for key in encoding.keys()}
    item['labels'] = torch.tensor(label, dtype=torch.float)
    return item

In [None]:
from sklearn.model_selection import train_test_split
# Load the tokenizer
model_names = ['bert-base-uncased', "UBC-NLP/MARBERTv2"]
model_name = model_names[0]
tokenizer = AutoTokenizer.from_pretrained(model_name)

train_datasets = []
val_datasets = []

# Prepare label columns separately for each task, fallback to the correct columns per train DataFrame
def get_label_columns(df):
    return [col for col in df.columns if col not in ['id', 'text']]

# Split indices once and reuse for all datasets to ensure same split
n_samples = len(train_1)
indices = np.arange(n_samples)
train_indices, val_indices = train_test_split(
    indices,
    test_size=0.2,
    random_state=42
)

if datasets_merge:
    # Merge all datasets on 'id'
    merged = train_1.merge(train_2, on=['id', 'text'], how='outer', suffixes=('_1', '_2'))
    # For the third, avoid duplicate columns of 'text', so drop redundant one, or merge only on id
    merged = merged.merge(train_3, on=['id', 'text'], how='outer', suffixes=('', '_3'))
    # Get label columns: all columns excluding 'id' and 'text'
    merged_label_columns = get_label_columns(merged)
    texts = merged['text'].tolist()
    labels = merged[merged_label_columns].values.tolist()
    texts_train = [texts[i] for i in train_indices]
    texts_val = [texts[i] for i in val_indices]
    labels_train = [labels[i] for i in train_indices]
    labels_val = [labels[i] for i in val_indices]
    train_dataset = PolarizationDataset(texts_train, labels_train, tokenizer)
    val_dataset = PolarizationDataset(texts_val, labels_val, tokenizer)
else:
    # Apply the same split to all three datasets
    for train in [train_1, train_2, train_3]:
        current_label_columns = get_label_columns(train)
        texts = train['text'].tolist()
        
        # Use the same indices for all datasets
        texts_train = [texts[i] for i in train_indices]
        texts_val = [texts[i] for i in val_indices]
        
        if current_label_columns:
            labels = train[current_label_columns].values.tolist()
            labels_train = [labels[i] for i in train_indices]
            labels_val = [labels[i] for i in val_indices]
        else:
            labels_train = [[] for _ in texts_train]
            labels_val = [[] for _ in texts_val]
        
        train_datasets.append(PolarizationDataset(texts_train, labels_train, tokenizer))
        val_datasets.append(PolarizationDataset(texts_val, labels_val, tokenizer))

In [None]:
class GatedMTLModel(nn.Module):
    def __init__(self, model_name, num_types, num_manifestations):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        hidden_size = self.encoder.config.hidden_size

        self.num_types = num_types
        self.num_manifestations = num_manifestations

        self.head1 = nn.Linear(hidden_size, 1)
        self.head2 = nn.Linear(hidden_size, num_types)
        self.head3 = nn.Linear(hidden_size, num_manifestations)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        H = outputs.last_hidden_state[:, 0, :]

        logits1 = self.head1(H)
        gate = torch.sigmoid(logits1).detach()
        H_gated = H * gate

        logits2 = self.head2(H_gated)
        logits3 = self.head3(H_gated)

        logits = torch.cat([logits1, logits2, logits3], dim=-1)

        loss = None
        if labels is not None:
            labels = labels.float()
            loss_fct = nn.BCEWithLogitsLoss()

            y1_true = labels[:, :1]
            y2_true = labels[:, 1:1 + self.num_types]
            y3_true = labels[:, 1 + self.num_types:]

            loss1 = loss_fct(logits1, y1_true)
            loss2 = loss_fct(logits2, y2_true)
            loss3 = loss_fct(logits3, y3_true)

            loss = (loss1 + loss2 + loss3) / 3.0

        return {
            "loss": loss,
            "logits": logits,
            "polarization_logits": logits1,
            "types_logits": logits2,
            "manifestations_logits": logits3,
        }

In [16]:
model = GatedMTLModel(model_name, NUM_TYPES, NUM_MANIFESTATIONS)

def compute_metrics(eval_pred):
    logits = eval_pred.predictions
    labels = eval_pred.label_ids

    if isinstance(logits, tuple):
        logits = logits[0]

    probs = 1 / (1 + np.exp(-logits))
    preds = (probs >= 0.5).astype(int)
    labels = labels.astype(int)

    y1_true = labels[:, 0]
    y1_pred = preds[:, 0]

    y2_true = labels[:, 1:1+NUM_TYPES]
    y2_pred = preds[:, 1:1+NUM_TYPES]

    y3_true = labels[:, 1+NUM_TYPES:]
    y3_pred = preds[:, 1+NUM_TYPES:]

    return {
        "subtask_1/accuracy": accuracy_score(y1_true, y1_pred),
        "subtask_1/f1_binary": f1_score(y1_true, y1_pred, average="binary", zero_division=0),
        "subtask_1/f1_macro": f1_score(y1_true, y1_pred, average="macro", zero_division=0),
        "subtask_1/f1_micro": f1_score(y1_true, y1_pred, average="micro", zero_division=0),

        "subtask_2/f1_macro": f1_score(y2_true, y2_pred, average="macro", zero_division=0),
        "subtask_2/f1_micro": f1_score(y2_true, y2_pred, average="micro", zero_division=0),

        "subtask_3/f1_macro": f1_score(y3_true, y3_pred, average="macro", zero_division=0),
        "subtask_3/f1_micro": f1_score(y3_true, y3_pred, average="micro", zero_division=0),
    }

# Define training arguments
training_args = TrainingArguments(
        output_dir=f"./",
        num_train_epochs=3,
        learning_rate=2e-5,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=8,
        eval_strategy="epoch",
        save_strategy="no",
        logging_steps=100,
        disable_tqdm=False
    )

In [None]:
# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the instantiated ðŸ¤— Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
    data_collator=DataCollatorWithPadding(tokenizer) # Data collator for dynamic padding
)

# Train the model
trainer.train()

# Evaluate the model on the validation set
eval_results = trainer.evaluate()
print(f"Validation Results:\nAccuracy: {eval_results['eval_accuracy']:.4f}\nPrecision: {eval_results['eval_precision']:.4f}\nRecall: {eval_results['eval_recall']:.4f}\nF1 (binary): {eval_results['eval_f1_binary']:.4f}\nF1 (macro): {eval_results['eval_f1_macro']:.4f}\nF1 (micro): {eval_results['eval_f1_micro']:.4f}")



Epoch,Training Loss,Validation Loss,Subtask 1/accuracy,Subtask 1/f1 Binary,Subtask 1/f1 Macro,Subtask 1/f1 Micro,Subtask 2/f1 Macro,Subtask 2/f1 Micro,Subtask 3/f1 Macro,Subtask 3/f1 Micro
1,No log,0.405967,0.364341,0.534091,0.267045,0.364341,0.006639,0.023055,0.0,0.0


