In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [2]:
# Load the three subtask training splits for the `arb` language file, plus the
# subtask-1 dev split. Paths are relative to the notebook's working directory.
train_df_1, train_df_2, train_df_3 = map(
    pd.read_csv,
    (
        "./dev_phase/subtask1/train/arb.csv",
        "./dev_phase/subtask2/train/arb.csv",
        "./dev_phase/subtask3/train/arb.csv",
    ),
)
dev_df = pd.read_csv("./dev_phase/subtask1/dev/arb.csv")

In [3]:
import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

import torch

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from torch.utils.data import Dataset
from tqdm.auto import tqdm

In [6]:
# Load the `eng` training split of each subtask; these three frames feed the
# tokenization / split cell further down.
train_1, train_2, train_3 = [
    pd.read_csv(csv_path)
    for csv_path in (
        './dev_phase/subtask1/train/eng.csv',
        './dev_phase/subtask2/train/eng.csv',
        './dev_phase/subtask3/train/eng.csv',
    )
]

In [7]:
class PolarizationDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per item and attaches its label(s).

    Args:
        texts: Indexable collection of raw strings (indexed by integer position).
        labels: Indexable collection aligned with ``texts``; each entry is handed
            to ``torch.tensor(..., dtype=torch.long)``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding=False, max_length=...,
            return_tensors='pt')`` and returning a mapping of tensors whose
            leading axis is the (size-1) batch axis.
        max_length: Truncation limit forwarded to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and return its tensors plus a ``labels`` entry."""
        text = self.texts[idx]
        label = self.labels[idx]
        # padding=False: items keep their own length, so batching has to pad
        # downstream (e.g. with a padding collator).
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors='pt',
        )

        # Drop ONLY the leading batch axis. A bare squeeze() would also remove a
        # length-1 sequence axis and yield a 0-d tensor that cannot be padded.
        item = {key: encoding[key].squeeze(0) for key in encoding.keys()}
        # NOTE(review): labels are stored as int64; a BCE-style multi-label loss
        # would need them cast to float by the training loop — confirm there.
        item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

In [27]:
from sklearn.model_selection import train_test_split
# Candidate encoder checkpoints; the first entry is the one currently in use.
model_names = ['bert-base-uncased', "UBC-NLP/MARBERTv2"]
model_name = model_names[0]

# Tokenizer matching the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# One PolarizationDataset per subtask is appended to each list by the split loop below.
train_datasets, val_datasets = [], []

# Each subtask frame carries its own label columns, so they are discovered per frame.
def get_label_columns(df):
    """Return the column names of ``df`` other than 'id' and 'text', in frame order."""
    label_columns = []
    for column in df.columns:
        if column in ('id', 'text'):
            continue
        label_columns.append(column)
    return label_columns

# Split row positions once (sized from train_1) and reuse them for every subtask
# so the three train/validation partitions select the same row positions.
n_samples = len(train_1)
indices = np.arange(n_samples)
train_indices, val_indices = train_test_split(
    indices,
    test_size=0.2,
    random_state=42
)

# Apply the same split to all three datasets
for train in [train_1, train_2, train_3]:
    # The shared positions are only meaningful when every frame has exactly as
    # many rows as train_1: a shorter frame would fail with an opaque IndexError
    # and a longer one would silently lose its tail rows.
    # NOTE(review): this checks row COUNT only; it presumes the three files list
    # the same examples in the same order — confirm against the data.
    if len(train) != n_samples:
        raise ValueError(
            f"Cannot reuse the shared split: expected {n_samples} rows "
            f"(len(train_1)) but got a frame with {len(train)} rows."
        )

    current_label_columns = get_label_columns(train)
    texts = train['text'].tolist()

    # Use the same indices for all datasets
    texts_train = [texts[i] for i in train_indices]
    texts_val = [texts[i] for i in val_indices]

    if current_label_columns:
        # One list of label values per row, in label-column order.
        labels = train[current_label_columns].values.tolist()
        labels_train = [labels[i] for i in train_indices]
        labels_val = [labels[i] for i in val_indices]
    else:
        # Frame without label columns: keep one empty label list per text.
        labels_train = [[] for _ in texts_train]
        labels_val = [[] for _ in texts_val]

    train_datasets.append(PolarizationDataset(texts_train, labels_train, tokenizer))
    val_datasets.append(PolarizationDataset(texts_val, labels_val, tokenizer))

In [40]:
# Sanity check: combined label width of the three subtasks, measured on the
# first training example of each dataset.
sum(len(train_datasets[task_idx].labels[0]) for task_idx in range(3))

12

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel

class GatedMTLModel(nn.Module):
    """Shared-encoder multi-task model whose binary head soft-gates the other heads.

    The first-token hidden state of the encoder feeds a 1-unit head (subtask 1).
    Its sigmoid output scales that same hidden state before the subtask-2 and
    subtask-3 heads are applied.

    Args:
        model_name: Checkpoint identifier passed to ``AutoModel.from_pretrained``.
        num_types: Output width of the subtask-2 head.
        num_manifestations: Output width of the subtask-3 head.
    """

    def __init__(self, model_name, num_types, num_manifestations):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        hidden_size = self.encoder.config.hidden_size

        # Subtask 1 head (binary, polarization)
        self.head1 = nn.Linear(hidden_size, 1)

        # Heads for Subtask 2 (type) and Subtask 3 (manifestation)
        self.head2 = nn.Linear(hidden_size, num_types)
        self.head3 = nn.Linear(hidden_size, num_manifestations)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Run the encoder and the three heads.

        Returns:
            dict with post-sigmoid values (not raw logits):
            ``"polarization"`` (head1 output with the last axis squeezed),
            ``"types"`` (head2 output) and ``"manifestations"`` (head3 output).
        """
        encoder_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask}
        # Forward token_type_ids only when supplied: some encoders' forward()
        # has no such parameter and would raise TypeError on the keyword alone.
        if token_type_ids is not None:
            encoder_kwargs["token_type_ids"] = token_type_ids
        outputs = self.encoder(**encoder_kwargs)

        # First-token ([CLS]-position) representation of each sequence
        H = outputs.last_hidden_state[:, 0, :]

        # Task 1 binary polarization head; sigmoid keeps the gate in (0, 1)
        P_bin = self.sigmoid(self.head1(H))

        # Soft gating: the trailing size-1 axis of P_bin broadcasts over the hidden axis
        H_gated = H * P_bin

        # Downstream heads (independent sigmoid per output unit)
        P_types = self.sigmoid(self.head2(H_gated))
        P_manifestations = self.sigmoid(self.head3(H_gated))

        return {
            "polarization": P_bin.squeeze(-1),
            "types": P_types,
            "manifestations": P_manifestations
        }

# Size the subtask-2 and subtask-3 heads from the label width of the first
# training example in the matching dataset, then build the gated model.
num_types, num_manifestations = (
    len(train_datasets[task_idx].labels[0]) for task_idx in (1, 2)
)
model = GatedMTLModel(
    model_name,
    num_types=num_types,
    num_manifestations=num_manifestations,
)