In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    AutoModel
)

from torch.utils.data import Dataset
from tqdm.auto import tqdm

from logs import log

# Config

In [2]:
import wandb

# Start Weights & Biases in "disabled" mode.
# NOTE(review): presumably this keeps the Trainer's W&B integration from logging or prompting — confirm.
wandb.init(mode="disabled")
# Label-space sizes: the model emits 1 polarization logit followed by
# NUM_TYPES "type" logits and NUM_MANIFESTATIONS "manifestation" logits.
NUM_TYPES = 5
NUM_MANIFESTATIONS = 6
# When True, the three subtask training files are merged into one multi-label dataset.
datasets_merge = True
# Use the GPU when one is visible; otherwise fall back to CPU so `.to(device)` does not fail.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Language code used in the data file names (e.g. ./dev_phase/subtask1/dev/<lang>.csv).
lang = "arb"
# Hugging Face model identifiers.
arbert = "UBC-NLP/ARBERTv2"
marbert = "UBC-NLP/MARBERTv2"
aldi = "AMR-KELEG/Sentence-ALDi-50"

In [3]:
# Augmented training files of the three subtasks for `lang`, read in subtask order.
train_1, train_2, train_3 = (
    pd.read_csv(f"./dev_phase/subtask{subtask_no}/train/{lang}_augmented.csv")
    for subtask_no in (1, 2, 3)
)
# Subtask-1 dev split for the same language.
dev_df = pd.read_csv(f"./dev_phase/subtask1/dev/{lang}.csv")

# Compute ALDi scores

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Subtask-1 training texts (not read again inside this cell).
texts = train_1["text"]

# Keep the ALDi tokenizer under its own name: later cells rebind the global
# `tokenizer` to other models' tokenizers, and ALDi scoring must never pick
# those up (their vocabularies do not belong to `model_aldi`).
aldi_tokenizer = AutoTokenizer.from_pretrained(aldi)
tokenizer = aldi_tokenizer  # backward-compatible alias: this cell used to define `tokenizer`
model_aldi = AutoModelForSequenceClassification.from_pretrained(aldi)
model_aldi.to(device)

def compute_aldi_score(sentence):
    """Score one sentence with the ALDi model and clamp the result to [0, 1].

    Args:
        sentence: Text to score.

    Returns:
        The first logit of the model output for `sentence`, limited to the
        closed interval [0, 1] (exactly 0 or 1 when the raw value is outside it).
    """
    inputs = aldi_tokenizer(
        sentence,
        return_tensors="pt",
        # Truncate over-long inputs instead of failing inside the encoder;
        # when the config exposes no limit, the tokenizer's own maximum is used.
        truncation=True,
        max_length=getattr(model_aldi.config, "max_position_embeddings", None),
    ).to(device)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model_aldi(**inputs).logits
    return min(max(0, logits[0][0].item()), 1)

ALDi_scores = []

# Basic Setup for training

In [5]:
class SharedMTLModel(nn.Module):
    """One shared encoder feeding three linear heads (multi-task classifier).

    Output columns, in order: 1 logit from ``head1``, ``num_types`` logits from
    ``head2`` and ``num_manifestations`` logits from ``head3``. Labels passed to
    ``forward`` are sliced with the same layout.
    """

    def __init__(self, model_name, num_types, num_manifestations, pos_weight_2=None, pos_weight_3=None):
        """Load the encoder and create the heads.

        Args:
            model_name: Identifier passed to ``AutoModel.from_pretrained``.
            num_types: Output width of ``head2``.
            num_manifestations: Output width of ``head3``.
            pos_weight_2: Optional ``pos_weight`` tensor for the ``head2`` loss.
            pos_weight_3: Optional ``pos_weight`` tensor for the ``head3`` loss.
        """
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        hidden_size = self.encoder.config.hidden_size
        self.num_types = num_types
        self.num_manifestations = num_manifestations

        self.dropout = nn.Dropout(0.2)

        self.head1 = nn.Linear(hidden_size, 1)
        self.head2 = nn.Linear(hidden_size, num_types)
        self.head3 = nn.Linear(hidden_size, num_manifestations)

        # An empty buffer stands for "no class weighting" for that head.
        for buffer_name, weights in (("pos_weight_2", pos_weight_2), ("pos_weight_3", pos_weight_3)):
            self.register_buffer(buffer_name, torch.tensor([]) if weights is None else weights)

    @staticmethod
    def _bce(logits, targets, pos_weight=None):
        """Mean BCE-with-logits; weighted only when `pos_weight` is a non-empty tensor."""
        if pos_weight is not None and pos_weight.numel() > 0:
            criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight.to(logits.device))
        else:
            criterion = nn.BCEWithLogitsLoss()
        return criterion(logits, targets)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Encode the batch and return ``{"loss": ..., "logits": ...}``.

        ``loss`` is ``None`` when no labels are given; otherwise it is the plain
        average of the three per-head BCE losses.
        """
        encoded = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at position 0 of every sequence, after dropout.
        pooled = self.dropout(encoded.last_hidden_state[:, 0, :])

        logits1, logits2, logits3 = (head(pooled) for head in (self.head1, self.head2, self.head3))
        logits = torch.cat([logits1, logits2, logits3], dim=-1)

        if labels is None:
            return {"loss": None, "logits": logits}

        targets = labels.float()
        boundary = 1 + self.num_types
        loss1 = self._bce(logits1, targets[:, :1])
        loss2 = self._bce(logits2, targets[:, 1:boundary], self.pos_weight_2)
        loss3 = self._bce(logits3, targets[:, boundary:], self.pos_weight_3)

        return {"loss": (loss1 + loss2 + loss3) / 3.0, "logits": logits}


In [6]:
def compute_pos_weights(df, label_cols):
    """Return one negatives-to-positives ratio per label column as a float tensor.

    Args:
        df: Frame holding the label columns.
        label_cols: Column names (or an Index) selecting the labels.

    Returns:
        1-D ``torch.float`` tensor with ``count(label == 0) / (sum(label) + 1e-5)``
        for each selected column; the small constant avoids division by zero.
    """
    label_matrix = df[label_cols].values
    positives = label_matrix.sum(axis=0)
    negatives = np.count_nonzero(label_matrix == 0, axis=0)
    ratio = negatives / (positives + 1e-5)
    return torch.tensor(ratio, dtype=torch.float)

# Per-label weights for subtasks 2 and 3; every column from position 2 onward is
# treated as a label (assumes the first two columns are not labels — TODO confirm).
pos_weight_2, pos_weight_3 = (
    compute_pos_weights(frame, frame.columns[2:]) for frame in (train_2, train_3)
)

In [7]:
def compute_metrics(eval_pred):
    """Macro-F1 per subtask, with subtask-2/3 predictions gated by subtask 1.

    Args:
        eval_pred: Object exposing ``predictions`` (logits, or a tuple whose
            first element is the logits) and ``label_ids``.

    Returns:
        Dict with the keys ``subtask_1/f1_macro``, ``subtask_2/f1_macro`` and
        ``subtask_3/f1_macro``.
    """
    raw_logits = eval_pred.predictions
    if isinstance(raw_logits, tuple):
        raw_logits = raw_logits[0]
    gold = eval_pred.label_ids.astype(int)

    # Sigmoid, then a 0.5 threshold, gives the hard 0/1 decisions.
    hard = ((1 / (1 + np.exp(-raw_logits))) >= 0.5).astype(int)

    def macro_f1(y_true, y_pred):
        return f1_score(y_true, y_pred, average="macro", zero_division=0)

    # Column layout: [subtask 1 | NUM_TYPES columns | remaining columns].
    type_span = slice(1, 1 + NUM_TYPES)
    manifestation_span = slice(1 + NUM_TYPES, None)

    y1_pred = hard[:, 0]
    # (N, 1) gate: rows predicted 0 for subtask 1 get all-zero subtask-2/3 predictions.
    gate = y1_pred[:, None]

    return {
        "subtask_1/f1_macro": macro_f1(gold[:, 0], y1_pred),
        "subtask_2/f1_macro": macro_f1(gold[:, type_span], hard[:, type_span] * gate),
        "subtask_3/f1_macro": macro_f1(gold[:, manifestation_span], hard[:, manifestation_span] * gate),
    }


In [8]:
class PolarizationDataset(torch.utils.data.Dataset):
    """Tokenizes texts on access and pairs each with its float label vector.

    Args:
        texts: Indexable collection of strings.
        labels: Indexable collection of label sequences, aligned with `texts`.
        tokenizer: Callable tokenizer returning a mapping of tensors when called
            with ``return_tensors='pt'``.
        max_length: Truncation length passed to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the unpadded encoding of item `idx` plus a float ``labels`` tensor."""
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text, truncation=True, padding=False, max_length=self.max_length, return_tensors='pt'
        )

        # Drop only the leading batch axis. A bare squeeze() would also collapse a
        # length-1 sequence to a 0-d tensor, which cannot be padded into a batch.
        item = {key: encoding[key].squeeze(0) for key in encoding.keys()}
        item['labels'] = torch.tensor(label, dtype=torch.float)
        return item

In [9]:
# Experiment name; later cells build their ./results/<trial_id> paths from it.
trial_id = "ensemble1"

# MARBERT

In [10]:
from sklearn.model_selection import train_test_split
# MARBERT tokenizer; force_download=True fetches the files again instead of using the cache.
# NOTE: this rebinds the notebook-global `tokenizer` — every later use of that
# name sees this tokenizer until it is rebound again.
tokenizer = AutoTokenizer.from_pretrained(marbert, force_download=True)

# Per-subtask dataset lists; only filled when datasets_merge is False.
train_datasets, val_datasets = [], []

# Label columns are whatever a training frame carries besides 'id' and 'text'.
def get_label_columns(df):
    """Return the column names of `df` other than 'id' and 'text', in frame order."""
    excluded = ('id', 'text')
    return list(filter(lambda name: name not in excluded, df.columns))

# Build the train/validation datasets for the MARBERT run.
if datasets_merge:
    # One row per (id, text) carrying the label columns of all three subtasks.
    merged = train_1.merge(train_2, on=['id', 'text'], how='outer', suffixes=('_1', '_2'))
    merged = merged.merge(train_3, on=['id', 'text'], how='outer', suffixes=('', '_3'))
    merged_label_columns = get_label_columns(merged)

    # An outer join keeps rows that are missing from one of the subtask files and
    # fills their labels with NaN. Such rows would poison the BCE loss, so they
    # are removed, and reported rather than dropped silently.
    rows_before_dropna = len(merged)
    if merged_label_columns:
        merged = merged.dropna(subset=merged_label_columns).reset_index(drop=True)
    if len(merged) != rows_before_dropna:
        print(f"Dropped {rows_before_dropna - len(merged)} merged rows with missing labels.")

    # Split over the frame that is actually indexed below (it is not guaranteed
    # to have the same number of rows as train_1).
    n_samples = len(merged)
else:
    n_samples = len(train_1)

# Fixed-seed 80/20 split of row positions; identical to the previous split
# whenever the merged frame has as many rows as train_1.
indices = np.arange(n_samples)
train_indices, val_indices = train_test_split(
    indices,
    test_size=0.2,
    random_state=42
)

if datasets_merge:
    texts = merged['text'].tolist()
    labels = merged[merged_label_columns].values.tolist()
    texts_train = [texts[i] for i in train_indices]
    texts_val = [texts[i] for i in val_indices]
    labels_train = [labels[i] for i in train_indices]
    labels_val = [labels[i] for i in val_indices]
    train_dataset = PolarizationDataset(texts_train, labels_train, tokenizer)
    val_dataset = PolarizationDataset(texts_val, labels_val, tokenizer)
else:
    # Apply the same positional split to each subtask frame.
    for train in [train_1, train_2, train_3]:
        current_label_columns = get_label_columns(train)
        texts = train['text'].tolist()

        texts_train = [texts[i] for i in train_indices]
        texts_val = [texts[i] for i in val_indices]

        if current_label_columns:
            labels = train[current_label_columns].values.tolist()
            labels_train = [labels[i] for i in train_indices]
            labels_val = [labels[i] for i in val_indices]
        else:
            # No label columns: keep the datasets aligned with empty label lists.
            labels_train = [[] for _ in texts_train]
            labels_val = [[] for _ in texts_val]

        train_datasets.append(PolarizationDataset(texts_train, labels_train, tokenizer))
        val_datasets.append(PolarizationDataset(texts_val, labels_val, tokenizer))

tokenizer_config.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

In [11]:
from transformers import EarlyStoppingCallback

# MARBERT encoder with the three task heads; the pos_weight tensors weight the
# losses of the second and third head.
model_marbert = SharedMTLModel(
    marbert,
    num_types=NUM_TYPES,
    num_manifestations=NUM_MANIFESTATIONS,
    pos_weight_2=pos_weight_2,
    pos_weight_3=pos_weight_3,
)

training_args = TrainingArguments(
    output_dir=f"./results/{trial_id}",
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=15,              # upper bound; the early-stopping callback on the Trainer may end sooner
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    # Evaluation and checkpointing, both once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",            # checkpoints are needed to reload the best model
    load_best_model_at_end=True,
    metric_for_best_model="eval_subtask_1/f1_macro",
    save_total_limit=2,               # keep disk usage bounded
    logging_steps=50,
)

In [12]:
# Fine-tune MARBERT; the callback stops training after 3 evaluations without
# improvement in the configured best-model metric.
trainer_marbert = Trainer(
    model=model_marbert,
    args=training_args,
    data_collator=DataCollatorWithPadding(tokenizer),
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

trainer_marbert.train()

# Score the model held by the trainer on the validation split and report macro-F1 per subtask.
eval_results = trainer_marbert.evaluate()
print(
    "Validation Results:",
    *(
        f"\n{task} f1_macro: {eval_results[f'eval_{task}/f1_macro']:.4f}"
        for task in ("subtask_1", "subtask_2", "subtask_3")
    ),
)

Epoch,Training Loss,Validation Loss,Subtask 1/f1 Macro,Subtask 2/f1 Macro,Subtask 3/f1 Macro
1,0.8899,0.647045,0.849196,0.49952,0.568576
2,0.6218,0.582612,0.849851,0.587067,0.599892
3,0.4815,0.551132,0.867371,0.598503,0.595112
4,0.4048,0.532292,0.876246,0.612037,0.608293
5,0.3474,0.545137,0.879459,0.637661,0.611619
6,0.3117,0.582846,0.866781,0.671259,0.639483
7,0.3012,0.533403,0.879723,0.670343,0.623384
8,0.262,0.532647,0.882641,0.6878,0.652021
9,0.2442,0.551411,0.875964,0.697894,0.655017
10,0.2257,0.538092,0.878888,0.701455,0.64948


Validation Results: 
subtask_1 f1_macro: 0.8866 
subtask_2 f1_macro: 0.7191 
subtask_3 f1_macro: 0.6532


# ARBERT

In [13]:
from sklearn.model_selection import train_test_split
# ARBERT tokenizer; force_download=True fetches the files again instead of using the cache.
# NOTE: this rebinds the notebook-global `tokenizer` — every later use of that
# name sees this tokenizer until it is rebound again.
tokenizer = AutoTokenizer.from_pretrained(arbert, force_download=True)

# Per-subtask dataset lists; only filled when datasets_merge is False.
train_datasets, val_datasets = [], []

# Label columns are whatever a training frame carries besides 'id' and 'text'.
def get_label_columns(df):
    """Return the column names of `df` other than 'id' and 'text', in frame order."""
    excluded = ('id', 'text')
    return list(filter(lambda name: name not in excluded, df.columns))

# Build the train/validation datasets for the ARBERT run.
if datasets_merge:
    # One row per (id, text) carrying the label columns of all three subtasks.
    merged = train_1.merge(train_2, on=['id', 'text'], how='outer', suffixes=('_1', '_2'))
    merged = merged.merge(train_3, on=['id', 'text'], how='outer', suffixes=('', '_3'))
    merged_label_columns = get_label_columns(merged)

    # An outer join keeps rows that are missing from one of the subtask files and
    # fills their labels with NaN. Such rows would poison the BCE loss, so they
    # are removed, and reported rather than dropped silently.
    rows_before_dropna = len(merged)
    if merged_label_columns:
        merged = merged.dropna(subset=merged_label_columns).reset_index(drop=True)
    if len(merged) != rows_before_dropna:
        print(f"Dropped {rows_before_dropna - len(merged)} merged rows with missing labels.")

    # Split over the frame that is actually indexed below (it is not guaranteed
    # to have the same number of rows as train_1).
    n_samples = len(merged)
else:
    n_samples = len(train_1)

# Fixed-seed 80/20 split of row positions; identical to the previous split
# whenever the merged frame has as many rows as train_1.
indices = np.arange(n_samples)
train_indices, val_indices = train_test_split(
    indices,
    test_size=0.2,
    random_state=42
)

if datasets_merge:
    texts = merged['text'].tolist()
    labels = merged[merged_label_columns].values.tolist()
    texts_train = [texts[i] for i in train_indices]
    texts_val = [texts[i] for i in val_indices]
    labels_train = [labels[i] for i in train_indices]
    labels_val = [labels[i] for i in val_indices]
    train_dataset = PolarizationDataset(texts_train, labels_train, tokenizer)
    val_dataset = PolarizationDataset(texts_val, labels_val, tokenizer)
else:
    # Apply the same positional split to each subtask frame.
    for train in [train_1, train_2, train_3]:
        current_label_columns = get_label_columns(train)
        texts = train['text'].tolist()

        texts_train = [texts[i] for i in train_indices]
        texts_val = [texts[i] for i in val_indices]

        if current_label_columns:
            labels = train[current_label_columns].values.tolist()
            labels_train = [labels[i] for i in train_indices]
            labels_val = [labels[i] for i in val_indices]
        else:
            # No label columns: keep the datasets aligned with empty label lists.
            labels_train = [[] for _ in texts_train]
            labels_val = [[] for _ in texts_val]

        train_datasets.append(PolarizationDataset(texts_train, labels_train, tokenizer))
        val_datasets.append(PolarizationDataset(texts_val, labels_val, tokenizer))

tokenizer_config.json:   0%|          | 0.00/485 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/485 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/485 [00:00<?, ?B/s]

In [14]:
from transformers import EarlyStoppingCallback

# ARBERT encoder with the three task heads; the pos_weight tensors weight the
# losses of the second and third head.
model_arbert = SharedMTLModel(
    arbert,
    num_types=NUM_TYPES,
    num_manifestations=NUM_MANIFESTATIONS,
    pos_weight_2=pos_weight_2,
    pos_weight_3=pos_weight_3,
)

# NOTE(review): output_dir is the same f-string as in the MARBERT run and
# trial_id has not changed in between, so both runs write their checkpoints into
# one directory — consider a per-model subdirectory.
training_args = TrainingArguments(
    output_dir=f"./results/{trial_id}",
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=15,              # upper bound; the early-stopping callback on the Trainer may end sooner
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    # Evaluation and checkpointing, both once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",            # checkpoints are needed to reload the best model
    load_best_model_at_end=True,
    metric_for_best_model="eval_subtask_1/f1_macro",
    save_total_limit=2,               # keep disk usage bounded
    logging_steps=50,
)

In [15]:
# Fine-tune ARBERT; the callback stops training after 3 evaluations without
# improvement in the configured best-model metric.
trainer_arbert = Trainer(
    model=model_arbert,
    args=training_args,
    data_collator=DataCollatorWithPadding(tokenizer),
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

trainer_arbert.train()

# Score the model held by the trainer on the validation split and report macro-F1 per subtask.
eval_results = trainer_arbert.evaluate()
print(
    "Validation Results:",
    *(
        f"\n{task} f1_macro: {eval_results[f'eval_{task}/f1_macro']:.4f}"
        for task in ("subtask_1", "subtask_2", "subtask_3")
    ),
)

Epoch,Training Loss,Validation Loss,Subtask 1/f1 Macro,Subtask 2/f1 Macro,Subtask 3/f1 Macro
1,0.774,0.589859,0.816127,0.541242,0.553141
2,0.5056,0.553247,0.832712,0.630337,0.60591
3,0.3677,0.535394,0.860885,0.673547,0.622607
4,0.286,0.580112,0.856889,0.672128,0.630402
5,0.2371,0.709485,0.866868,0.665656,0.63052
6,0.1878,0.689184,0.866085,0.696244,0.655375
7,0.1664,0.655294,0.866151,0.692484,0.654477
8,0.1341,0.736938,0.866992,0.713413,0.653647
9,0.11,0.700689,0.871189,0.71275,0.681518
10,0.0979,0.718414,0.864824,0.716863,0.674718


Validation Results: 
subtask_1 f1_macro: 0.8736 
subtask_2 f1_macro: 0.7345 
subtask_3 f1_macro: 0.6840


# Inference

In [16]:
import os

# 1. Load the dev split of every subtask for the configured language.
dev_1, dev_2, dev_3 = (
    pd.read_csv(f"./dev_phase/subtask{subtask_no}/dev/{lang}.csv")
    for subtask_no in (1, 2, 3)
)

# One ALDi score per dev text, in dev_1 row order.
texts = dev_1['text'].tolist()
ALDi_scores = []
for text in tqdm(texts):
    ALDi_scores.append(compute_aldi_score(text))

  0%|          | 0/169 [00:00<?, ?it/s]

In [18]:
dev_texts = dev_1['text'].tolist()
# Dummy all-zero labels for prediction, one per model output column.
dummy_labels = [[0] * (1 + NUM_TYPES + NUM_MANIFESTATIONS)] * len(dev_texts)

# Each model must see token ids from ITS OWN vocabulary. The global `tokenizer`
# is whichever tokenizer was bound last (ARBERT's when the notebook runs top to
# bottom), so building a single dataset from it would feed ARBERT ids to MARBERT.
tokenizer_marbert = AutoTokenizer.from_pretrained(marbert)
tokenizer_arbert = AutoTokenizer.from_pretrained(arbert)
dev_dataset_marbert = PolarizationDataset(dev_texts, dummy_labels, tokenizer_marbert)
dev_dataset_arbert = PolarizationDataset(dev_texts, dummy_labels, tokenizer_arbert)
dev_dataset = dev_dataset_arbert  # backward-compatible name for the ARBERT-tokenized dev set

# 2. Get predictions from BOTH models
print("Predicting with MARBERT...")
out_mar = trainer_marbert.predict(dev_dataset_marbert)
logits_mar = out_mar.predictions[0] if isinstance(out_mar.predictions, tuple) else out_mar.predictions

print("Predicting with ARBERT...")
out_arb = trainer_arbert.predict(dev_dataset_arbert)
logits_arb = out_arb.predictions[0] if isinstance(out_arb.predictions, tuple) else out_arb.predictions

# 3. Convert logits to probabilities (sigmoid); the mix below is done on
# probabilities rather than logits.
probs_mar = 1 / (1 + np.exp(-logits_mar))
probs_arb = 1 / (1 + np.exp(-logits_arb))

# 4. ALDi scores as an (N, 1) column of per-row MARBERT weights.
weights = np.array(ALDi_scores).reshape(-1, 1)
if len(weights) != len(dev_texts):
    raise ValueError(
        f"ALDi_scores has {len(weights)} entries but there are {len(dev_texts)} dev texts; "
        "recompute the ALDi scores for this dev set."
    )

# "Safe mixture": the MARBERT weight never drops below 0.5, so an ALDi score
# of 0 gives a 50/50 mix rather than 100% ARBERT, and a score of 1 gives 100% MARBERT.
weights = np.maximum(weights, 0.5)

# 5. Weighted combination: MARBERT * w + ARBERT * (1 - w)
final_probs = (probs_mar * weights) + (probs_arb * (1 - weights))

# 6. Threshold at 0.5
preds = (final_probs >= 0.5).astype(int)

# --- Post-processing: logical gating and export of the predictions ---

# Subtask 1 is the first prediction column.
polarization_preds = preds[:, 0]

# Logical gating: an (N, 1) copy of the subtask-1 decision zeroes the
# subtask-2/3 predictions of every row predicted 0 for subtask 1.
mask = polarization_preds[:, None]
types_preds = preds[:, 1:1+NUM_TYPES] * mask
manifestations_preds = preds[:, 1+NUM_TYPES:] * mask

# Output frames: the dev ids plus one prediction column per label
# (the 'text' column is not part of the saved files).
output_1 = dev_1[['id']].copy()
output_1['polarization'] = polarization_preds

type_cols = [name for name in dev_2.columns if name not in ['id', 'text']]
output_2 = dev_2[['id']].copy()
for position, label_name in enumerate(type_cols):
    output_2[label_name] = types_preds[:, position]

manifest_cols = [name for name in dev_3.columns if name not in ['id', 'text']]
output_3 = dev_3[['id']].copy()
for position, label_name in enumerate(manifest_cols):
    output_3[label_name] = manifestations_preds[:, position]

# Write one prediction CSV per subtask under ./results/<trial_id>/subtask_<k>/.
os.makedirs(f"./results/{trial_id}", exist_ok=True)
for subtask_no, frame in enumerate((output_1, output_2, output_3), start=1):
    os.makedirs(f"./results/{trial_id}/subtask_{subtask_no}", exist_ok=True)
    frame.to_csv(f"./results/{trial_id}/subtask_{subtask_no}/pred_{lang}.csv", index=False)

print("Predictions saved using Weighted Ensemble (Safe Mix) + Logical Gating.")

Predicting with MARBERT...


Predicting with ARBERT...


Predictions saved using Weighted Ensemble (Safe Mix) + Logical Gating.


## Single-model inference (ARBERT only)

In [20]:
import os
# Separate experiment name so these files do not overwrite the ensemble's output.
trial_id = "noensemble1_arbert_only"
# Load the dev split of every subtask.
dev_1 = pd.read_csv(f"./dev_phase/subtask1/dev/{lang}.csv")
dev_2 = pd.read_csv(f"./dev_phase/subtask2/dev/{lang}.csv")
dev_3 = pd.read_csv(f"./dev_phase/subtask3/dev/{lang}.csv")

# Texts come from dev 1 (assumes all three dev files list the same rows in the
# same order — TODO confirm).
dev_texts = dev_1['text'].tolist()
# Tokenize with ARBERT's own tokenizer rather than whatever the global
# `tokenizer` happens to be bound to when this cell runs.
tokenizer_arbert = AutoTokenizer.from_pretrained(arbert)
# Dummy all-zero labels for prediction, one per model output column.
dummy_labels = [[0] * (1 + NUM_TYPES + NUM_MANIFESTATIONS)] * len(dev_texts)
dev_dataset = PolarizationDataset(dev_texts, dummy_labels, tokenizer_arbert)

# Predict, then sigmoid + 0.5 threshold for hard 0/1 decisions.
predictions = trainer_arbert.predict(dev_dataset)
logits = predictions.predictions
if isinstance(logits, tuple):
    logits = logits[0]
probs = 1 / (1 + np.exp(-logits))
preds = (probs >= 0.5).astype(int)

# Subtask 1 is the first prediction column.
polarization_preds = preds[:, 0]

# Logical gating: an (N, 1) copy of the subtask-1 decision zeroes the
# subtask-2/3 predictions of every row predicted 0 for subtask 1,
# regardless of what the model output for them.
mask = polarization_preds[:, None]
types_preds = preds[:, 1:1+NUM_TYPES] * mask
manifestations_preds = preds[:, 1+NUM_TYPES:] * mask

# Output frames: the dev ids plus one prediction column per label
# (the 'text' column is not part of the saved files).
output_1 = dev_1[['id']].copy()
output_1['polarization'] = polarization_preds

type_cols = [name for name in dev_2.columns if name not in ['id', 'text']]
output_2 = dev_2[['id']].copy()
for position, label_name in enumerate(type_cols):
    output_2[label_name] = types_preds[:, position]

manifest_cols = [name for name in dev_3.columns if name not in ['id', 'text']]
output_3 = dev_3[['id']].copy()
for position, label_name in enumerate(manifest_cols):
    output_3[label_name] = manifestations_preds[:, position]

# Write one prediction CSV per subtask under ./results/<trial_id>/subtask_<k>/.
os.makedirs(f"./results/{trial_id}", exist_ok=True)
for subtask_no, frame in enumerate((output_1, output_2, output_3), start=1):
    os.makedirs(f"./results/{trial_id}/subtask_{subtask_no}", exist_ok=True)
    frame.to_csv(f"./results/{trial_id}/subtask_{subtask_no}/pred_{lang}.csv", index=False)

print("Predictions saved for all 3 dev sets with Logical Gating applied.")

Predictions saved for all 3 dev sets with Logical Gating applied.
