In [8]:
import wandb
# Start Weights & Biases in "disabled" mode before anything else is imported or
# built (presumably so the Trainer's W&B integration neither prompts for a
# login nor uploads this run -- confirm against the training arguments).
wandb.init(mode="disabled")

from transformers import AutoTokenizer, DataCollatorWithPadding, Trainer
from src.config import NUM_TYPES, NUM_MANIFESTATIONS, MODEL_NAMES
from src.data import load_data, prepare_datasets
from src.model import SharedMTLModel
from src.metrics import compute_metrics, make_compute_metrics_fn
from src.training import compute_pos_weights, get_training_args, get_early_stopping_callback
from src.predict import predict_dev_set
from src.logging_utils import log_experiment_results
from src.thresholds import find_optimal_thresholds

# Run configuration shared by every later cell.
lang = "eng"  # language code handed to load_data, log_experiment_results and predict_dev_set
trial_id = "MTL_Thresholds_1"  # run identifier used for training args, logging and predictions
# NOTE(review): this picks whichever entry happens to be last in MODEL_NAMES,
# so the checkpoint changes silently if that sequence is reordered or extended.
# The print below records which checkpoint was actually selected.
model_name = MODEL_NAMES[-1]
print(f"model name is: {model_name}")

model name is: cardiffnlp/twitter-roberta-base-hate


In [2]:
# Load the three training frames for the selected language (presumably one per
# subtask -- confirm in src.data) and turn them into the train/validation
# datasets consumed by the Trainer.
#
# The tokenizer used to be fetched with force_download=True, which bypasses the
# local Hugging Face cache and re-downloads the files on every run: slow, and it
# makes "Restart & Run All" fail without network access. The cache is now used
# by default; flip the flag below only to repair a corrupted local cache.
FORCE_TOKENIZER_DOWNLOAD = False

train_1, train_2, train_3 = load_data(lang)
tokenizer = AutoTokenizer.from_pretrained(model_name, force_download=FORCE_TOKENIZER_DOWNLOAD)
train_dataset, val_dataset = prepare_datasets(train_1, train_2, train_3, tokenizer)

config.json:   0%|          | 0.00/700 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [3]:
# Every column from the third one onward is handed to compute_pos_weights
# (presumably these are the label columns and the first two are id/text --
# TODO confirm against the data files).
label_columns_2 = train_2.columns[2:]
label_columns_3 = train_3.columns[2:]

pos_weight_2 = compute_pos_weights(train_2, label_columns_2)
pos_weight_3 = compute_pos_weights(train_3, label_columns_3)

# Build the shared multi-task model on top of the selected checkpoint, passing
# the head sizes and the two sets of weights computed above.
model = SharedMTLModel(
    model_name,
    NUM_TYPES,
    NUM_MANIFESTATIONS,
    pos_weight_2,
    pos_weight_3,
)

Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-hate and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Assemble the pieces the Trainer needs, then fine-tune and score the model on
# the validation split using the metric function's default thresholds.
training_args = get_training_args(trial_id)
padding_collator = DataCollatorWithPadding(tokenizer)
early_stopping = get_early_stopping_callback()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=padding_collator,
    callbacks=[early_stopping],
)
trainer.train()

# Baseline numbers, kept in eval_results for comparison with the
# threshold-optimised scores computed in the next cell.
eval_results = trainer.evaluate()
baseline_report = [
    "Validation Results (before threshold optimization):",
    f"\nsubtask_1 f1_macro: {eval_results['eval_subtask_1/f1_macro']:.4f}",
    f"\nsubtask_2 f1_macro: {eval_results['eval_subtask_2/f1_macro']:.4f}",
    f"\nsubtask_3 f1_macro: {eval_results['eval_subtask_3/f1_macro']:.4f}",
]
print(*baseline_report)

Epoch,Training Loss,Validation Loss,Subtask 1/f1 Macro,Subtask 2/f1 Macro,Subtask 3/f1 Macro
1,0.88,0.702832,0.78417,0.287911,0.483007
2,0.6346,0.673045,0.806074,0.330054,0.483277
3,0.5182,0.691198,0.809569,0.362043,0.486907
4,0.4143,0.743279,0.821937,0.4151,0.513092
5,0.3578,0.786966,0.827781,0.449159,0.515605
6,0.329,0.85247,0.820208,0.417301,0.5149
7,0.291,0.924248,0.825195,0.48455,0.526183
8,0.2634,0.904617,0.832345,0.45974,0.519913
9,0.2423,0.949772,0.822327,0.491964,0.523328
10,0.2462,0.951907,0.824955,0.480425,0.521746


Validation Results (before threshold optimization): 
subtask_1 f1_macro: 0.8323 
subtask_2 f1_macro: 0.4597 
subtask_3 f1_macro: 0.5199


In [5]:
# Run inference on the validation split to obtain raw model outputs and labels.
val_predictions = trainer.predict(val_dataset)
raw_outputs = val_predictions.predictions
# If the model returned a tuple of outputs, only its first element is used as
# the logits for the threshold search.
val_logits = raw_outputs[0] if isinstance(raw_outputs, tuple) else raw_outputs
val_labels = val_predictions.label_ids

# Search decision thresholds from the validation logits and labels.
thresholds = find_optimal_thresholds(val_logits, val_labels, NUM_TYPES, NUM_MANIFESTATIONS)

print("Optimal thresholds found:")
print(f"  Subtask 1: {thresholds['subtask_1']}")
print(f"  Subtask 2: {thresholds['subtask_2']}")
print(f"  Subtask 3: {thresholds['subtask_3']}")

# NOTE(review): these scores are computed on the same validation predictions
# that were used to pick the thresholds, so they are likely optimistic compared
# with what held-out data would show.
raw_metrics_optimized = compute_metrics(val_predictions, thresholds=thresholds)
# Prefix the keys with "eval_" so they line up with the Trainer.evaluate() keys.
eval_results_optimized = {
    f"eval_{metric}": score for metric, score in raw_metrics_optimized.items()
}
optimized_report = [
    "\nValidation Results (after threshold optimization):",
    f"\nsubtask_1 f1_macro: {eval_results_optimized['eval_subtask_1/f1_macro']:.4f}",
    f"\nsubtask_2 f1_macro: {eval_results_optimized['eval_subtask_2/f1_macro']:.4f}",
    f"\nsubtask_3 f1_macro: {eval_results_optimized['eval_subtask_3/f1_macro']:.4f}",
]
print(*optimized_report)

Optimal thresholds found:
  Subtask 1: [0.20000000000000004]
  Subtask 2: [0.1, 0.7000000000000002, 0.7500000000000002, 0.8000000000000002, 0.9000000000000002]
  Subtask 3: [0.8000000000000002, 0.8000000000000002, 0.7500000000000002, 0.20000000000000004, 0.7500000000000002, 0.6000000000000002]

Validation Results (after threshold optimization): 
subtask_1 f1_macro: 0.8329 
subtask_2 f1_macro: 0.5174 
subtask_3 f1_macro: 0.5499


In [6]:
# Record this run: the threshold-optimised validation metrics together with the
# run identifiers, training arguments, head sizes and the chosen thresholds.
# NOTE(review): the saved output of this cell reports a trial_id that differs
# from the one set in the configuration cell, so it was recorded by an earlier
# execution -- restart the kernel and re-run before trusting the log entry.
log_experiment_results(
    eval_results_optimized,
    trial_id,
    lang,
    model_name,
    training_args,
    NUM_TYPES,
    NUM_MANIFESTATIONS,
    thresholds=thresholds
)


✓ Experiment results logged to logs.json (trial_id: MTL_Thresholds)
  - subtask_1: eng
  - subtask_2: eng
  - subtask_3: eng


In [7]:
# Produce dev-set predictions with the tuned thresholds; the helper returns one
# result per dev set, unpacked here for later inspection.
dev_outputs = predict_dev_set(
    trainer,
    tokenizer,
    lang,
    trial_id,
    thresholds=thresholds,
)
output_1, output_2, output_3 = dev_outputs
print("Predictions saved for all 3 dev sets with Logical Gating and optimized thresholds applied.")

Predictions saved for all 3 dev sets with Logical Gating and optimized thresholds applied.
