### Testing 5 different transformer models with 3 seeds each
### Task: Argument Stance Classification (Support/Oppose)

In [1]:
# Libraries
import os
import numpy as np
import pandas as pd
from datasets import Dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay
import torch
from tqdm.auto import tqdm
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Google Colab or not
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    path = "/content/drive/MyDrive/multimodal-argmining"
    os.chdir(path)
    print(f"Loading data from Google Drive: {path}")
else:
    path = "C:/Users/diego/Desktop/Master Neuro/M2/Intership_NLP/multimodal-argmining"
    os.chdir(path)
    print(f"Loading data locally from: {path}")


# GPU
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU ready:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("No GPU detecting, using CPU.")

Mounted at /content/drive
Loading data from Google Drive: /content/drive/MyDrive/multimodal-argmining
GPU ready: Tesla T4


In [2]:
# WE define our Models to tests and the seeds
MODELS = [
    "roberta-base",
    "microsoft/deberta-v3-base",
    "vinai/bertweet-base",
    "cardiffnlp/twitter-roberta-base",
    "bert-base-uncased",
    "microsoft/deberta-base"
]

#Seeds
SEEDS = [42, 123, 456]


print("\nModels to test:")
for i, model in enumerate(MODELS, 1):
    print(f"  {i}. {model}")



Models to test:
  1. roberta-base
  2. microsoft/deberta-v3-base
  3. vinai/bertweet-base
  4. cardiffnlp/twitter-roberta-base
  5. bert-base-uncased
  6. microsoft/deberta-base


In [3]:
#Load Dataset
train_path = f"{path}/data/train.csv"
dev_path   = f"{path}/data/dev.csv"
test_path  = f"{path}/data/test.csv"

df_train = pd.read_csv(train_path)
df_dev   = pd.read_csv(dev_path)
df_test  = pd.read_csv(test_path)


# Map labels to ints
label2id = {"oppose": 0, "support": 1}
for df in [df_train, df_dev, df_test]:
    df["label"] = df["stance"].map(label2id)

print(df_train["label"].value_counts())
df_train.head()



dataset_train = Dataset.from_pandas(df_train[["tweet_text", "label"]])
dataset_dev   = Dataset.from_pandas(df_dev[["tweet_text", "label"]])
dataset_test  = Dataset.from_pandas(df_test[["tweet_text", "label"]])


label
0    1095
1     719
Name: count, dtype: int64


In [4]:

# Tokenization Function for each model
def tokenize_dataset(dataset, tokenizer, max_length=105):

    def tokenize_batch(batch):
        return tokenizer(batch["tweet_text"],padding="max_length",truncation=True,max_length=max_length)

    tokenized = dataset.map(tokenize_batch, batched=True)

    tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    return tokenized


In [5]:
# We define metrics Function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)

    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="macro")
    precision = precision_score(labels, preds, average="macro")
    recall = recall_score(labels, preds, average="macro")

    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

In [6]:
# Training Function
def train_and_evaluate(model_name, seed, train_dataset, dev_dataset,test_dataset):

    # Set seed!
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    print(f"Training: {model_name} | Seed: {seed}")
    print(f"{'='*60}")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print(f"Tokenizer loaded for {model_name}...")

    # Tokenize datasets with model tokenizer
    train_dataset_tok = tokenize_dataset(train_dataset, tokenizer, 105)
    dev_dataset_tok = tokenize_dataset(dev_dataset, tokenizer, 105)
    test_dataset_tok = tokenize_dataset(test_dataset, tokenizer, 105)
    print(f"Tokenization complete")


    # Load model
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    print(f"Model Loaded: {model_name}.")


    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"./temp_models/{model_name.replace('/', '_')}_seed{seed}",
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        save_total_limit=1,
        report_to="none",
        logging_steps=10
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset_tok,
        eval_dataset=dev_dataset_tok,
        processing_class=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )

    # Train
    print(f"\n Starting training...")
    trainer.train()

    # Predictions on TEST set
    print(f"\n Getting predictions on TEST set...")
    predictions_output = trainer.predict(test_dataset_tok)
    y_pred = np.argmax(predictions_output.predictions, axis=1)
    y_true = predictions_output.label_ids



    # Evaluate
    print(f"\n Evaluating on test set...")
    eval_results = trainer.evaluate(test_dataset_tok)

    # Extract metrics
    results = {
        "model": model_name,
        "seed": seed,
        "accuracy": eval_results["eval_accuracy"],
        "precision": eval_results["eval_precision"],
        "recall": eval_results["eval_recall"],
        "f1": eval_results["eval_f1"],
        "loss": eval_results["eval_loss"],
        "y_true": y_true,
        "y_pred": y_pred
    }

    print(f"\n Results for {model_name} (seed {seed}):")
    print(f"   Accuracy:  {results['accuracy']:.4f}")
    print(f"   Precision: {results['precision']:.4f}")
    print(f"   Recall:    {results['recall']:.4f}")
    print(f"   F1-Score:  {results['f1']:.4f}")

    # Clean up to save memory
    del model
    del trainer
    del train_dataset_tok
    del dev_dataset_tok
    del test_dataset_tok

    torch.cuda.empty_cache() if torch.cuda.is_available() else None

    return results

In [7]:
# We run the Experiments
print("STARTING EXPERIMENT")
print("="*60)

all_results = []
total_experiments = len(MODELS) * len(SEEDS)
current_experiment = 0

for model_name in MODELS:
    print(f"# MODEL: {model_name}")

    for seed in SEEDS:
        current_experiment += 1
        print(f"\n[Experiment {current_experiment}/{total_experiments}]")
        results = train_and_evaluate(
            model_name=model_name,
            seed=seed,
            train_dataset=dataset_train,
            dev_dataset=dataset_dev,
            test_dataset=dataset_test)


        all_results.append(results)


print("EXPERIMENT COMPLETED")

STARTING EXPERIMENT
# MODEL: roberta-base

[Experiment 1/18]
Training: roberta-base | Seed: 42


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Tokenizer loaded for roberta-base...


Map:   0%|          | 0/1814 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenization complete


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: roberta-base.

 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5131,0.241209,0.895,0.881727,0.905952,0.867814
2,0.2963,0.193767,0.92,0.911495,0.924786,0.90206
3,0.1847,0.252822,0.92,0.913709,0.913709,0.913709
4,0.1254,0.289492,0.925,0.916738,0.932445,0.905997
5,0.0827,0.325184,0.915,0.90858,0.907443,0.909772



 Getting predictions on TEST set...



 Evaluating on test set...



 Results for roberta-base (seed 42):
   Accuracy:  0.8467
   Precision: 0.8411
   Recall:    0.8557
   F1-Score:  0.8436

[Experiment 2/18]
Training: roberta-base | Seed: 123
Tokenizer loaded for roberta-base...


Map:   0%|          | 0/1814 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenization complete


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: roberta-base.

 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5146,0.296571,0.89,0.88135,0.88135,0.88135
2,0.3075,0.23893,0.9,0.89011,0.898889,0.8834
3,0.1901,0.286619,0.89,0.882667,0.878999,0.887175
4,0.125,0.324283,0.9,0.89011,0.898889,0.8834



 Getting predictions on TEST set...



 Evaluating on test set...



 Results for roberta-base (seed 123):
   Accuracy:  0.8533
   Precision: 0.8486
   Recall:    0.8642
   F1-Score:  0.8507

[Experiment 3/18]
Training: roberta-base | Seed: 456
Tokenizer loaded for roberta-base...


Map:   0%|          | 0/1814 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenization complete


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: roberta-base.

 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.498,0.266399,0.89,0.878305,0.890598,0.869701
2,0.301,0.230381,0.9,0.893333,0.889558,0.897961
3,0.2052,0.204106,0.925,0.920211,0.915406,0.926383
4,0.1213,0.236679,0.92,0.913194,0.915821,0.910797
5,0.078,0.270412,0.93,0.924925,0.922667,0.927408



 Getting predictions on TEST set...



 Evaluating on test set...



 Results for roberta-base (seed 456):
   Accuracy:  0.8500
   Precision: 0.8448
   Recall:    0.8600
   F1-Score:  0.8471
# MODEL: microsoft/deberta-v3-base

[Experiment 4/18]
Training: microsoft/deberta-v3-base | Seed: 42


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Tokenizer loaded for microsoft/deberta-v3-base...


Map:   0%|          | 0/1814 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenization complete


pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: microsoft/deberta-v3-base.


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.



 Starting training...


model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5815,0.417615,0.82,0.817795,0.827569,0.852443
2,0.2805,0.180017,0.915,0.909572,0.904897,0.915597
3,0.1503,0.259493,0.925,0.917308,0.928652,0.90891
4,0.0787,0.344626,0.92,0.911495,0.924786,0.90206
5,0.0368,0.355981,0.92,0.912088,0.921333,0.904972



 Getting predictions on TEST set...



 Evaluating on test set...



 Results for microsoft/deberta-v3-base (seed 42):
   Accuracy:  0.8433
   Precision: 0.8426
   Recall:    0.8590
   F1-Score:  0.8414

[Experiment 5/18]
Training: microsoft/deberta-v3-base | Seed: 123
Tokenizer loaded for microsoft/deberta-v3-base...


Map:   0%|          | 0/1814 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenization complete


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: microsoft/deberta-v3-base.


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.



 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6084,0.354601,0.875,0.868942,0.862685,0.881189
2,0.3164,0.256704,0.885,0.880454,0.874188,0.8978
3,0.1537,0.247987,0.91,0.902923,0.902923,0.902923
4,0.0647,0.293517,0.925,0.91837,0.922527,0.914734
5,0.028,0.325916,0.925,0.919335,0.918168,0.920559



 Getting predictions on TEST set...



 Evaluating on test set...



 Results for microsoft/deberta-v3-base (seed 123):
   Accuracy:  0.8167
   Precision: 0.8223
   Recall:    0.8370
   F1-Score:  0.8154

[Experiment 6/18]
Training: microsoft/deberta-v3-base | Seed: 456
Tokenizer loaded for microsoft/deberta-v3-base...


Map:   0%|          | 0/1814 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenization complete


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: microsoft/deberta-v3-base.


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.



 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6093,0.443624,0.86,0.836372,0.886696,0.816956
2,0.3145,0.234704,0.905,0.898933,0.894388,0.904811
3,0.2013,0.267938,0.9,0.891493,0.893984,0.889224
4,0.1089,0.39603,0.89,0.8799,0.885164,0.875526



 Getting predictions on TEST set...



 Evaluating on test set...



 Results for microsoft/deberta-v3-base (seed 456):
   Accuracy:  0.8000
   Precision: 0.8150
   Recall:    0.8262
   F1-Score:  0.7994
# MODEL: vinai/bertweet-base

[Experiment 7/18]
Training: vinai/bertweet-base | Seed: 42


config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

bpe.codes: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


Tokenizer loaded for vinai/bertweet-base...


Map:   0%|          | 0/1814 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenization complete


pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: vinai/bertweet-base.

 Starting training...


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5263,0.288024,0.895,0.879859,0.916171,0.861989
2,0.3132,0.258935,0.885,0.868417,0.90377,0.851203
3,0.2255,0.260768,0.9,0.893333,0.889558,0.897961
4,0.144,0.240828,0.925,0.917851,0.925357,0.911822
5,0.0997,0.292297,0.915,0.910031,0.904167,0.918509



 Getting predictions on TEST set...



 Evaluating on test set...



 Results for vinai/bertweet-base (seed 42):
   Accuracy:  0.8433
   Precision: 0.8366
   Recall:    0.8500
   F1-Score:  0.8397

[Experiment 8/18]
Training: vinai/bertweet-base | Seed: 123


emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


Tokenizer loaded for vinai/bertweet-base...


Map:   0%|          | 0/1814 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenization complete


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: vinai/bertweet-base.

 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5086,0.272583,0.9,0.88511,0.925392,0.865926
2,0.2846,0.209107,0.915,0.907486,0.911538,0.903948
3,0.1915,0.212584,0.92,0.912654,0.918354,0.907885
4,0.1462,0.239231,0.925,0.916738,0.932445,0.905997
5,0.0917,0.223217,0.905,0.897228,0.898438,0.896074



 Getting predictions on TEST set...



 Evaluating on test set...



 Results for vinai/bertweet-base (seed 123):
   Accuracy:  0.8200
   Precision: 0.8152
   Recall:    0.8293
   F1-Score:  0.8167

[Experiment 9/18]
Training: vinai/bertweet-base | Seed: 456


emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


Tokenizer loaded for vinai/bertweet-base...


Map:   0%|          | 0/1814 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenization complete


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: vinai/bertweet-base.

 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5868,0.375542,0.855,0.82655,0.898221,0.804282
2,0.3662,0.268786,0.895,0.888862,0.883333,0.896937
3,0.2529,0.262744,0.895,0.887698,0.884762,0.891112
4,0.1711,0.26836,0.905,0.895945,0.903075,0.890249
5,0.1252,0.30139,0.885,0.876314,0.875268,0.877413



 Getting predictions on TEST set...



 Evaluating on test set...



 Results for vinai/bertweet-base (seed 456):
   Accuracy:  0.8300
   Precision: 0.8268
   Recall:    0.8420
   F1-Score:  0.8274
# MODEL: cardiffnlp/twitter-roberta-base

[Experiment 10/18]
Training: cardiffnlp/twitter-roberta-base | Seed: 42


config.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Tokenizer loaded for cardiffnlp/twitter-roberta-base...


Map:   0%|          | 0/1814 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenization complete


pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: cardiffnlp/twitter-roberta-base.


model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]


 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5201,0.261792,0.89,0.872552,0.919137,0.852227
2,0.2946,0.215997,0.915,0.905637,0.920956,0.895211
3,0.1765,0.226676,0.92,0.913709,0.913709,0.913709
4,0.1021,0.357476,0.92,0.914667,0.910675,0.919534
5,0.0619,0.37892,0.92,0.9142,0.912,0.916622



 Getting predictions on TEST set...



 Evaluating on test set...



 Results for cardiffnlp/twitter-roberta-base (seed 42):
   Accuracy:  0.8100
   Precision: 0.8107
   Recall:    0.8255
   F1-Score:  0.8079

[Experiment 11/18]
Training: cardiffnlp/twitter-roberta-base | Seed: 123
Tokenizer loaded for cardiffnlp/twitter-roberta-base...


Map:   0%|          | 0/1814 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenization complete


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: cardiffnlp/twitter-roberta-base.

 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5127,0.252427,0.885,0.867324,0.909564,0.84829
2,0.2933,0.217147,0.915,0.905637,0.920956,0.895211
3,0.1638,0.245016,0.92,0.913194,0.915821,0.910797
4,0.1022,0.33189,0.905,0.897228,0.898438,0.896074
5,0.0611,0.339983,0.91,0.902344,0.904902,0.900011



 Getting predictions on TEST set...



 Evaluating on test set...



 Results for cardiffnlp/twitter-roberta-base (seed 123):
   Accuracy:  0.8100
   Precision: 0.8175
   Recall:    0.8315
   F1-Score:  0.8089

[Experiment 12/18]
Training: cardiffnlp/twitter-roberta-base | Seed: 456
Tokenizer loaded for cardiffnlp/twitter-roberta-base...


Map:   0%|          | 0/1814 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenization complete


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: cardiffnlp/twitter-roberta-base.

 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4966,0.268947,0.855,0.829767,0.883333,0.810107
2,0.2924,0.190666,0.92,0.911495,0.924786,0.90206
3,0.17,0.325921,0.89,0.884405,0.877973,0.895912
4,0.1005,0.343055,0.895,0.889401,0.883113,0.899849



 Getting predictions on TEST set...



 Evaluating on test set...



 Results for cardiffnlp/twitter-roberta-base (seed 456):
   Accuracy:  0.8300
   Precision: 0.8239
   Recall:    0.8375
   F1-Score:  0.8264
# MODEL: bert-base-uncased

[Experiment 13/18]
Training: bert-base-uncased | Seed: 42


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizer loaded for bert-base-uncased...


Map:   0%|          | 0/1814 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenization complete


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: bert-base-uncased.

 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.563,0.302549,0.875,0.861231,0.875,0.852066
2,0.3065,0.243877,0.88,0.870564,0.870564,0.870564
3,0.1805,0.270423,0.88,0.872666,0.867873,0.879301
4,0.0942,0.368043,0.895,0.884231,0.894731,0.876551
5,0.0528,0.385608,0.885,0.876314,0.875268,0.877413



 Getting predictions on TEST set...



 Evaluating on test set...



 Results for bert-base-uncased (seed 42):
   Accuracy:  0.8200
   Precision: 0.8113
   Recall:    0.8204
   F1-Score:  0.8144

[Experiment 14/18]
Training: bert-base-uncased | Seed: 123
Tokenizer loaded for bert-base-uncased...


Map:   0%|          | 0/1814 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenization complete


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: bert-base-uncased.

 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5418,0.298579,0.875,0.861231,0.875,0.852066
2,0.3043,0.234351,0.905,0.895256,0.906038,0.887337
3,0.1714,0.287566,0.91,0.904,0.900116,0.908748
4,0.0905,0.322658,0.905,0.897825,0.896718,0.898986
5,0.0586,0.342731,0.905,0.897825,0.896718,0.898986



 Getting predictions on TEST set...



 Evaluating on test set...



 Results for bert-base-uncased (seed 123):
   Accuracy:  0.8000
   Precision: 0.8031
   Recall:    0.8173
   F1-Score:  0.7982

[Experiment 15/18]
Training: bert-base-uncased | Seed: 456
Tokenizer loaded for bert-base-uncased...


Map:   0%|          | 0/1814 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenization complete


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: bert-base-uncased.

 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5484,0.321177,0.86,0.842891,0.862484,0.831518
2,0.324,0.264482,0.89,0.880642,0.883066,0.878438
3,0.1965,0.275164,0.895,0.885717,0.88956,0.882375
4,0.1093,0.30999,0.89,0.879121,0.887667,0.872614
5,0.0766,0.337665,0.89,0.882025,0.88,0.884263



 Getting predictions on TEST set...



 Evaluating on test set...



 Results for bert-base-uncased (seed 456):
   Accuracy:  0.8200
   Precision: 0.8128
   Recall:    0.8248
   F1-Score:  0.8157
# MODEL: microsoft/deberta-base

[Experiment 16/18]
Training: microsoft/deberta-base | Seed: 42


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Tokenizer loaded for microsoft/deberta-base...


Map:   0%|          | 0/1814 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenization complete


pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: microsoft/deberta-base.


model.safetensors:   0%|          | 0.00/559M [00:00<?, ?B/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.



 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5508,0.272265,0.91,0.899001,0.921453,0.885449
2,0.2954,0.252661,0.88,0.864315,0.890251,0.850178
3,0.1601,0.268048,0.91,0.902923,0.902923,0.902923
4,0.0886,0.370541,0.91,0.901736,0.907291,0.897098
5,0.0391,0.381729,0.91,0.901736,0.907291,0.897098



 Getting predictions on TEST set...



 Evaluating on test set...



 Results for microsoft/deberta-base (seed 42):
   Accuracy:  0.8133
   Precision: 0.8148
   Recall:    0.8298
   F1-Score:  0.8114

[Experiment 17/18]
Training: microsoft/deberta-base | Seed: 123
Tokenizer loaded for microsoft/deberta-base...


Map:   0%|          | 0/1814 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenization complete


Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: microsoft/deberta-base.


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.



 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5452,0.327906,0.855,0.828194,0.890351,0.807194
2,0.2872,0.252537,0.885,0.874833,0.878571,0.871589
3,0.1533,0.337167,0.89,0.882025,0.88,0.884263
4,0.0878,0.436653,0.895,0.881727,0.905952,0.867814
5,0.0404,0.450654,0.9,0.892137,0.892137,0.892137



 Getting predictions on TEST set...



 Evaluating on test set...



 Results for microsoft/deberta-base (seed 123):
   Accuracy:  0.7867
   Precision: 0.7932
   Recall:    0.8063
   F1-Score:  0.7853

[Experiment 18/18]
Training: microsoft/deberta-base | Seed: 456
Tokenizer loaded for microsoft/deberta-base...


Map:   0%|          | 0/1814 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenization complete


Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: microsoft/deberta-base.


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.



 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5141,0.256492,0.885,0.869463,0.898616,0.854115
2,0.2718,0.222866,0.905,0.892991,0.917857,0.8786
3,0.146,0.321819,0.885,0.877656,0.873371,0.883238
4,0.0709,0.467914,0.89,0.872552,0.919137,0.852227



 Getting predictions on TEST set...



 Evaluating on test set...



 Results for microsoft/deberta-base (seed 456):
   Accuracy:  0.8300
   Precision: 0.8248
   Recall:    0.8390
   F1-Score:  0.8268
EXPERIMENT COMPLETED


In [8]:
# Results
results_df = pd.DataFrame(all_results)

# Per model we calculate the mean and std on each metric
model_stats = results_df.groupby('model').agg({
    'accuracy': ['mean', 'std'],
    'precision': ['mean', 'std'],
    'recall': ['mean', 'std'],
    'f1': ['mean', 'std']
}).round(4)

# Format
model_stats.columns = ['_'.join(col).strip() for col in model_stats.columns.values]
model_stats = model_stats.reset_index()
model_stats.columns = [
    'Model',
    'Accuracy_Mean', 'Accuracy_Std',
    'Precision_Mean', 'Precision_Std',
    'Recall_Mean', 'Recall_Std',
    'F1_Mean', 'F1_Std'
]

# Sort by F1 score
model_stats = model_stats.sort_values('F1_Mean', ascending=False).reset_index(drop=True)
print(model_stats)


                             Model  Accuracy_Mean  Accuracy_Std  \
0                     roberta-base         0.8500        0.0033   
1              vinai/bertweet-base         0.8311        0.0117   
2        microsoft/deberta-v3-base         0.8200        0.0219   
3  cardiffnlp/twitter-roberta-base         0.8167        0.0115   
4                bert-base-uncased         0.8133        0.0115   
5           microsoft/deberta-base         0.8100        0.0219   

   Precision_Mean  Precision_Std  Recall_Mean  Recall_Std  F1_Mean  F1_Std  
0          0.8448         0.0038       0.8600      0.0042   0.8471  0.0035  
1          0.8262         0.0107       0.8404      0.0104   0.8279  0.0115  
2          0.8266         0.0143       0.8407      0.0167   0.8187  0.0212  
3          0.8174         0.0066       0.8315      0.0060   0.8144  0.0104  
4          0.8090         0.0052       0.8208      0.0038   0.8094  0.0097  
5          0.8109         0.0161       0.8250      0.0169   0.8078  

In [9]:
# Exporting Confusion Matrix

# Labels
labels = ["oppose", "support"]

# Path Output
os.makedirs(f"{path}/experiments/text/Performance/", exist_ok=True)
output_dir = f"{path}/experiments/text/Performance/"

for model_name, group in results_df.groupby("model"):

    # Concatenate all the y_true and y_pred per model
    y_true_combined = np.concatenate(group["y_true"].values)
    y_pred_combined = np.concatenate(group["y_pred"].values)

    # Confusion Matrix
    cm = confusion_matrix(y_true_combined, y_pred_combined, labels=[0, 1])

    #F1-Score
    f1_macro = f1_score(y_true_combined, y_pred_combined, average='macro')

    # Plot
    plt.figure(figsize=(6, 5))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        cbar=False,
        xticklabels=labels,
        yticklabels=labels
    )

    plt.title(f"Confusion Matrix - {model_name}\nF1 (Macro): {f1_macro:.4f}")
    plt.xlabel("Prediction")
    plt.ylabel("True Label")

    # Guardar
    safe_model_name = model_name.replace("/", "_").replace("\\", "_")
    save_path = os.path.join(output_dir, f"confusion_matrix_{safe_model_name}.jpg")
    plt.tight_layout()
    plt.savefig(save_path, dpi=300)
    plt.close()

    print(f"Confusion Matrix saved for {model_name}: {save_path}")


Confusion Matrix saved for bert-base-uncased: /content/drive/MyDrive/multimodal-argmining/experiments/text/Performance/confusion_matrix_bert-base-uncased.jpg
Confusion Matrix saved for cardiffnlp/twitter-roberta-base: /content/drive/MyDrive/multimodal-argmining/experiments/text/Performance/confusion_matrix_cardiffnlp_twitter-roberta-base.jpg
Confusion Matrix saved for microsoft/deberta-base: /content/drive/MyDrive/multimodal-argmining/experiments/text/Performance/confusion_matrix_microsoft_deberta-base.jpg
Confusion Matrix saved for microsoft/deberta-v3-base: /content/drive/MyDrive/multimodal-argmining/experiments/text/Performance/confusion_matrix_microsoft_deberta-v3-base.jpg
Confusion Matrix saved for roberta-base: /content/drive/MyDrive/multimodal-argmining/experiments/text/Performance/confusion_matrix_roberta-base.jpg
Confusion Matrix saved for vinai/bertweet-base: /content/drive/MyDrive/multimodal-argmining/experiments/text/Performance/confusion_matrix_vinai_bertweet-base.jpg


In [10]:

# Results to CSV
os.makedirs(f"{path}/experiments/text/Performance/", exist_ok=True)
output_file = f"{path}/experiments/text/Performance/model_comparison_results.csv"
model_stats.to_csv(output_file, index=False)
print(f"\nSummary results saved to: {output_file}")


Summary results saved to: /content/drive/MyDrive/multimodal-argmining/experiments/text/Performance/model_comparison_results.csv
