### Testing 5 different transformer models with 3 seeds each
### Task: Argument Stance Classification (Support/Oppose)

In [8]:
# Libraries
import os
import numpy as np
import pandas as pd
from datasets import Dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import torch
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# Google Colab or not
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    path = "/content/drive/MyDrive/multimodal-argmining"
    os.chdir(path)
    print(f"Loading data from Google Drive: {path}")
else:
    path = "C:/Users/diego/Desktop/Master Neuro/M2/Intership_NLP/multimodal-argmining"
    os.chdir(path)
    print(f"Loading data locally from: {path}")


# GPU
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU ready:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("No GPU detecting, using CPU.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading data from Google Drive: /content/drive/MyDrive/multimodal-argmining
GPU ready: Tesla T4


In [9]:
# WE define our Models to tests and the seeds
MODELS = [
    "roberta-base",
    "microsoft/deberta-v3-base",
    "cardiffnlp/twitter-roberta-base",
    "bert-base-uncased",
    "microsoft/deberta-base",
    "ddore14/RooseBERT-scr-uncased"
]

#Seeds
SEEDS = [42, 123, 456]


print("\nModels to test:")
for i, model in enumerate(MODELS, 1):
    print(f"  {i}. {model}")



Models to test:
  1. roberta-base
  2. microsoft/deberta-v3-base
  3. cardiffnlp/twitter-roberta-base
  4. bert-base-uncased
  5. microsoft/deberta-base
  6. ddore14/RooseBERT-scr-uncased


In [10]:
#Load Dataset
train_path = f"{path}/data/gun_control_train.csv"
dev_path   = f"{path}/data/gun_control_dev.csv"
test_path  = f"{path}/data/gun_control_test.csv"

df_train = pd.read_csv(train_path)
df_dev   = pd.read_csv(dev_path)
df_test  = pd.read_csv(test_path)


# Map labels to ints
label2id = {"oppose": 0, "support": 1}
for df in [df_train, df_dev, df_test]:
    df["label"] = df["stance"].map(label2id)

print(df_train["label"].value_counts())
df_train.head()



dataset_train = Dataset.from_pandas(df_train[["tweet_text", "label"]])
dataset_dev   = Dataset.from_pandas(df_dev[["tweet_text", "label"]])
dataset_test  = Dataset.from_pandas(df_test[["tweet_text", "label"]])


label
1    475
0    448
Name: count, dtype: int64


In [11]:

# Tokenization Function for each model
def tokenize_dataset(dataset, tokenizer, max_length=128):

    def tokenize_batch(batch):
        return tokenizer(batch["tweet_text"],padding="max_length",truncation=True,max_length=max_length)

    tokenized = dataset.map(tokenize_batch, batched=True)

    tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    return tokenized


In [12]:
# We define metrics Function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)

    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    precision = precision_score(labels, preds, average="weighted")
    recall = recall_score(labels, preds, average="weighted")

    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

In [13]:
# Training Function
def train_and_evaluate(model_name, seed, train_dataset, dev_dataset,test_dataset):

    # Set seed!
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    print(f"Training: {model_name} | Seed: {seed}")
    print(f"{'='*60}")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print(f"Tokenizer loaded for {model_name}...")

    # Tokenize datasets with model tokenizer
    train_dataset_tok = tokenize_dataset(train_dataset, tokenizer, 128)
    dev_dataset_tok = tokenize_dataset(dev_dataset, tokenizer, 128)
    test_dataset_tok = tokenize_dataset(test_dataset, tokenizer, 128)
    print(f"Tokenization complete")


    # Load model
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    print(f"Model Loaded: {model_name}.")


    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"./temp_models/{model_name.replace('/', '_')}_seed{seed}",
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        save_total_limit=1,
        report_to="none",
        logging_steps=10
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset_tok,
        eval_dataset=dev_dataset_tok,
        processing_class=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )

    # Train
    print(f"\n Starting training...")
    trainer.train()


    # Evaluate
    print(f"\n Evaluating on test set...")
    eval_results = trainer.evaluate(test_dataset_tok)

    # Extract metrics
    results = {
        "model": model_name,
        "seed": seed,
        "accuracy": eval_results["eval_accuracy"],
        "precision": eval_results["eval_precision"],
        "recall": eval_results["eval_recall"],
        "f1": eval_results["eval_f1"],
        "loss": eval_results["eval_loss"]
    }

    print(f"\n Results for {model_name} (seed {seed}):")
    print(f"   Accuracy:  {results['accuracy']:.4f}")
    print(f"   Precision: {results['precision']:.4f}")
    print(f"   Recall:    {results['recall']:.4f}")
    print(f"   F1-Score:  {results['f1']:.4f}")

    # Clean up to save memory
    del model
    del trainer
    del train_dataset_tok
    del dev_dataset_tok
    del test_dataset_tok

    torch.cuda.empty_cache() if torch.cuda.is_available() else None

    return results

In [14]:
# We run the Experiments
print("STARTING EXPERIMENT")
print("="*60)

all_results = []
total_experiments = len(MODELS) * len(SEEDS)
current_experiment = 0

for model_name in MODELS:
    print(f"# MODEL: {model_name}")

    for seed in SEEDS:
        current_experiment += 1
        print(f"\n[Experiment {current_experiment}/{total_experiments}]")
        results = train_and_evaluate(
            model_name=model_name,
            seed=seed,
            train_dataset=dataset_train,
            dev_dataset=dataset_dev,
            test_dataset=dataset_test)


        all_results.append(results)


print("EXPERIMENT COMPLETED")

STARTING EXPERIMENT
# MODEL: roberta-base

[Experiment 1/18]
Training: roberta-base | Seed: 42
Tokenizer loaded for roberta-base...


Map:   0%|          | 0/923 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Tokenization complete


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: roberta-base.

 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5157,0.352581,0.85,0.848417,0.886885,0.85
2,0.2707,0.297606,0.9,0.8998,0.917857,0.9
3,0.1947,0.289663,0.87,0.869882,0.869939,0.87
4,0.1163,0.420779,0.9,0.8998,0.917857,0.9



 Evaluating on test set...



 Results for roberta-base (seed 42):
   Accuracy:  0.8267
   Precision: 0.8280
   Recall:    0.8267
   F1-Score:  0.8249

[Experiment 2/18]
Training: roberta-base | Seed: 123
Tokenizer loaded for roberta-base...


Map:   0%|          | 0/923 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Tokenization complete


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: roberta-base.

 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5503,0.366597,0.86,0.85882,0.892667,0.86
2,0.28,0.240224,0.9,0.8998,0.917857,0.9
3,0.1972,0.466717,0.85,0.847637,0.859846,0.85
4,0.1282,0.376212,0.9,0.8998,0.917857,0.9



 Evaluating on test set...



 Results for roberta-base (seed 123):
   Accuracy:  0.8267
   Precision: 0.8296
   Recall:    0.8267
   F1-Score:  0.8243

[Experiment 3/18]
Training: roberta-base | Seed: 456
Tokenizer loaded for roberta-base...


Map:   0%|          | 0/923 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Tokenization complete


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: roberta-base.

 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5224,0.445345,0.83,0.827378,0.875873,0.83
2,0.308,0.239722,0.91,0.909919,0.924727,0.91
3,0.2149,0.313973,0.89,0.8899,0.88998,0.89
4,0.1267,0.371901,0.9,0.9,0.911594,0.9



 Evaluating on test set...



 Results for roberta-base (seed 456):
   Accuracy:  0.8200
   Precision: 0.8218
   Recall:    0.8200
   F1-Score:  0.8178
# MODEL: microsoft/deberta-v3-base

[Experiment 4/18]
Training: microsoft/deberta-v3-base | Seed: 42


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Tokenizer loaded for microsoft/deberta-v3-base...


Map:   0%|          | 0/923 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Tokenization complete


pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: microsoft/deberta-v3-base.


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.



 Starting training...


model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5641,0.305897,0.88,0.880144,0.886635,0.88
2,0.2295,0.333373,0.9,0.90016,0.9032,0.9
3,0.1214,0.377459,0.9,0.90012,0.900962,0.9
4,0.0521,0.501629,0.91,0.910135,0.914822,0.91
5,0.0286,0.504626,0.91,0.910135,0.914822,0.91



 Evaluating on test set...



 Results for microsoft/deberta-v3-base (seed 42):
   Accuracy:  0.8667
   Precision: 0.8711
   Recall:    0.8667
   F1-Score:  0.8648

[Experiment 5/18]
Training: microsoft/deberta-v3-base | Seed: 123
Tokenizer loaded for microsoft/deberta-v3-base...


Map:   0%|          | 0/923 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Tokenization complete


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: microsoft/deberta-v3-base.


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.



 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5809,0.27587,0.89,0.890077,0.890333,0.89
2,0.2679,0.277236,0.91,0.910135,0.914822,0.91
3,0.1526,0.411018,0.88,0.879416,0.881708,0.88
4,0.0794,0.393104,0.91,0.910135,0.911909,0.91
5,0.0356,0.410174,0.91,0.910135,0.911909,0.91



 Evaluating on test set...



 Results for microsoft/deberta-v3-base (seed 123):
   Accuracy:  0.8267
   Precision: 0.8421
   Recall:    0.8267
   F1-Score:  0.8210

[Experiment 6/18]
Training: microsoft/deberta-v3-base | Seed: 456
Tokenizer loaded for microsoft/deberta-v3-base...


Map:   0%|          | 0/923 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Tokenization complete


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: microsoft/deberta-v3-base.


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.



 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5893,0.3606,0.86,0.859327,0.884039,0.86
2,0.2438,0.281526,0.91,0.910063,0.919077,0.91
3,0.1359,0.37554,0.91,0.910063,0.919077,0.91
4,0.0494,0.431547,0.91,0.910135,0.914822,0.91
5,0.0188,0.450777,0.91,0.910135,0.914822,0.91



 Evaluating on test set...



 Results for microsoft/deberta-v3-base (seed 456):
   Accuracy:  0.8467
   Precision: 0.8515
   Recall:    0.8467
   F1-Score:  0.8442
# MODEL: cardiffnlp/twitter-roberta-base

[Experiment 7/18]
Training: cardiffnlp/twitter-roberta-base | Seed: 42


config.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Tokenizer loaded for cardiffnlp/twitter-roberta-base...


Map:   0%|          | 0/923 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Tokenization complete


pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: cardiffnlp/twitter-roberta-base.

 Starting training...


model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4986,0.271279,0.88,0.87976,0.89737,0.88
2,0.2409,0.259728,0.87,0.869883,0.884,0.87
3,0.1636,0.34224,0.88,0.878972,0.8845,0.88



 Evaluating on test set...



 Results for cardiffnlp/twitter-roberta-base (seed 42):
   Accuracy:  0.8133
   Precision: 0.8158
   Recall:    0.8133
   F1-Score:  0.8107

[Experiment 8/18]
Training: cardiffnlp/twitter-roberta-base | Seed: 123
Tokenizer loaded for cardiffnlp/twitter-roberta-base...


Map:   0%|          | 0/923 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Tokenization complete


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: cardiffnlp/twitter-roberta-base.

 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5382,0.263998,0.87,0.869571,0.8906,0.87
2,0.2849,0.21572,0.9,0.9,0.911594,0.9
3,0.1755,0.324294,0.9,0.899143,0.905,0.9
4,0.0953,0.302276,0.91,0.910063,0.919077,0.91
5,0.061,0.296767,0.92,0.920096,0.926827,0.92



 Evaluating on test set...



 Results for cardiffnlp/twitter-roberta-base (seed 123):
   Accuracy:  0.8333
   Precision: 0.8433
   Recall:    0.8333
   F1-Score:  0.8292

[Experiment 9/18]
Training: cardiffnlp/twitter-roberta-base | Seed: 456
Tokenizer loaded for cardiffnlp/twitter-roberta-base...


Map:   0%|          | 0/923 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Tokenization complete


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: cardiffnlp/twitter-roberta-base.

 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5145,0.305047,0.87,0.869153,0.898644,0.87
2,0.2965,0.228092,0.88,0.88,0.891337,0.88
3,0.2032,0.37775,0.86,0.858161,0.867878,0.86
4,0.1322,0.318451,0.89,0.889901,0.904364,0.89
5,0.1,0.290409,0.91,0.910135,0.914822,0.91



 Evaluating on test set...



 Results for cardiffnlp/twitter-roberta-base (seed 456):
   Accuracy:  0.8133
   Precision: 0.8235
   Recall:    0.8133
   F1-Score:  0.8083
# MODEL: bert-base-uncased

[Experiment 10/18]
Training: bert-base-uncased | Seed: 42


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizer loaded for bert-base-uncased...


Map:   0%|          | 0/923 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Tokenization complete


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: bert-base-uncased.

 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5411,0.355135,0.87,0.869153,0.898644,0.87
2,0.2613,0.2876,0.87,0.870195,0.874742,0.87
3,0.1559,0.329994,0.9,0.9,0.911594,0.9
4,0.0798,0.411136,0.85,0.850106,0.850381,0.85
5,0.0452,0.432647,0.87,0.870195,0.871957,0.87



 Evaluating on test set...



 Results for bert-base-uncased (seed 42):
   Accuracy:  0.7867
   Precision: 0.7869
   Recall:    0.7867
   F1-Score:  0.7845

[Experiment 11/18]
Training: bert-base-uncased | Seed: 123
Tokenizer loaded for bert-base-uncased...


Map:   0%|          | 0/923 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Tokenization complete


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: bert-base-uncased.

 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5293,0.388123,0.82,0.816727,0.870625,0.82
2,0.3315,0.251916,0.9,0.9,0.911594,0.9
3,0.2104,0.329699,0.87,0.869566,0.870722,0.87
4,0.1095,0.359142,0.87,0.869882,0.869939,0.87



 Evaluating on test set...



 Results for bert-base-uncased (seed 123):
   Accuracy:  0.8067
   Precision: 0.8069
   Recall:    0.8067
   F1-Score:  0.8050

[Experiment 12/18]
Training: bert-base-uncased | Seed: 456
Tokenizer loaded for bert-base-uncased...


Map:   0%|          | 0/923 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Tokenization complete


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: bert-base-uncased.

 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5768,0.452977,0.78,0.773089,0.851176,0.78
2,0.3334,0.25908,0.88,0.880145,0.880994,0.88
3,0.2036,0.280978,0.89,0.889633,0.890894,0.89
4,0.1003,0.298791,0.89,0.8899,0.88998,0.89
5,0.0574,0.317296,0.89,0.890165,0.891933,0.89



 Evaluating on test set...



 Results for bert-base-uncased (seed 456):
   Accuracy:  0.8133
   Precision: 0.8178
   Recall:    0.8133
   F1-Score:  0.8100
# MODEL: microsoft/deberta-base

[Experiment 13/18]
Training: microsoft/deberta-base | Seed: 42


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Tokenizer loaded for microsoft/deberta-base...


Map:   0%|          | 0/923 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Tokenization complete


pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: microsoft/deberta-base.


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.



 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6993,0.685871,0.54,0.378701,0.2916,0.54
2,0.5021,0.349275,0.83,0.827378,0.875873,0.83
3,0.2325,0.218044,0.9,0.90012,0.900962,0.9
4,0.1299,0.237668,0.9,0.90016,0.9032,0.9
5,0.0647,0.291755,0.89,0.890165,0.894782,0.89


model.safetensors:   0%|          | 0.00/559M [00:00<?, ?B/s]


 Evaluating on test set...



 Results for microsoft/deberta-base (seed 42):
   Accuracy:  0.8200
   Precision: 0.8326
   Recall:    0.8200
   F1-Score:  0.8146

[Experiment 14/18]
Training: microsoft/deberta-base | Seed: 123
Tokenizer loaded for microsoft/deberta-base...


Map:   0%|          | 0/923 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Tokenization complete


Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: microsoft/deberta-base.


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.



 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6713,0.594926,0.78,0.778145,0.808,0.78
2,0.4763,0.278316,0.88,0.87976,0.89737,0.88
3,0.2813,0.257913,0.9,0.899798,0.900325,0.9
4,0.1315,0.233327,0.9,0.90012,0.906731,0.9
5,0.0704,0.279381,0.91,0.910135,0.911909,0.91



 Evaluating on test set...



 Results for microsoft/deberta-base (seed 123):
   Accuracy:  0.8200
   Precision: 0.8370
   Recall:    0.8200
   F1-Score:  0.8136

[Experiment 15/18]
Training: microsoft/deberta-base | Seed: 456
Tokenizer loaded for microsoft/deberta-base...


Map:   0%|          | 0/923 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Tokenization complete


Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: microsoft/deberta-base.


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.



 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6731,0.721164,0.65,0.614324,0.801235,0.65
2,0.4725,0.321875,0.86,0.85972,0.876883,0.86
3,0.2583,0.345103,0.87,0.870091,0.870357,0.87
4,0.1348,0.342248,0.88,0.880192,0.8832,0.88
5,0.0713,0.41226,0.87,0.870195,0.871957,0.87



 Evaluating on test set...



 Results for microsoft/deberta-base (seed 456):
   Accuracy:  0.8267
   Precision: 0.8345
   Recall:    0.8267
   F1-Score:  0.8228
# MODEL: ddore14/RooseBERT-scr-uncased

[Experiment 16/18]
Training: ddore14/RooseBERT-scr-uncased | Seed: 42


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

Tokenizer loaded for ddore14/RooseBERT-scr-uncased...


Map:   0%|          | 0/923 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Tokenization complete


config.json:   0%|          | 0.00/682 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/266M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ddore14/RooseBERT-scr-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: ddore14/RooseBERT-scr-uncased.

 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5104,0.405876,0.83,0.827378,0.875873,0.83
2,0.2758,0.242527,0.88,0.880192,0.8832,0.88
3,0.1695,0.263871,0.9,0.9,0.911594,0.9
4,0.0919,0.260221,0.9,0.90016,0.9032,0.9
5,0.0484,0.341975,0.91,0.910063,0.919077,0.91



 Evaluating on test set...



 Results for ddore14/RooseBERT-scr-uncased (seed 42):
   Accuracy:  0.7933
   Precision: 0.8073
   Recall:    0.7933
   F1-Score:  0.7860

[Experiment 17/18]
Training: ddore14/RooseBERT-scr-uncased | Seed: 123
Tokenizer loaded for ddore14/RooseBERT-scr-uncased...


Map:   0%|          | 0/923 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Tokenization complete


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ddore14/RooseBERT-scr-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: ddore14/RooseBERT-scr-uncased.

 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5333,0.468999,0.8,0.79513,0.860606,0.8
2,0.2904,0.259455,0.86,0.86,0.871079,0.86
3,0.1688,0.269217,0.89,0.890077,0.898908,0.89
4,0.0864,0.289562,0.89,0.889901,0.904364,0.89
5,0.0423,0.339387,0.89,0.889901,0.904364,0.89



 Evaluating on test set...



 Results for ddore14/RooseBERT-scr-uncased (seed 123):
   Accuracy:  0.8067
   Precision: 0.8181
   Recall:    0.8067
   F1-Score:  0.8009

[Experiment 18/18]
Training: ddore14/RooseBERT-scr-uncased | Seed: 456
Tokenizer loaded for ddore14/RooseBERT-scr-uncased...


Map:   0%|          | 0/923 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Tokenization complete


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ddore14/RooseBERT-scr-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Loaded: ddore14/RooseBERT-scr-uncased.

 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5461,0.397963,0.84,0.837939,0.88129,0.84
2,0.2821,0.278883,0.86,0.860169,0.861026,0.86
3,0.1792,0.286029,0.88,0.880145,0.880994,0.88
4,0.0942,0.336509,0.88,0.880145,0.880994,0.88
5,0.064,0.350108,0.88,0.880192,0.8832,0.88



 Evaluating on test set...



 Results for ddore14/RooseBERT-scr-uncased (seed 456):
   Accuracy:  0.7867
   Precision: 0.7979
   Recall:    0.7867
   F1-Score:  0.7797
EXPERIMENT COMPLETED


In [15]:
# Results
results_df = pd.DataFrame(all_results)

# Per model we calculate the mean and std on each metric
model_stats = results_df.groupby('model').agg({
    'accuracy': ['mean', 'std'],
    'precision': ['mean', 'std'],
    'recall': ['mean', 'std'],
    'f1': ['mean', 'std']
}).round(4)

# Format
model_stats.columns = ['_'.join(col).strip() for col in model_stats.columns.values]
model_stats = model_stats.reset_index()
model_stats.columns = [
    'Model',
    'Accuracy_Mean', 'Accuracy_Std',
    'Precision_Mean', 'Precision_Std',
    'Recall_Mean', 'Recall_Std',
    'F1_Mean', 'F1_Std'
]

# Sort by F1 score
model_stats = model_stats.sort_values('F1_Mean', ascending=False).reset_index(drop=True)
print(model_stats)


                             Model  Accuracy_Mean  Accuracy_Std  \
0        microsoft/deberta-v3-base         0.8467        0.0200   
1                     roberta-base         0.8244        0.0038   
2           microsoft/deberta-base         0.8222        0.0038   
3  cardiffnlp/twitter-roberta-base         0.8200        0.0115   
4                bert-base-uncased         0.8022        0.0139   
5    ddore14/RooseBERT-scr-uncased         0.7956        0.0102   

   Precision_Mean  Precision_Std  Recall_Mean  Recall_Std  F1_Mean  F1_Std  
0          0.8549         0.0148       0.8467      0.0200   0.8434  0.0219  
1          0.8265         0.0041       0.8244      0.0038   0.8223  0.0039  
2          0.8347         0.0022       0.8222      0.0038   0.8170  0.0050  
3          0.8275         0.0142       0.8200      0.0115   0.8161  0.0115  
4          0.8039         0.0157       0.8022      0.0139   0.7998  0.0135  
5          0.8077         0.0101       0.7956      0.0102   0.7889  

In [16]:

# Results to CSV
os.makedirs(f"{path}/experiments/text/Performance/", exist_ok=True)
output_file = f"{path}/experiments/text/Performance/model_comparison_results.csv"
model_stats.to_csv(output_file, index=False)
print(f"\nSummary results saved to: {output_file}")


Summary results saved to: /content/drive/MyDrive/multimodal-argmining/experiments/text/Performance/model_comparison_results.csv
