### Testing 6 different transformer models with 3 seeds each
### Task: Argument Stance Classification (Support/Oppose)

In [1]:
# Libraries
import os
import numpy as np
import pandas as pd
from datasets import Dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import torch
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# Detect whether the notebook runs on Google Colab by probing for its package.
try:
    import google.colab  # noqa: F401 -- imported only to test availability
    IN_COLAB = True
except ImportError:
    # Only a missing package means "not on Colab"; a bare `except` would also
    # hide unrelated failures (and KeyboardInterrupt).
    IN_COLAB = False

# Select the project root for the current environment and make it the
# working directory so relative paths used later resolve against it.
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    path = "/content/drive/MyDrive/multimodal-argmining"
    os.chdir(path)
    print(f"Loading data from Google Drive: {path}")
else:
    path = "C:/Users/diego/Desktop/Master Neuro/M2/Intership_NLP/multimodal-argmining"
    os.chdir(path)
    print(f"Loading data locally from: {path}")


# Pick the compute device: the first CUDA GPU if one is visible, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU ready:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("No GPU detecting, using CPU.")

  from .autonotebook import tqdm as notebook_tqdm


Loading data locally from: C:/Users/diego/Desktop/Master Neuro/M2/Intership_NLP/multimodal-argmining
No GPU detecting, using CPU.


In [5]:
# Pretrained checkpoints to compare and the random seeds used for each one.
MODELS = [
    "roberta-base",
    "microsoft/deberta-v3-base",
    "cardiffnlp/twitter-roberta-base",
    "bert-base-uncased",
    "microsoft/deberta-base",
    "ddore14/RooseBERT-scr-uncased",
]

# One training run is performed per (model, seed) pair.
SEEDS = [42, 123, 456]


# Show the numbered list of checkpoints (numbering starts at 1).
print("\nModels to test:")
for position, checkpoint in enumerate(MODELS, start=1):
    print(f"  {position}. {checkpoint}")



Models to test:
  1. roberta-base
  2. microsoft/deberta-v3-base
  3. cardiffnlp/twitter-roberta-base
  4. bert-base-uncased
  5. microsoft/deberta-base
  6. ddore14/RooseBERT-scr-uncased


In [9]:
# Load the train / dev / test splits from CSV files under the project root.
train_path = f"{path}/data/gun_control_train.csv"
dev_path   = f"{path}/data/gun_control_dev.csv"
test_path  = f"{path}/data/gun_control_test.csv"

df_train = pd.read_csv(train_path)
df_dev   = pd.read_csv(dev_path)
df_test  = pd.read_csv(test_path)


# Map the string stance to an integer class id.
label2id = {"oppose": 0, "support": 1}
for split_name, df in [("train", df_train), ("dev", df_dev), ("test", df_test)]:
    mapped = df["stance"].map(label2id)
    # `Series.map` turns any value missing from `label2id` (or a missing
    # stance) into NaN, which would silently produce a float label column.
    # Fail here with a clear message instead.
    unmapped = mapped.isna()
    if unmapped.any():
        bad_values = sorted(df.loc[unmapped, "stance"].astype(str).unique())
        raise ValueError(
            f"Unexpected stance values in {split_name} split: {bad_values}"
        )
    # Explicit int64 so the label dtype does not depend on the platform.
    df["label"] = mapped.astype("int64")

# Class balance of the training split.
print(df_train["label"].value_counts())


# Keep only the text and label columns for the Hugging Face datasets.
dataset_train = Dataset.from_pandas(df_train[["tweet_text", "label"]])
dataset_dev   = Dataset.from_pandas(df_dev[["tweet_text", "label"]])
dataset_test  = Dataset.from_pandas(df_test[["tweet_text", "label"]])


label
1    475
0    448
Name: count, dtype: int64


In [11]:

# Tokenization Function for each model 
def tokenize_dataset(dataset, tokenizer, max_length=128):
    """Tokenize the "tweet_text" column of a dataset with the given tokenizer.

    Args:
        dataset: Object exposing `map(fn, batched=True)`; the mapped result
            must expose `set_format(type=..., columns=...)`.
        tokenizer: Callable applied to each batch of texts.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        The mapped dataset, formatted as torch tensors restricted to the
        'input_ids', 'attention_mask' and 'label' columns.
    """
    # Every text is padded/truncated to the same fixed length.
    encode_options = dict(padding="max_length", truncation=True, max_length=max_length)

    def _encode(examples):
        return tokenizer(examples["tweet_text"], **encode_options)

    encoded = dataset.map(_encode, batched=True)
    encoded.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    return encoded


In [6]:
# Metrics function: accuracy plus weighted F1, precision and recall
def compute_metrics(pred):
    """Compute accuracy and weighted F1 / precision / recall for predictions.

    Args:
        pred: Object with `label_ids` (true class ids) and `predictions`
            (per-class scores; the argmax over axis 1 is the predicted class).

    Returns:
        dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)

    metrics = {"accuracy": accuracy_score(y_true, y_pred)}
    # The remaining scores all use the same "weighted" averaging.
    for metric_name, scorer in (
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    ):
        metrics[metric_name] = scorer(y_true, y_pred, average="weighted")
    return metrics

In [None]:
# Training Function
def train_and_evaluate(model_name, seed, train_dataset, dev_dataset):
    """Fine-tune one pretrained model with one seed and evaluate it on the dev set.

    Args:
        model_name: Identifier passed to `AutoTokenizer.from_pretrained` and
            `AutoModelForSequenceClassification.from_pretrained`.
        seed: Random seed applied to NumPy, PyTorch and the Trainer.
        train_dataset: Untokenized training dataset; it is tokenized here
            with the model-specific tokenizer.
        dev_dataset: Untokenized dev dataset, used for per-epoch evaluation,
            best-model selection and the final evaluation.

    Returns:
        dict with the keys "model", "seed", "accuracy", "precision",
        "recall", "f1" and "loss", taken from the final dev-set evaluation.
    """
    # Seed the global RNGs before the model is built so that any weights
    # initialised at load time depend on `seed`.
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    print(f"Training: {model_name} | Seed: {seed}")
    print(f"{'='*60}")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print(f"Tokenizer loaded for {model_name}...")

    # Tokenize datasets with the model-specific tokenizer
    train_dataset = tokenize_dataset(train_dataset, tokenizer, 128)
    dev_dataset = tokenize_dataset(dev_dataset, tokenizer, 128)
    print("Tokenization complete.")

    # Load model with a 2-class classification head
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    print(f"Model Loaded: {model_name}.")

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"./temp_models/{model_name.replace('/', '_')}_seed{seed}",
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        save_total_limit=1,
        report_to="none",
        logging_steps=10,
        # The Trainer re-seeds from `args.seed` (default 42) when it is
        # constructed. Without this, every run would share the same
        # shuffling/dropout random stream regardless of `seed`.
        seed=seed,
    )

    # Initialize Trainer; training stops early after 2 evaluations without
    # improvement of the best-model metric.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        processing_class=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )

    # Train
    print("\n Starting training...")
    trainer.train()

    # Evaluate (on the dev set, which is the Trainer's eval_dataset)
    print("\n Evaluating on dev set...")
    eval_results = trainer.evaluate()

    # Extract metrics; the Trainer prefixes evaluation keys with "eval_".
    results = {
        "model": model_name,
        "seed": seed,
        "accuracy": eval_results["eval_accuracy"],
        "precision": eval_results["eval_precision"],
        "recall": eval_results["eval_recall"],
        "f1": eval_results["eval_f1"],
        "loss": eval_results["eval_loss"]
    }

    print(f"\n Results for {model_name} (seed {seed}):")
    print(f"   Accuracy:  {results['accuracy']:.4f}")
    print(f"   Precision: {results['precision']:.4f}")
    print(f"   Recall:    {results['recall']:.4f}")
    print(f"   F1-Score:  {results['f1']:.4f}")

    # Drop references to the large objects so the next run starts with as
    # much free memory as possible.
    del model, trainer, train_dataset, dev_dataset
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return results

In [None]:
# Run every (model, seed) combination and collect one result dict per run.
print("STARTING EXPERIMENT")
print("=" * 60)

total_experiments = len(MODELS) * len(SEEDS)
all_results = []
current_experiment = 0

for model_name in MODELS:
    print(f"# MODEL: {model_name}")

    for seed in SEEDS:
        # Exactly one result is appended per run, so the 1-based run number
        # is the number of stored results plus one.
        current_experiment = len(all_results) + 1
        print(f"\n[Experiment {current_experiment}/{total_experiments}]")

        results = train_and_evaluate(
            model_name=model_name,
            seed=seed,
            train_dataset=dataset_train,
            dev_dataset=dataset_dev,
        )
        all_results.append(results)


print("EXPERIMENT COMPLETED")

In [None]:
# Results: one row per (model, seed) run.
results_df = pd.DataFrame(all_results)

# Per model, compute the mean and std of each metric across seeds, round to
# 4 decimals, and rank the models by mean F1 (best first).
model_stats = (
    results_df
    .groupby('model')
    .agg(
        Accuracy_Mean=('accuracy', 'mean'),
        Accuracy_Std=('accuracy', 'std'),
        Precision_Mean=('precision', 'mean'),
        Precision_Std=('precision', 'std'),
        Recall_Mean=('recall', 'mean'),
        Recall_Std=('recall', 'std'),
        F1_Mean=('f1', 'mean'),
        F1_Std=('f1', 'std'),
    )
    .round(4)
    .reset_index()
    .rename(columns={'model': 'Model'})
    .sort_values('F1_Mean', ascending=False)
    .reset_index(drop=True)
)
print(model_stats)


In [None]:

# Write the per-model summary table to a CSV file, creating the target
# directory first if it does not exist yet.
output_dir = f"{path}/experiments/text/Performance/"
os.makedirs(output_dir, exist_ok=True)

output_file = f"{output_dir}model_comparison_results.csv"
model_stats.to_csv(output_file, index=False)
print(f"\nSummary results saved to: {output_file}")