In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
os.getcwd()
os.chdir('/content/drive/My Drive/Colab Notebooks')
os.getcwd()

'/content/drive/My Drive/Colab Notebooks'

# Install the necessary packages to run this script

In [3]:
!pip install transformers datasets torch scikit-learn numpy



### Load the necessary packages to run the script

In [4]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, RobertaConfig
from datasets import load_dataset
import torch
from sklearn.metrics import accuracy_score
from modifiers.ModifiedRobertaWithAdaptersV5 import ModifiedRobertaForSequenceClassification
from transformers import TrainerCallback


Now lets check your GPU availability and load some sanity checkers. By default you should be using your gpu for this assignment if you have one available.

In [5]:
# Check device availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("You are using device: %s" % device)

You are using device: cuda


## **1.2: Load Data**
Loading the ag_news dataset

In [6]:
# Load AG_NEWS dataset
dataset = load_dataset("fancyzhx/ag_news")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
# load the tokenizer for Roberta
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [8]:
# define the function for tokenizing the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding = "max_length", truncation= True, max_length= 512)

# Apply the tokenizer to the datasets
tokenized_dataset = dataset.map(tokenize_function, batched= True)

# Set the format of the dataset to return PyTorch tensors
tokenized_datasets = tokenized_dataset.remove_columns(["text"])
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Split the dataset into train and test sets
train_dataset = tokenized_datasets["train"]
test_dataset = tokenized_datasets["test"]

In [9]:
small_train_dataset = train_dataset.select(range(1000))
small_test_dataset = test_dataset.select(range(100))

Setting the base model and the metric function to be used to evaluate the model

In [10]:
# # Load the pre-trained RoBERTa model
base_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=4)
# def compute_metrics(p):
#     predictions, labels = p
#     predictions = torch.argmax(torch.tensor(predictions), dim=-1)
#     accuracy = accuracy_score(labels, predictions)
#     return {'accuracy': accuracy}

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)  # Get the predicted class by finding the index of the max logit

    # Accuracy
    acc = accuracy_score(labels, preds)

    # Precision, Recall, F1 Score (using macro, micro, or weighted average)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, zero_division=0.0, average="weighted")

    # # Confusion matrix (for multi-class, it returns a matrix)
    # cm = confusion_matrix(labels, preds)

    # Extract True Positives, False Positives, etc. for each class
    # Here, we will return a dictionary with confusion matrix components per class (if needed)
    # tn, fp, fn, tp = cm.ravel() if cm.shape == (2, 2) else (None, None, None, None)

    # For multi-class, it's more useful to look at the entire confusion matrix
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        # 'confusion_matrix': cm.tolist(),  # Return the confusion matrix for more insight
    }



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Setting the training options and hyper-parameter settings. This is going to be the same across all experiments

In [11]:
training_args = TrainingArguments(
    output_dir='./results',
    run_name="roberta_ag_news_ablation_adapter",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=128,
    num_train_epochs=5,
    weight_decay=0.001,
    lr_scheduler_type="linear",

    logging_dir='./logs',
    logging_steps=100,
    save_strategy="epoch",               # Save checkpoints every epoch
    load_best_model_at_end=True,         # Load best model after training
    metric_for_best_model="accuracy",    # Metric to monitor
    fp16=True,
    # report_to=["none"],  # Disable W&B logging
    #logging_dir='./logs',
    #logging_steps=10,
    # report_to=["none"],  # Disable W&B logging
    # fp16=True  # Enables mixed precision
)



# Please run this below cell when we want to train on all the data. Current setup is just to test if the training is working





In [12]:
trainAllData = True
if trainAllData:
  small_train_dataset = train_dataset
  small_test_dataset = test_dataset

The Following section setup the different types of Models for training on the ag_news dataset
# Base model with the classification head is finetuned.
The whole model is finetuned to ag_news dataset

In [13]:
# This will display the model structure and the layer structure
base_model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [14]:
training_logs = []

class CustomCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        logs = logs or {}
        if any(key.startswith("eval_") for key in logs):
            training_logs.append({
                "epoch": state.epoch,
                "loss": logs.get("eval_loss", None),  # Get loss if available, otherwise None
                "accuracy": logs.get("eval_accuracy", None)
            })

trainer_base = Trainer(
    model=base_model,                         # the model to be trained
    args=training_args,                  # training arguments
    train_dataset=small_train_dataset,  # training dataset
    eval_dataset=small_test_dataset,   # evaluation dataset
    compute_metrics=compute_metrics,     # function for computing metrics
)

trainer_base.add_callback(CustomCallback)

In [15]:
trainer_base.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mgtechankur[0m ([33mgtechankur-geaorgia-tech[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2557,0.225693,0.933684,0.93391,0.933684,0.933732
2,0.1934,0.297547,0.926184,0.928893,0.926184,0.926511
3,0.2021,0.232183,0.941184,0.941611,0.941184,0.941165
4,0.1381,0.23207,0.945789,0.946259,0.945789,0.945886
5,0.0725,0.240734,0.946579,0.946637,0.946579,0.946575


TrainOutput(global_step=37500, training_loss=0.19198792372385662, metrics={'train_runtime': 2969.4752, 'train_samples_per_second': 202.056, 'train_steps_per_second': 12.628, 'total_flos': 1.578694680576e+17, 'train_loss': 0.19198792372385662, 'epoch': 5.0})

In [16]:
from google.colab import drive
drive.mount('/content/drive')

base_model.save_pretrained("/content/drive/My Drive/1_base_model_ag_news_ablation_with_adapter")



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Base model only the classification head is finetuned.
The classification head is finetuned to ag_news dataset

In [17]:
base_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=4)
model = base_model
for param in model.base_model.parameters():
    param.requires_grad = False  # Freeze the encoder layers

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
trainer_base_ch_only = Trainer(
    model=model,                         # the model to be trained
    args=training_args,                  # training arguments
    train_dataset=small_train_dataset,  # training dataset
    eval_dataset=small_test_dataset,   # evaluation dataset
    compute_metrics=compute_metrics,     # function for computing metrics
)
trainer_base_ch_only.add_callback(CustomCallback)

In [19]:
trainer_base_ch_only.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.432,0.325215,0.891447,0.891696,0.891447,0.891203
2,0.3751,0.302106,0.899737,0.899799,0.899737,0.899661
3,0.3768,0.295422,0.901447,0.901463,0.901447,0.901383
4,0.4019,0.292357,0.901842,0.901574,0.901842,0.901649
5,0.3617,0.291549,0.902105,0.901877,0.902105,0.901923


TrainOutput(global_step=37500, training_loss=0.430357732035319, metrics={'train_runtime': 1115.3104, 'train_samples_per_second': 537.967, 'train_steps_per_second': 33.623, 'total_flos': 1.578694680576e+17, 'train_loss': 0.430357732035319, 'epoch': 5.0})

In [20]:
from google.colab import drive
drive.mount('/content/drive')

model.save_pretrained("/content/drive/My Drive/2_base_model_ag_news_ablation_with_adapter")



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Custom model with no adapters used. All parameters fine tuned




In [21]:
base_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=4)
custom_model = ModifiedRobertaForSequenceClassification(base_model ,num_labels=4, adapter_hidden_dim=64, freeze_params = True, adapter_layers = [False] * 12)
# View the updated custom model. All encoder blocks should be similar and should have a pass through block instead of the adapter block
custom_model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ModifiedRobertaForSequenceClassification(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (modified_layers): ModuleList(
    (0-11): 12 x ModifiedRobertaLayer(
      (adapter1): PassThroughBlock()
      (adapter2): PassThroughBlock()
      (attention): RobertaAttention(
        (self): RobertaSdpaSelfAttention(
          (query): Linear(in_features=768, out_features=768, bias=True)
          (key): Linear(in_features=768, out_features=768, bias=True)
          (value): Linear(in_features=768, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (output): RobertaSelfOutput(
          (dense): Linear(in_features=768, out_features=768, bias=True)
          (LayerNorm): La

In [22]:
trainer_custom_model = Trainer(
    model=custom_model,                         # the model to be trained
    args=training_args,                  # training arguments
    train_dataset=small_train_dataset,  # training dataset
    eval_dataset=small_test_dataset,   # evaluation dataset
    compute_metrics=compute_metrics,     # function for computing metrics
)

trainer_custom_model.add_callback(CustomCallback)

In [23]:
trainer_custom_model.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3902,1.386475,0.25,0.0625,0.25,0.1
2,1.3902,1.386475,0.25,0.0625,0.25,0.1
3,1.3873,1.386475,0.25,0.0625,0.25,0.1
4,1.3875,1.386475,0.25,0.0625,0.25,0.1
5,1.3869,1.385986,0.25,0.0625,0.25,0.1


TrainOutput(global_step=37500, training_loss=1.3881010896809896, metrics={'train_runtime': 2837.9369, 'train_samples_per_second': 211.421, 'train_steps_per_second': 13.214, 'total_flos': 0.0, 'train_loss': 1.3881010896809896, 'epoch': 5.0})

In [26]:



from google.colab import drive
drive.mount('/content/drive')

custom_model.save_pretrained("/content/drive/My Drive/3_base_custom_model_ag_news_ablation_with_adapter")





Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


AttributeError: 'ModifiedRobertaForSequenceClassification' object has no attribute 'save_pretrained'

# Custom model with no adapters used. Classification head fine tuned

In [28]:
# set-requires_grad will freeze all original layer parameters and only train the adapters and classification heads
base_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=4)
custom_model = ModifiedRobertaForSequenceClassification(base_model ,num_labels=4, adapter_hidden_dim=64, freeze_params = True, adapter_layers = [False] * 12)
custom_model.set_requires_grad(False)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
trainer_custom_model_ch_only = Trainer(
    model=custom_model,                         # the model to be trained
    args=training_args,                  # training arguments
    train_dataset=small_train_dataset,  # training dataset
    eval_dataset=small_test_dataset,   # evaluation dataset
    compute_metrics=compute_metrics,     # function for computing metrics
)

trainer_custom_model_ch_only.add_callback(CustomCallback)

In [None]:
trainer_custom_model_ch_only.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9402,1.135082,0.532895,0.634459,0.532895,0.51704


In [None]:
from google.colab import drive
drive.mount('/content/drive')

custom_model.save_pretrained("/content/drive/My Drive/4_custom_model_ag_news_ablation_with_adapter")



# Custom model with all layers with adapters used. Adapters and Classification head fine tuned only

In [None]:
base_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=4)
model_all_adapters = ModifiedRobertaForSequenceClassification(base_model ,num_labels=4, adapter_hidden_dim=64, freeze_params = True, adapter_layers = [True] * 12)
# View the updated custom model. All encoder blocks should be similar and should have a pass through block instead of the adapter block
model_all_adapters.set_requires_grad(False)
model_all_adapters

In [None]:
trainer_model_all_adapters = Trainer(
    model=model_all_adapters,                         # the model to be trained
    args=training_args,                  # training arguments
    train_dataset=small_train_dataset,  # training dataset
    eval_dataset=small_test_dataset,   # evaluation dataset
    compute_metrics=compute_metrics,     # function for computing metrics
)

trainer_model_all_adapters.add_callback(CustomCallback)

In [None]:
trainer_model_all_adapters.train()

In [None]:
from google.colab import drive
drive.mount('/content/drive')

model_all_adapters.save_pretrained("/content/drive/My Drive/5_model_all_adapters_ag_news_ablation_with_adapter")



# Custom model with lower layers(2) with adapters used. Adapters and Classification head fine tuned only




In [None]:
adpt_lyrs = [False]*12
adpt_lyrs[0:2] = [True]*2
base_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=4)
model_lower_adapters = ModifiedRobertaForSequenceClassification(base_model ,num_labels=4, adapter_hidden_dim=64, freeze_params = True, adapter_layers = adpt_lyrs)
model_lower_adapters.set_requires_grad(False)
# View the updated custom model. All encoder blocks should be similar and should have a pass through block instead of the adapter block
model_lower_adapters

In [None]:
trainer_model_lower_adapters = Trainer(
    model=model_lower_adapters,                         # the model to be trained
    args=training_args,                  # training arguments
    train_dataset=small_train_dataset,  # training dataset
    eval_dataset=small_test_dataset,   # evaluation dataset
    compute_metrics=compute_metrics,     # function for computing metrics
)

trainer_model_lower_adapters.add_callback(CustomCallback)

In [None]:
trainer_model_lower_adapters.train()

In [None]:
from google.colab import drive
drive.mount('/content/drive')

model_lower_adapters.save_pretrained("/content/drive/My Drive/6_model_lower_adapters_ag_news_ablation_with_adapter")



# Custom model with higher layers(2) with adapters used. Adapters and Classification head fine tuned only

In [None]:
adpt_lyrs = [False]*12
adpt_lyrs[10:] = [True]*2
base_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=4)
model_higher_adapters = ModifiedRobertaForSequenceClassification(base_model ,num_labels=4, adapter_hidden_dim=64, freeze_params = True, adapter_layers = adpt_lyrs)
model_higher_adapters.set_requires_grad(False)
# View the updated custom model. All encoder blocks should be similar and should have a pass through block instead of the adapter block
model_higher_adapters

In [None]:
trainer_model_higher_adapters = Trainer(
    model=model_higher_adapters,                         # the model to be trained
    args=training_args,                  # training arguments
    train_dataset=small_train_dataset,  # training dataset
    eval_dataset=small_test_dataset,   # evaluation dataset
    compute_metrics=compute_metrics,     # function for computing metrics
)

trainer_model_higher_adapters.add_callback(CustomCallback)

In [None]:
trainer_model_higher_adapters.train()

In [None]:
from google.colab import drive
drive.mount('/content/drive')

model_higher_adapters.save_pretrained("/content/drive/My Drive/7_model_higher_adapters_ag_news_ablation_with_adapter")

