<a href="https://colab.research.google.com/github/FahadEbrahim/STR_LoRa/blob/main/STR_LoRa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# How to run

AdaptIRC:


* This notebook implements the LoRa meets adapters on the SEMEVAL 2024 Semantic Relatedness Task.

* To run the notebook in Colab, just change the environment to GPU through: Runtime >> Change runtime type >> Hardware Accelerator >> GPU.

* You may require WANDB token if using newer versions of transformers lib

In [1]:
!pip install -Uq adapters
!pip install -q datasets
!pip install -Uq accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m252.8/252.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.6/536.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.3/38.3 MB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ibis-framework 7.1.0 requires pyarrow<15,>=2, but you have pyarrow 15.0.0 which is incompatible.[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m2.3 MB/s

# Import Libraires

Here, we are importing libraries that would be used throughout the notebook. (Pandas, Json, OS, Sklearn, numpy, collections, transformers, adapters, random, torch ).

In [2]:
from transformers import TrainingArguments, EvalPrediction, TrainerCallback, DataCollatorWithPadding
import pandas as pd

from datasets import Dataset
import pandas as pd

from transformers import RobertaTokenizer, RobertaConfig, TextClassificationPipeline,AutoModel,AutoConfig,AutoTokenizer
from adapters import RobertaAdapterModel,AutoAdapterModel

import torch
import random
from transformers import set_seed
import numpy as np

from adapters import AdapterTrainer

# Setting Seed

These lines set the seed for reproducability for several libraries ( torch, random, numpy, transformers)

In [55]:
RANDOM_SEED = 42

set_seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Dataset

Reading the dataset cloned from SEMEVAL Github repository:

In [56]:
lang = "ary"
train_link = "https://raw.githubusercontent.com/semantic-textual-relatedness/Semantic_Relatedness_SemEval2024/main/Track%20A/" + lang + "/" + lang + "_train.csv"
dev_link  = "https://raw.githubusercontent.com/semantic-textual-relatedness/Semantic_Relatedness_SemEval2024/main/Track%20A/" + lang + "/" + lang + "_dev_with_labels.csv"
test_link = "https://raw.githubusercontent.com/semantic-textual-relatedness/Semantic_Relatedness_SemEval2024/main/Track%20A/" + lang + "/" + lang + "_test.csv"

train_ds = pd.read_csv(train_link)
dev_ds = pd.read_csv(dev_link)
test_ds = pd.read_csv(test_link)

In [57]:
# Splitting the input to two sentences
train_ds[['sent1', 'sent2']] = train_ds['Text'].str.split('\n', 1, expand=True)
dev_ds[['sent1', 'sent2']] = dev_ds['Text'].str.split('\n', 1, expand=True)
test_ds[['sent1', 'sent2']] = test_ds['Text'].str.split('\n', 1, expand=True)

  train_ds[['sent1', 'sent2']] = train_ds['Text'].str.split('\n', 1, expand=True)
  dev_ds[['sent1', 'sent2']] = dev_ds['Text'].str.split('\n', 1, expand=True)
  test_ds[['sent1', 'sent2']] = test_ds['Text'].str.split('\n', 1, expand=True)


# Dataset Processing

In [58]:
train_dataset = Dataset.from_pandas(train_ds)
val_dataset = Dataset.from_pandas(dev_ds)
test_dataset = Dataset.from_pandas(test_ds)

# Model Configuration

Here is the new important code: Setting the configurations of the adapters and transformer model.

In [59]:
def create_model(model_name="cross-encoder/stsb-roberta-base", max_length=256, truncation=True, padding="max_length", device="cuda"):
  # The tokenizer is based on Roberta. The configurations are: Max_length = 256, truncation = true, padding = max_length.
  tokenizer = AutoTokenizer.from_pretrained(model_name, device=device, max_length=max_length, truncation=truncation, padding=padding)

  # Configuration: We have 3 labels: Bug, Enhancment, Question.
  config = AutoConfig.from_pretrained(model_name, device=device, num_labels=1)

  # Configuration of the Adapter model.
  model = AutoAdapterModel.from_pretrained(model_name, config=config)

  # This part is for inferencing
  classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=device, max_length=max_length, padding=padding, truncation=truncation)

  return tokenizer, model, classifier

# Creating Training and Infering Adapters

* A Regression head is attached to the model defining the number of labels to be 1 (Regression)
* Initilaising the training of the Adapter
* Using Adapter Droput Trainer as the Callback.
* Configuring the adapter configuarion.
* Configure the trainer
* Adding the callback.
* Start training the adapter
* Evalauting the adapter

In [60]:
# Setting the Hyperparameters

learning_rate=5e-4
epochs=10
batch_size=30

In [61]:
# Initilaizing the Model
tokenizer, model, classifier = create_model()

Some weights of RobertaAdapterModel were not initialized from the model checkpoint at cross-encoder/stsb-roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The model 'RobertaAdapterModel' is not supported for . Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassification', 'DistilBertForSequenceClassification', 'ElectraForSequenceClassif

In [62]:
# Encoding
enc_train = train_dataset.map(lambda e: tokenizer( e['sent1'],e['sent2'], padding=True, truncation=True), batched=True)
enc_val = val_dataset.map(lambda e: tokenizer( e['sent1'],e['sent2'], padding=True, truncation=True), batched=True)
enc_test = test_dataset.map(lambda e: tokenizer( e['sent1'],e['sent2'], padding=True, truncation=True), batched=True)

Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Map:   0%|          | 0/71 [00:00<?, ? examples/s]

Map:   0%|          | 0/426 [00:00<?, ? examples/s]

In [63]:
# Formatting the columns
enc_train=enc_train.rename_column('Score','label')
enc_val=enc_val.rename_column('Score','label')

enc_train.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
enc_val.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
enc_test.set_format(type="torch", columns=["input_ids", "attention_mask"])


In [64]:
from scipy.stats import spearmanr, pearsonr
from adapters import DoubleSeqBnConfig,LoRAConfig,IA3Config,SeqBnConfig,ParBnConfig,SeqBnInvConfig, DoubleSeqBnInvConfig

adapter_name = "STR_Adapter_" + lang

config = LoRAConfig(r=8, alpha=8)
#config = SeqBnConfig()  # This is used for Pffeifer Adapter
#config = DoubleSeqBnConfig()  # This is used for Houslby Adapter

model.add_adapter(adapter_name, config = config, overwrite_ok=True)

# Add a matching classification head
model.add_classification_head(
    adapter_name,
    num_labels=1,   # Num of Labels = 1 >> Regression
    overwrite_ok=True
)

# Initilaize the adapter training
model.train_adapter(adapter_name)

In [65]:
# Metrics used for evaluation ( Regression Metrics )
def compute_metrics(pred):
  preds = np.squeeze(pred.predictions)
  return {
  "MSE": ((preds - pred.label_ids) ** 2).mean().item(),
  "RMSE": (np.sqrt (( (preds - pred.label_ids) ** 2).mean())).item(),
  "MAE": (np.abs(preds - pred.label_ids)).mean().item(),
  "Pearson" : pearsonr(preds,pred.label_ids)[0],
  "Spearman's Rank":spearmanr(preds,pred.label_ids)[0]}

In [66]:
training_args = TrainingArguments(
    output_dir=f"training_output/{adapter_name}",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_strategy='steps',
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="steps",
    fp16=True,
    load_best_model_at_end=True,
    learning_rate=learning_rate,
    num_train_epochs=epochs,
    overwrite_output_dir=True,
    remove_unused_columns=False,
    save_strategy="steps",
    seed=RANDOM_SEED,
    #fp16=True, # bfloat16 training
   	#torch_compile=True, # optimizations
    #optim="adamw_torch_fused", # improved optimizer
)

In [67]:
# Having a data Collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [68]:
# Configure the Adapter Trainer
trainer = AdapterTrainer(
    model=model,
    args=training_args,
    eval_dataset=enc_val,
    train_dataset=enc_train,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

In [69]:
# import torch._dynamo
# torch._dynamo.config.suppress_errors = True

In [70]:
# Start training the adapter
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Mse,Rmse,Mae,Pearson,Spearman's rank
100,0.1109,0.017272,0.017272,0.131425,0.104906,0.774309,0.795205
200,0.0213,0.015324,0.015324,0.123791,0.092343,0.812807,0.82592
300,0.0184,0.020815,0.020815,0.144272,0.108178,0.825893,0.836071


TrainOutput(global_step=310, training_loss=0.049214643672589334, metrics={'train_runtime': 96.2655, 'train_samples_per_second': 95.985, 'train_steps_per_second': 3.22, 'total_flos': 2473045549793280.0, 'train_loss': 0.049214643672589334, 'epoch': 10.0})

In [71]:
evaluation = trainer.evaluate()
display(evaluation)

{'eval_loss': 0.01704072579741478,
 'eval_MSE': 0.01704072766005993,
 'eval_RMSE': 0.130540132522583,
 'eval_MAE': 0.0941530093550682,
 'eval_Pearson': 0.8258843301996432,
 "eval_Spearman's Rank": 0.8340032873361537,
 'eval_runtime': 0.2939,
 'eval_samples_per_second': 241.609,
 'eval_steps_per_second': 10.209,
 'epoch': 10.0}

In [72]:
# Save the adapter
model.save_adapter(f"training_output/{adapter_name}", adapter_name)

# Merging the Repo
model.merge_adapter(adapter_name)

In [73]:

dev_ds ['Pred_Score']= trainer.predict(enc_val)[0]
dev_ds ['Pred_Score'] = dev_ds['Pred_Score'].round(2)

test_ds ['Pred_Score']= trainer.predict(enc_test)[0]
test_ds ['Pred_Score'] = test_ds['Pred_Score'].round(2)

In [74]:
filename = 'pred_' + lang + '_a.csv'
# For Dev:
# dev_ds[['PairID', 'Pred_Score']].to_csv(filename, index=False)

# For Test:
test_ds[['PairID', 'Pred_Score']].to_csv(filename, index=False)
saved_filename = 'CERoberta' + '_' + lang + '_' + config.architecture + '_e' + str(epochs) + '_bs' + str(batch_size)

In [75]:
# This zip file will be uploaded to CodaLab.
!zip {saved_filename} {filename}

updating: pred_ary_a.csv (deflated 80%)
