<a href="https://colab.research.google.com/github/BigTMiami/AdaptOrDie/blob/main/adapters_classification_no_pretraining_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Summary
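This notebook tests training an adapter with a classification head on top of `roberta-base`, with no additional pretraining, on the `BigTMiami/amazon_helpfulness` dataset.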

# Things to change in real run
* Use the proper datasets (not dev for validation?)




# Setup

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount("/content/drive")
# Work from the project folder on the mounted drive so relative paths
# (for example the training output directories) resolve there.
%cd '/content/drive/MyDrive/AdaptOrDie'

In [None]:
!pip install datasets
!pip install transformers[torch]
!pip install adapters

In [None]:
!pip install pynvml

In [None]:
from pynvml import *

def print_gpu_utilization(device_index=0):
    """Print and return the memory currently in use on one GPU.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, the
            device this notebook has always reported on.

    Returns:
        The used GPU memory in whole mebibytes (bytes floor-divided by
        1024**2).
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
    finally:
        # Pair every nvmlInit() with nvmlShutdown() so repeated calls from
        # the notebook do not accumulate open NVML initialisations.
        nvmlShutdown()
    gpu_used = info.used // 1024**2
    print(f"GPU {gpu_used} MB")
    return gpu_used

# Baseline reading before any dataset or model has been loaded.
print_gpu_utilization()

In [None]:
from datasets import load_dataset

dataset_name = "BigTMiami/amazon_helpfulness"

# Only the first two slices of the training split are loaded in this test run.
# The remaining slices, for a run over the whole split, would be:
#   train_3: "train[10%:20%]"
#   train_4: "train[20%:40%]"
#   train_5: "train[40%:]"
train_1, train_2 = (
    load_dataset(dataset_name, split=split_spec)
    for split_spec in ("train[:5%]", "train[5%:10%]")
)
validation_dataset = load_dataset(dataset_name, split="dev")

# The training loop further down iterates over this list in order.
train_datasets = [train_1, train_2]

# Show a summary of every loaded split: training slices first, then validation.
for loaded_split in (*train_datasets, validation_dataset):
    print(loaded_split)

print_gpu_utilization()

In [None]:
from transformers import RobertaConfig
from adapters import AutoAdapterModel

# Both the configuration and the weights come from the same checkpoint.
base_checkpoint = "roberta-base"

config = RobertaConfig.from_pretrained(base_checkpoint)
model = AutoAdapterModel.from_pretrained(base_checkpoint, config=config)

# GPU reading after the base model has been instantiated.
print_gpu_utilization()

In [None]:
adapter_hub_name = "adapter_HUB_classification_no_pretraining_test"
adapter_name = "adapter_classification_seq_bn_no_pretraining"
adapter_type = "seq_bn"  # could be "lora", etc.

# Register a new adapter of the chosen type under adapter_name.
model.add_adapter(adapter_name, config=adapter_type)

# Attach a two-class classification head (not a masked-language-modeling
# head) under the same name as the adapter.
label_names = {0: "unhelpful", 1: "helpful"}
model.add_classification_head(
    adapter_name,
    num_labels=len(label_names),
    id2label=label_names,
)

# Select this adapter as the one to be trained.
model.train_adapter(adapter_name)


In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published with the same "roberta-base" checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
# GPU reading after the tokenizer has been loaded.
print_gpu_utilization()

In [None]:
from transformers import DataCollatorWithPadding

# Collator handed to the trainers below; it is built around the tokenizer
# loaded above.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
!pip install scikit-learn

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Score predictions with accuracy and macro-averaged F1.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        A dict with the keys ``'accuracy'`` and ``'f1_macro'``.
    """
    true_classes = pred.label_ids
    predicted_classes = pred.predictions.argmax(-1)

    return {
        'accuracy': accuracy_score(true_classes, predicted_classes),
        # Macro averaging: F1 is computed per class and then averaged.
        'f1_macro': f1_score(true_classes, predicted_classes, average='macro'),
    }

# Train

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Output location.
    output_dir="adapter_classifier_training_output",
    overwrite_output_dir=True,
    # Optimisation settings.
    learning_rate=1e-4,
    num_train_epochs=10,
    # Batch sizes per device for training and evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Logging interval, in steps.
    logging_steps=50,
    # Important: keep every dataset column so the labels are properly passed
    # to the model.
    remove_unused_columns=False,
)

In [None]:
from adapters import AdapterTrainer

# Build a trainer on the first training slice. The name `train_dataset` only
# exists as the loop variable of the training loop further down, so referring
# to it here raised NameError on a fresh kernel; index the list instead.
trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=train_datasets[0],
    eval_dataset=validation_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
# GPU reading after the trainer has been constructed.
print_gpu_utilization()

In [None]:
# Collects one dict per training slice (keys "train", "eval" and "gpu"),
# filled in by the training loop. Re-run this cell before re-running the
# loop, otherwise results from the previous run stay in the list.
all_results = []

In [None]:
import dataclasses

# Train on each slice in turn with the same model object, so the adapter keeps
# learning from one slice to the next. One result dict is recorded per slice.
for i, train_dataset in enumerate(train_datasets):
    saved_results = {}

    print(f"Training dataset {i}")
    # TrainingArguments is a dataclass and does not support item assignment
    # (training_args["output_dir"] = ... raises TypeError). Build a per-slice
    # copy with its own output directory and leave the shared object untouched.
    run_args = dataclasses.replace(
        training_args,
        output_dir=f"adapter_classifier_training_output_{i}",
    )
    trainer = AdapterTrainer(
        model=model,
        args=run_args,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Value returned by trainer.train(), stored as-is under "train".
    training_results = trainer.train()
    saved_results["train"] = training_results
    print(training_results)

    # Metrics on the validation split after training on this slice.
    eval_results = trainer.evaluate()
    saved_results["eval"] = eval_results
    print(eval_results)

    # Used GPU memory (MB) at the end of this slice.
    saved_results['gpu'] = print_gpu_utilization()

    all_results.append(saved_results)


In [None]:
# Print one summary line per training slice.
for i, result in enumerate(all_results):
    # result["train"] holds the value returned by trainer.train(): a
    # TrainOutput named tuple, read through its attributes. Indexing it with
    # a string key raises TypeError.
    train_output = result["train"]
    training_loss = train_output.training_loss
    training_time = train_output.metrics["train_runtime"]

    # Read the metrics stored for THIS slice, not the `eval_results` variable
    # left over from the last iteration of the training loop.
    slice_eval = result["eval"]
    eval_loss = slice_eval["eval_loss"]
    eval_f1_macro = 100.0 * slice_eval["eval_f1_macro"]
    eval_accuracy = 100.0 * slice_eval["eval_accuracy"]

    print(f"{i}: Eval F1: {eval_f1_macro:.3f} Acc: {eval_accuracy:.3f} Loss: {eval_loss:.5f} || Training Loss: {training_loss:.4f} Time: {training_time:.0f}")

In [None]:
# print("Disconnecting Session")
# from google.colab import runtime
# runtime.unassign()