<a href="https://colab.research.google.com/github/BigTMiami/AdaptOrDie/blob/main/ZZ_test_best_classification_score_adp_seq_bn_P_micro_seq_bn_C_micro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Summary
This creates one adapter for pre-training and another for classification.  

* The pre-training adapter is not trained during classification; only the classification adapter is trained.
* Naming Convention - (pretraining adapter type)_P_(pretraining data size)_(classification adapter type)_C_(classification_data_size)
* This uses "micro" sizes - just to demonstrate the script works

# TODO
* Set eval steps in training
* Set logging steps more appropriately





# Setup

In [1]:
from time import time
# Wall-clock timestamp (seconds since the epoch) captured when the notebook starts running.
# NOTE(review): no later cell in this section reads start_time — presumably intended
# for an elapsed-time report; confirm or remove.
start_time = time()

In [2]:
# Experiment configuration: dataset names/splits and the adapter naming scheme
# (pretraining adapter type)_P_(pretraining data size)_(classification adapter type)_C_(classification data size).
experiment_tag = "ZZ"

# Single switch between the tiny "micro" pretraining dataset (smoke test) and the
# regular pretraining dataset. Set to False for a regular run instead of
# commenting settings in and out.
use_micro_pretrain_dataset = True

# Pretraining Dataset Settings
pretrain_percentage_int = 20
pretrain_percentage = f"{pretrain_percentage_int}"
pretrain_eval_split = "validation"
if use_micro_pretrain_dataset:
    # MICRO Dataset Settings
    pretrain_dataset_name = "BigTMiami/amazon_25M_10_000_condensed"
    pretrain_train_split = "train"
    pretrain_percentage_final = "micro"
else:
    pretrain_dataset_name = "BigTMiami/amazon_split_25M_reviews_20_percent_condensed"
    pretrain_train_split = f"train[:{pretrain_percentage}%]"
    # The dataset used here is only 20% of the full data, so the final figure is
    # 20% of pretrain_percentage_int
    pretrain_percentage_final = int(0.2 * pretrain_percentage_int)

# Pretraining Adapter settings
pretraining_adapter_type = "seq_bn"
pretraining_tag = f"{pretraining_adapter_type}_P_{pretrain_percentage_final}"
pretrained_adapter_hub_name = f"{experiment_tag}_{pretraining_tag}"
pretrained_adapter_name = f"{pretrained_adapter_hub_name}_A"
print(f"Pretrained Adapter Hub Name: {pretrained_adapter_hub_name}")

# Classification Dataset Settings
classification_dataset_name = "BigTMiami/amazon_helpfulness"
classification_train_percentage = "5"
classification_train_split = f"train[:{classification_train_percentage}%]"
classification_eval_split = "dev"
# NOTE: the misspelled name ("classfication") is kept so any cell that already
# references it keeps working.
classfication_percentage_final = "micro"

# Classification Adapter settings
classification_adapter_type = "seq_bn"
classifier_tag = f"{classification_adapter_type}_C_{classfication_percentage_final}"

# Classifier - no pretraining
classification_adapter_hub_name = f"{experiment_tag}_{classifier_tag}"
classification_adapter_name = f"{classification_adapter_hub_name}_A"
print(f"Classifer Not Pretrained Adapter Hub Name: {classification_adapter_hub_name}")

# Classifier with Pretrained
classification_adapter_after_pretrained_hub_name = f"{experiment_tag}_{pretraining_tag}_{classifier_tag}"
classification_adapter_after_pretrained_name = f"{classification_adapter_after_pretrained_hub_name}_A"
print(f"Classifer after Pretrained Adapter Hub Name: {classification_adapter_after_pretrained_hub_name}")


Pretrained Adapter Hub Name: ZZ_seq_bn_P_micro
Classifer Not Pretrained Adapter Hub Name: ZZ_seq_bn_C_micro
Classifer after Pretrained Adapter Hub Name: ZZ_seq_bn_P_micro_seq_bn_C_micro


In [3]:
# from google.colab import drive
# drive.mount("/content/drive")
# %cd '/content/drive/MyDrive/AdaptOrDie'

In [4]:
!pip install datasets
!pip install transformers[torch]
!pip install adapters
!pip install scikit-learn

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datase

In [5]:
!pip install pynvml

Collecting pynvml
  Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynvml
Successfully installed pynvml-11.5.0


In [6]:
from pynvml import *

def print_gpu_utilization(device_index=0):
    """Print the memory currently in use on one NVIDIA GPU.

    The value reported by NVML is integer-divided by 1024**2 and printed
    with an "MB" label.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, which
            matches the previously hard-coded device.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
    finally:
        # Balance nvmlInit() so repeated calls do not leave NVML initialised.
        nvmlShutdown()
    gpu_used = info.used // 1024**2
    print(f"GPU {gpu_used} MB")

print_gpu_utilization()

GPU 448 MB


In [7]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and macro-averaged F1 for an evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the predicted class is the argmax over the
            last axis).

    Returns:
        dict with the keys ``'accuracy'`` and ``'f1_macro'``.
    """
    true_classes = pred.label_ids
    predicted_classes = pred.predictions.argmax(-1)

    return {
        # Fraction of examples whose predicted class equals the label
        'accuracy': accuracy_score(true_classes, predicted_classes),
        # Unweighted mean of the per-class F1 scores
        'f1_macro': f1_score(true_classes, predicted_classes, average='macro'),
    }

# Classification - Shared Setup

In [8]:
# THIS MUST BE SET TO MATCH  classification_adapter_type
from adapters import SeqBnConfig

classification_adapter_config = SeqBnConfig()

  _torch_pytree._register_pytree_node(


In [9]:
# Manual sanity check: print the config class name next to the configured
# adapter type string so a mismatch between the two can be spotted by eye.
print(classification_adapter_config.__class__.__name__)
print(classification_adapter_type)

SeqBnConfig
seq_bn


In [10]:
from transformers import AutoTokenizer

# Tokenizer for the roberta-base checkpoint; it is handed to the padding data collator.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [11]:
from transformers import DataCollatorWithPadding

# Padding collator built on the shared tokenizer; passed to the trainer as data_collator.
classification_data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Classification

In [12]:
from datasets import load_dataset

# Request the training slice and the evaluation split in a single call;
# passing a list of splits yields one dataset per entry, in the same order.
classification_train_dataset, classification_eval_dataset = load_dataset(
    classification_dataset_name,
    split=[
        classification_train_split,
        classification_eval_split,
    ],
)

# Show the features and row counts of both datasets.
print(classification_train_dataset)
print(classification_eval_dataset)

Downloading readme:   0%|          | 0.00/613 [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/40.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.82M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 5763
})
Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 5000
})


In [13]:
from transformers import RobertaConfig
from adapters import AutoAdapterModel

# Base model for the classifier that has no pretraining adapter: roberta-base
# loaded through AutoAdapterModel, which later cells extend with an adapter and
# a classification head.
# NOTE(review): "classficiation" is misspelled, but later cells reference this
# exact name, so it cannot be renamed in this cell alone.
config = RobertaConfig.from_pretrained("roberta-base")
classficiation_no_pretrain_model = AutoAdapterModel.from_pretrained(
    "roberta-base",
    config=config,
)

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaAdapterModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['heads.default.3.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Add a new adapter
classficiation_no_pretrain_model.add_adapter(classification_adapter_name, config=classification_adapter_config)

# Add head for classification modeling
classficiation_no_pretrain_model.add_classification_head(
    classification_adapter_name,
    num_labels=2,
    id2label={ 0: "unhelpful", 1: "helpful"})

# Set the adapter to be used for training
classficiation_no_pretrain_model.train_adapter(classification_adapter_name)

print(f"Classification Adapter Name: {classification_adapter_name}")

Classification Adapter Name: ZZ_seq_bn_C_micro_A


In [15]:
# Display which adapter setup is currently active on the model.
classficiation_no_pretrain_model.active_adapters

Stack[ZZ_seq_bn_C_micro_A]

In [16]:
# Tabulate the adapters on the model (architecture, parameter counts, active/train flags).
summary = classficiation_no_pretrain_model.adapter_summary()
print(summary)

Name                     Architecture         #Param      %Param  Active   Train
--------------------------------------------------------------------------------
ZZ_seq_bn_C_micro_A      bottleneck          894,528       0.718       1       1
--------------------------------------------------------------------------------
Full model                               124,645,632     100.000               0


In [17]:
from transformers import TrainingArguments

# Training configuration for the classification adapter trained without a pretraining adapter.
classification_training_args = TrainingArguments(
    # Checkpoints go to a local directory named after the adapter hub name.
    output_dir=classification_adapter_hub_name,
    learning_rate=1e-4,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Logging and eval during training settings
    logging_steps=25,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # After training, reload the checkpoint that scored best on eval_f1_macro
    # (the 'f1_macro' value returned by compute_metrics, reported with an eval_ prefix).
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1_macro",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

In [18]:
from adapters import AdapterTrainer

# Wire the adapter model, training arguments, datasets, padding collator and
# metric function into a single trainer for the classification run.
classification_trainer = AdapterTrainer(
    model=classficiation_no_pretrain_model,
    args=classification_training_args,
    train_dataset=classification_train_dataset,
    eval_dataset=classification_eval_dataset,
    data_collator=classification_data_collator,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [19]:
# Run training and print the returned summary (global step, training loss, runtime metrics).
classification_training_results = classification_trainer.train()
print(classification_training_results)

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.3947,0.357227,0.8534,0.460451
2,0.3075,0.353193,0.8558,0.482263
3,0.3062,0.357266,0.8582,0.525374




TrainOutput(global_step=1083, training_loss=0.3514890721410045, metrics={'train_runtime': 393.9434, 'train_samples_per_second': 43.887, 'train_steps_per_second': 2.749, 'total_flos': 4377432583808814.0, 'train_loss': 0.3514890721410045, 'epoch': 3.0})


In [20]:
import pandas as pd

# View the trainer's log history as a table (one row per logged entry).
pd.DataFrame(classification_trainer.state.log_history)

Unnamed: 0,loss,learning_rate,epoch,step,eval_loss,eval_accuracy,eval_f1_macro,eval_runtime,eval_samples_per_second,eval_steps_per_second,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss
0,0.5318,9.76916e-05,0.07,25,,,,,,,,,,,
1,0.4018,9.538319e-05,0.14,50,,,,,,,,,,,
2,0.3868,9.307479e-05,0.21,75,,,,,,,,,,,
3,0.3951,9.076639e-05,0.28,100,,,,,,,,,,,
4,0.426,8.845799e-05,0.35,125,,,,,,,,,,,
5,0.3829,8.614958e-05,0.42,150,,,,,,,,,,,
6,0.3262,8.384118e-05,0.48,175,,,,,,,,,,,
7,0.4144,8.153278e-05,0.55,200,,,,,,,,,,,
8,0.3339,7.922438e-05,0.62,225,,,,,,,,,,,
9,0.4184,7.691597e-05,0.69,250,,,,,,,,,,,


In [21]:
# Evaluate the trained model on the eval dataset given to the trainer and print the metrics dict.
classification_eval_results = classification_trainer.evaluate()

print(classification_eval_results)

{'eval_loss': 0.3572659492492676, 'eval_accuracy': 0.8582, 'eval_f1_macro': 0.5253735056127093, 'eval_runtime': 37.7238, 'eval_samples_per_second': 132.542, 'eval_steps_per_second': 8.297, 'epoch': 3.0}


In [22]:
import pandas as pd

# Same expression as the earlier log-history cell, re-run after evaluate().
# NOTE(review): presumably repeated so the table includes the final evaluation entry — confirm.
pd.DataFrame(classification_trainer.state.log_history)

Unnamed: 0,loss,learning_rate,epoch,step,eval_loss,eval_accuracy,eval_f1_macro,eval_runtime,eval_samples_per_second,eval_steps_per_second,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss
0,0.5318,9.76916e-05,0.07,25,,,,,,,,,,,
1,0.4018,9.538319e-05,0.14,50,,,,,,,,,,,
2,0.3868,9.307479e-05,0.21,75,,,,,,,,,,,
3,0.3951,9.076639e-05,0.28,100,,,,,,,,,,,
4,0.426,8.845799e-05,0.35,125,,,,,,,,,,,
5,0.3829,8.614958e-05,0.42,150,,,,,,,,,,,
6,0.3262,8.384118e-05,0.48,175,,,,,,,,,,,
7,0.4144,8.153278e-05,0.55,200,,,,,,,,,,,
8,0.3339,7.922438e-05,0.62,225,,,,,,,,,,,
9,0.4184,7.691597e-05,0.69,250,,,,,,,,,,,


In [23]:
# Unassign the Colab runtime once the notebook has finished.
# google.colab is only importable inside Colab, so skip the disconnect elsewhere
# instead of ending the run with an ImportError.
try:
    from google.colab import runtime
except ImportError:
    print("Not running in Colab - no session to disconnect")
else:
    print("Disconnecting Session")
    runtime.unassign()

Disconnecting Session
