<a href="https://colab.research.google.com/github/BigTMiami/AdaptOrDie/blob/main/AAb_adp_seq_bn_P_micro_seq_bn_C_micro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Summary
This creates one adapter for pre-training and another for classifcation.  

* The pre-training adapter is not trained during the classification, only the classifcation adapter.
* Naming Convention - (pretraining adapter type)_P_(pretraining data size)_(classification adapter type)_C_(classification_data_size)
* This uses "micro" sizes - just to demonstrate the script works

# TODO
* Set eval steps in training
* Set logging steps more appropriately





# Setup

In [2]:
from time import time
start_time = time()

In [3]:
experiment_tag = "AA"

# Pretraining Dataset Settings
pretrain_dataset_name = "BigTMiami/amazon_split_25M_reviews_20_percent_condensed"
pretrain_percentage_int = 20
pretrain_percentage = f"{pretrain_percentage_int}"
pretrain_train_split = f"train[:{pretrain_percentage}%]"
pretrain_eval_split = "validation"
# The dataset I am using is only 20%, so the final is 20% of the pretrain_percentage_int
pretrain_percentage_final = int(0.2 * pretrain_percentage_int)

# MICRO Dataset Settings - comment out for regular run
pretrain_dataset_name = "BigTMiami/amazon_25M_10_000_condensed"
pretrain_train_split = f"train"
pretrain_eval_split = "validation"
pretrain_percentage_final = "micro"

# Pretraining Adapter settings
pretraining_adapter_type = "seq_bn"
pretraining_tag = f"{pretraining_adapter_type}_P_{pretrain_percentage_final}"
pretrained_adapter_hub_name = f"{experiment_tag}_{pretraining_tag}"
pretrained_adapter_name = f"{pretrained_adapter_hub_name}_A"
print(f"Pretrained Adapter Hub Name: {pretrained_adapter_hub_name}")

# Classification Dataset Settings
classification_dataset_name = "BigTMiami/amazon_helpfulness"
classification_train_percentage = "1"
classification_train_split = f"train[:{classification_train_percentage}%]"
classification_eval_split = "dev"
classfication_percentage_final = "micro"

# Classification Adapter settings
classification_adapter_type = "seq_bn"
classifier_tag = f"{classification_adapter_type}_C_{classfication_percentage_final}"

# Classifier - no pretraining
classification_adapter_hub_name = f"{experiment_tag}_{classifier_tag}"
classification_adapter_name = f"{classification_adapter_hub_name}_A"
print(f"Classifer Not Pretrained Adapter Hub Name: {classification_adapter_hub_name}")

# Classifier with Pretrained
classification_adapter_after_pretrained_hub_name = f"{experiment_tag}_{pretraining_tag}_{classifier_tag}"
classification_adapter_after_pretrained_name = f"{classification_adapter_after_pretrained_hub_name}_A"
print(f"Classifer after Pretrained Adapter Hub Name: {classification_adapter_after_pretrained_hub_name}")


Pretrained Adapter Hub Name: AA_seq_bn_P_micro
Classifer Not Pretrained Adapter Hub Name: AA_seq_bn_C_micro
Classifer after Pretrained Adapter Hub Name: AA_seq_bn_P_micro_seq_bn_C_micro


In [4]:
# from google.colab import drive
# drive.mount("/content/drive")
# %cd '/content/drive/MyDrive/AdaptOrDie'

In [5]:
!pip install datasets
!pip install transformers[torch]
!pip install adapters
!pip install scikit-learn

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed dataset

In [6]:
!pip install pynvml

Collecting pynvml
  Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynvml
Successfully installed pynvml-11.5.0


In [7]:
from pynvml import *

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    gpu_used = info.used//1024**2
    print(f"GPU {gpu_used} MB")

print_gpu_utilization()

GPU 448 MB


In [8]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)

   # Calculate precision, recall, and F1-score
    f1 = f1_score(labels, preds, average='macro')

    return {
        'accuracy': accuracy,
        'f1_macro': f1
    }

# Pretraining

In [9]:
from datasets import load_dataset

pretrain_train_dataset, pretrain_eval_dataset = load_dataset(pretrain_dataset_name, split=[pretrain_train_split,pretrain_eval_split])

print(pretrain_train_dataset)
print(pretrain_eval_dataset)

Downloading readme:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.69M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.86M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/866 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1718
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 866
})


In [10]:
from transformers import RobertaConfig
from adapters import AutoAdapterModel

config = RobertaConfig.from_pretrained("roberta-base")
pretrain_model = AutoAdapterModel.from_pretrained(
    "roberta-base",
    config=config,
)

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaAdapterModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['heads.default.3.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# THIS MUST BE SET TO MATCH  pretraining_adapter_type
from adapters import SeqBnConfig

pretraining_adapter_config = SeqBnConfig()

In [12]:
print(pretraining_adapter_config.__class__.__name__)
print(pretraining_adapter_type)

SeqBnConfig
seq_bn


In [13]:
# Add a new adapter
pretrain_model.add_adapter(pretrained_adapter_name, config=pretraining_adapter_config)

# Add head for masked language modeling
pretrain_model.add_masked_lm_head(pretrained_adapter_name)

# Set the adapter to be used for training
pretrain_model.train_adapter(pretrained_adapter_name)

print(f"Pretrained Adapter Name: {pretrained_adapter_name}")

Pretrained Adapter Name: AA_seq_bn_P_micro_A


In [14]:
pretrain_model.active_adapters

Stack[AA_seq_bn_P_micro_A]

In [15]:
summary = pretrain_model.adapter_summary()
print(summary)

Name                     Architecture         #Param      %Param  Active   Train
--------------------------------------------------------------------------------
AA_seq_bn_P_micro_A      bottleneck          894,528       0.718       1       1
--------------------------------------------------------------------------------
Full model                               124,645,632     100.000               0


In [16]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
print_gpu_utilization()

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

GPU 448 MB


In [17]:
from transformers import DataCollatorForLanguageModeling

pretraining_data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm_probability=0.15
)

In [18]:
from transformers import TrainingArguments

pretraining_args = TrainingArguments(
    output_dir=pretrained_adapter_hub_name,
    learning_rate=1e-4,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=11,
    weight_decay=0.01,
    # Logging and eval during training settings
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=100,
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

In [19]:
from adapters import AdapterTrainer

pretrain_trainer = AdapterTrainer(
    model=pretrain_model,
    args=pretraining_args,
    train_dataset=pretrain_train_dataset,
    eval_dataset=pretrain_eval_dataset,
    data_collator=pretraining_data_collator,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [20]:
before_pretrain_eval_results = pretrain_trainer.evaluate()
before_pretrain_eval_results_loss = before_pretrain_eval_results["eval_loss"]
print(f"Before pretraining loss: {before_pretrain_eval_results_loss:.4f}")
print(before_pretrain_eval_results )

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Before pretraining loss: 17.8105
{'eval_loss': 17.810522079467773, 'eval_runtime': 10.4972, 'eval_samples_per_second': 82.498, 'eval_steps_per_second': 2.667}


In [21]:
pretrain_results = pretrain_trainer.train()
print(pretrain_results)

Step,Training Loss,Validation Loss


TrainOutput(global_step=4, training_loss=16.945280075073242, metrics={'train_runtime': 31.339, 'train_samples_per_second': 54.82, 'train_steps_per_second': 0.128, 'total_flos': 379880065990656.0, 'train_loss': 16.945280075073242, 'epoch': 0.81})


In [22]:
after_pretrain_eval_results = pretrain_trainer.evaluate()
after_pretrain_eval_results_loss = after_pretrain_eval_results["eval_loss"]
print(f"After pretraining loss: {after_pretrain_eval_results_loss:.4f}")
print(after_pretrain_eval_results)

After pretraining loss: 16.2281
{'eval_loss': 16.22808074951172, 'eval_runtime': 9.4498, 'eval_samples_per_second': 91.642, 'eval_steps_per_second': 2.963, 'epoch': 0.81}


In [23]:
pretrain_model.push_adapter_to_hub(
    pretrained_adapter_hub_name,
    pretrained_adapter_name,
    datasets_tag=pretrain_dataset_name
)

pytorch_model_head.bin:   0%|          | 0.00/157M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_adapter.bin:   0%|          | 0.00/3.60M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/BigTMiami/AA_seq_bn_P_micro/commit/c864dc06dce5acfb2c2d5f3c89829a96fd13718b', commit_message='Upload model', commit_description='', oid='c864dc06dce5acfb2c2d5f3c89829a96fd13718b', pr_url=None, pr_revision=None, pr_num=None)

In [24]:
import gc

print_gpu_utilization()

del pretraining_data_collator
del pretrain_train_dataset
del pretrain_eval_dataset
del pretrain_model
del pretrain_trainer
gc.collect()

print_gpu_utilization()

GPU 27307 MB
GPU 27307 MB


# Classification - Shared Setup

In [25]:
# THIS MUST BE SET TO MATCH  classification_adapter_type
from adapters import SeqBnConfig

classification_adapter_config = SeqBnConfig()

In [26]:
print(classification_adapter_config.__class__.__name__)
print(classification_adapter_type)

SeqBnConfig
seq_bn


In [27]:
from transformers import DataCollatorWithPadding

classification_data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Classification

In [28]:
from datasets import load_dataset

classification_train_dataset, classification_eval_dataset= load_dataset(classification_dataset_name, split=[classification_train_split,classification_eval_split])

print(classification_train_dataset)
print(classification_eval_dataset)

Downloading readme:   0%|          | 0.00/613 [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/40.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.82M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 1153
})
Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 5000
})


In [29]:
from transformers import RobertaConfig
from adapters import AutoAdapterModel

config = RobertaConfig.from_pretrained("roberta-base")
classficiation_no_pretrain_model = AutoAdapterModel.from_pretrained(
    "roberta-base",
    config=config,
)

Some weights of RobertaAdapterModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['heads.default.3.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# Add a new adapter
classficiation_no_pretrain_model.add_adapter(classification_adapter_name, config=classification_adapter_config)

# Add head for classification modeling
classficiation_no_pretrain_model.add_classification_head(
    classification_adapter_name,
    num_labels=2,
    id2label={ 0: "unhelpful", 1: "helpful"})

# Set the adapter to be used for training
classficiation_no_pretrain_model.train_adapter(classification_adapter_name)

print(f"Classification Adapter Name: {classification_adapter_name}")

Classification Adapter Name: AA_seq_bn_C_micro_A


In [31]:
classficiation_no_pretrain_model.active_adapters

Stack[AA_seq_bn_C_micro_A]

In [32]:
summary = classficiation_no_pretrain_model.adapter_summary()
print(summary)

Name                     Architecture         #Param      %Param  Active   Train
--------------------------------------------------------------------------------
AA_seq_bn_C_micro_A      bottleneck          894,528       0.718       1       1
--------------------------------------------------------------------------------
Full model                               124,645,632     100.000               0


In [33]:
from transformers import TrainingArguments

classification_training_args = TrainingArguments(
    output_dir=classification_adapter_hub_name,
    learning_rate=1e-4,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Logging and eval during training settings
    logging_steps=500,
    evaluation_strategy="steps",
    eval_steps=1000,
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

In [34]:
from adapters import AdapterTrainer

classification_trainer = AdapterTrainer(
    model=classficiation_no_pretrain_model,
    args=classification_training_args,
    train_dataset=classification_train_dataset,
    eval_dataset=classification_eval_dataset,
    data_collator=classification_data_collator,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [35]:
classification_training_results = classification_trainer.train()
print(classification_training_results)

Step,Training Loss,Validation Loss


TrainOutput(global_step=219, training_loss=0.43193726561385204, metrics={'train_runtime': 55.6282, 'train_samples_per_second': 62.181, 'train_steps_per_second': 3.937, 'total_flos': 869421254940720.0, 'train_loss': 0.43193726561385204, 'epoch': 3.0})


In [36]:
classification_eval_results = classification_trainer.evaluate()
classification_eval_loss = classification_eval_results["eval_loss"]
classification_eval_f1 = classification_eval_results["eval_f1_macro"]
print(f"Classification no Pretraining Eval F1:{100.0 * classification_eval_f1:.2f}")
print(f"Classification no Pretraining Eval Loss: {classification_eval_loss:.2f}")
print(classification_eval_results)

Classification no Pretraining Eval F1:46.05
Classification no Pretraining Eval Loss: 0.38
{'eval_loss': 0.38456642627716064, 'eval_accuracy': 0.8534, 'eval_f1_macro': 0.46045106291140603, 'eval_runtime': 37.8037, 'eval_samples_per_second': 132.262, 'eval_steps_per_second': 8.28, 'epoch': 3.0}


In [37]:
classficiation_no_pretrain_model.push_adapter_to_hub(
    classification_adapter_hub_name,
    classification_adapter_name,
    datasets_tag=classification_dataset_name
)
print(f"Pushed {classification_adapter_hub_name}")

pytorch_model_head.bin:   0%|          | 0.00/2.37M [00:00<?, ?B/s]

pytorch_adapter.bin:   0%|          | 0.00/3.60M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

Pushed AA_seq_bn_C_micro


In [38]:
import gc

print_gpu_utilization()

del classficiation_no_pretrain_model
del classification_trainer
gc.collect()

print_gpu_utilization()

GPU 9987 MB
GPU 9987 MB


# Classification from Pre-training

In [39]:
from transformers import RobertaConfig
from adapters import AutoAdapterModel

config = RobertaConfig.from_pretrained("roberta-base")
classficiation_from_pretrain_model = AutoAdapterModel.from_pretrained(
    "roberta-base",
    config=config,
)

Some weights of RobertaAdapterModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['heads.default.3.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
# Load pre-trained adapter
load_name = f"BigTMiami/{pretrained_adapter_hub_name}"

# Load Pretrained adapter without head
loaded_adapter_name = classficiation_from_pretrain_model.load_adapter(load_name, with_head=False, set_active=True)

print(f"Loaded Pretrain Adapter Name: {load_name}")

(…)b/master/dist/v2/index/roberta-base.json:   0%|          | 0.00/540 [00:00<?, ?B/s]

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

pytorch_adapter.bin:   0%|          | 0.00/3.60M [00:00<?, ?B/s]

head_config.json:   0%|          | 0.00/421 [00:00<?, ?B/s]

pytorch_model_head.bin:   0%|          | 0.00/157M [00:00<?, ?B/s]

Loaded Pretrain Adapter Name: BigTMiami/AA_seq_bn_P_micro


In [41]:
summary = classficiation_from_pretrain_model.adapter_summary()
print(summary)

Name                     Architecture         #Param      %Param  Active   Train
--------------------------------------------------------------------------------
AA_seq_bn_P_micro_A      bottleneck          894,528       0.718       1       1
--------------------------------------------------------------------------------
Full model                               124,645,632     100.000               1


In [42]:
# Add a new adapter
classficiation_from_pretrain_model.add_adapter(classification_adapter_after_pretrained_name, config=classification_adapter_config)

# Add head for classification modeling
classficiation_from_pretrain_model.add_classification_head(
    classification_adapter_after_pretrained_name,
    num_labels=2,
    id2label={ 0: "unhelpful", 1: "helpful"})

# Set the adapter to be used for training
classficiation_from_pretrain_model.train_adapter(classification_adapter_after_pretrained_name)

print(f"Classification after Pretrain Adapter Name: {classification_adapter_after_pretrained_name}")

Classification after Pretrain Adapter Name: AA_seq_bn_P_micro_seq_bn_C_micro_A


In [43]:
summary = classficiation_from_pretrain_model.adapter_summary()
print(summary)

Name                     Architecture         #Param      %Param  Active   Train
--------------------------------------------------------------------------------
AA_seq_bn_P_micro_A      bottleneck          894,528       0.718       0       0
AA_seq_bn_P_micro_seq_bn_C_micro_Abottleneck          894,528       0.718       1       1
--------------------------------------------------------------------------------
Full model                               124,645,632     100.000               0


In [44]:
#Set pretrained active, but don't include it in the train command
classficiation_from_pretrain_model.set_active_adapters([loaded_adapter_name, classification_adapter_after_pretrained_name])

In [45]:
summary = classficiation_from_pretrain_model.adapter_summary()
print(summary)

Name                     Architecture         #Param      %Param  Active   Train
--------------------------------------------------------------------------------
AA_seq_bn_P_micro_A      bottleneck          894,528       0.718       1       0
AA_seq_bn_P_micro_seq_bn_C_micro_Abottleneck          894,528       0.718       1       1
--------------------------------------------------------------------------------
Full model                               124,645,632     100.000               0


In [46]:
from transformers import TrainingArguments

classification_from_pretrained_training_args = TrainingArguments(
    output_dir=classification_adapter_after_pretrained_hub_name,
    learning_rate=1e-4,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Logging and eval during training settings
    logging_steps=500,
    evaluation_strategy="steps",
    eval_steps=1000,
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

In [47]:
from adapters import AdapterTrainer

classification_from_pretrained_trainer = AdapterTrainer(
    model=classficiation_from_pretrain_model,
    args=classification_from_pretrained_training_args,
    train_dataset=classification_train_dataset,
    eval_dataset=classification_eval_dataset,
    data_collator=classification_data_collator,
    compute_metrics=compute_metrics,
)
print_gpu_utilization()

GPU 9987 MB


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [48]:
classification_from_pretrained_training_results = classification_from_pretrained_trainer.train()
print(classification_from_pretrained_training_results)
print_gpu_utilization()

Step,Training Loss,Validation Loss


TrainOutput(global_step=219, training_loss=0.4367390114422803, metrics={'train_runtime': 57.6404, 'train_samples_per_second': 60.01, 'train_steps_per_second': 3.799, 'total_flos': 878281590561840.0, 'train_loss': 0.4367390114422803, 'epoch': 3.0})
GPU 10305 MB


In [49]:
classification_from_pretrained_eval_results = classification_from_pretrained_trainer.evaluate()
classification_from_pretrained_eval_loss = classification_from_pretrained_eval_results["eval_loss"]
classification_from_pretrained_eval_f1 = classification_from_pretrained_eval_results["eval_f1_macro"]
print(f"Classification with Pretraining Eval F1:{100.0 * classification_from_pretrained_eval_f1:.2f}")
print(f"Classification with Pretraining Eval Loss: {classification_from_pretrained_eval_loss:.2f}")
print(classification_from_pretrained_eval_results)
print_gpu_utilization()

Classification with Pretraining Eval F1:46.05
Classification with Pretraining Eval Loss: 0.39
{'eval_loss': 0.3894331455230713, 'eval_accuracy': 0.8534, 'eval_f1_macro': 0.46045106291140603, 'eval_runtime': 39.194, 'eval_samples_per_second': 127.571, 'eval_steps_per_second': 7.986, 'epoch': 3.0}
GPU 10305 MB


In [50]:
classficiation_from_pretrain_model.push_adapter_to_hub(
    classification_adapter_after_pretrained_hub_name,
    classification_adapter_after_pretrained_name,
    datasets_tag=classification_dataset_name
)
print(f"Pushed {classification_adapter_after_pretrained_hub_name}")

pytorch_adapter.bin:   0%|          | 0.00/3.60M [00:00<?, ?B/s]

pytorch_model_head.bin:   0%|          | 0.00/2.37M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

Pushed AA_seq_bn_P_micro_seq_bn_C_micro


In [51]:
print_gpu_utilization()

GPU 10305 MB


In [52]:
end_time = time()
total_time = end_time - start_time

In [53]:
print("Summary")
print("----------------")
print(f"Pretraining Adapter Type:{pretraining_adapter_type}")
print(f"Pretrain Overall Train Percentage:{pretrain_percentage_final}")
print("")
print(f"Classification Adapter Type:{classification_adapter_type}")
print(f"Classification Train Percentage:{classfication_percentage_final}")
print("")
print("Running Time")
print("----------------")
print(f"Total Time: {total_time:.0f} seconds OR {total_time / 60.0 :.0f} minutes OR {total_time / 3600.0 :.1f} hours ")
print("")
print("Pre-training Loss Change - should be high because adapter is completely random")
print("----------------")
print(f"Before pretraining loss: {before_pretrain_eval_results_loss:.4f}")
print(f"After pretraining loss: {after_pretrain_eval_results_loss:.4f}")
print("")
print("Classification Loss Change - should not be that different because both are starting from a blank classifier")
print("----------------")
print(f"Classification no Pretraining Eval Loss: {classification_eval_loss:.4f}")
print(f"Classification with Pretraining Eval Loss: {classification_from_pretrained_eval_loss:.4f}")
print("")
print("Classification F1 Change - hopefully we get a few points improvement here!")
print("----------------")
print(f"Classification no Pretraining Eval F1:{100.0 * classification_eval_f1:.2f}")
print(f"Classification with Pretraining Eval F1:{100.0 * classification_from_pretrained_eval_f1:.2f}")

print("")
pretraining_f1_improvement = (100.0 * (  classification_from_pretrained_eval_f1 - classification_eval_f1))
print(f"Pretraining F1 Improvement: {pretraining_f1_improvement:.2f}")

Summary
----------------
Pretraining Adapter Type:seq_bn
Pretrain Overall Train Percentage:micro

Classification Adapter Type:seq_bn
Classification Train Percentage:micro

Running Time
----------------
Total Time: 405 seconds OR 7 minutes OR 0.1 hours 

Pre-training Loss Change - should be high because adapter is completely random
----------------
Before pretraining loss: 17.8105
After pretraining loss: 16.2281

Classification Loss Change - should not be that different because both are starting from a blank classifier
----------------
Classification no Pretraining Eval Loss: 0.3846
Classification with Pretraining Eval Loss: 0.3894

Classification F1 Change - hopefully we get a few points improvement here!
----------------
Classification no Pretraining Eval F1:46.05
Classification with Pretraining Eval F1:46.05

Pretraining F1 Improvement: 0.00


In [54]:
print("Disconnecting Session")
from google.colab import runtime
runtime.unassign()

Disconnecting Session
