# Import packages

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install -U adapter-transformers
! pip install evaluate
!pip install -U --user neptune transformers datasets evaluate torch scipy scikit-learn numpy

In [None]:
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, TrainerCallback, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AdapterTrainer, EvalPrediction
from datasets import load_dataset, concatenate_datasets, load_from_disk, load_metric, Dataset, ClassLabel
import numpy as np
import pandas as pd
import os
import neptune
from getpass import getpass

In [None]:
import os
from getpass import getpass
# Export the Neptune project name and API token as environment variables.
# The token is typed at the prompt, so it never appears in the notebook source.
project = 'xxx'
os.environ.update(
    {
        "NEPTUNE_API_TOKEN": getpass("Enter your Neptune API token: "),
        "NEPTUNE_PROJECT": project,
    }
)

Enter your Neptune API token: ··········


# Data Loading and Transformation

In [None]:
# Read df.parquet into a pandas DataFrame
df_total = pd.read_parquet(
    path='/content/drive/MyDrive/Dissertation/Data/df.parquet'
)

In [None]:
# One-shot sample: keep one randomly drawn row for every distinct `label_cat` value
df_label_unique_sample = df_total.groupby('label_cat', group_keys=False).apply(lambda group: group.sample(1))
# Create label mapping (label name <-> integer code) from the sampled rows.
# The code list is called `label_ids` rather than `id` so that the built-in id()
# is not shadowed for the rest of the kernel session.
label = list(df_label_unique_sample['label'])
label_ids = list(df_label_unique_sample['label_cat'])
label_to_id = dict(zip(label, label_ids))
id_to_label = dict(zip(label_ids, label))

In [None]:
# ClassLabel feature built from the label names collected in `label`
Mapping = ClassLabel(names=label)

# Read the train and test parquet files into one dataset object with two splits
dataset = load_dataset(
    "parquet",
    data_files={
        'train': '/content/drive/MyDrive/Dissertation/Data/train_df.parquet',
        'test': '/content/drive/MyDrive/Dissertation/Data/test_df.parquet',
    },
)

Downloading and preparing dataset parquet/default to /root/.cache/huggingface/datasets/parquet/default-c72183ae075bf112/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/default-c72183ae075bf112/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# For each split: drop the `__index_level_0__` column, then rename the text and
# label columns to the names used for modelling ("text" and "labels")
test_ds = (
    dataset['test']
    .remove_columns('__index_level_0__')
    .rename_column("processed_text", "text")
    .rename_column("label_cat", "labels")
)
train_ds = (
    dataset['train']
    .remove_columns('__index_level_0__')
    .rename_column("processed_text", "text")
    .rename_column("label_cat", "labels")
)

In [None]:
from transformers import AutoAdapterModel, list_adapters
# List the adapters available from source "ah" for bert-base-uncased and print,
# for each one, its id, model name and uploader
adapter_infos = list_adapters(source="ah", model_name="bert-base-uncased")
for adapter_info in adapter_infos:
    details = (
        ("Id:", adapter_info.adapter_id),
        ("Model name:", adapter_info.model_name),
        ("Uploaded by:", adapter_info.username),
    )
    for caption, value in details:
        print(caption, value)

https://raw.githubusercontent.com/Adapter-Hub/Hub/master/dist/v2/all.json not found in cache or force_download set to True, downloading to /content/~/.cache/torch/adapters/tmpu1h0ekkv


Downloading (…)ter/dist/v2/all.json:   0%|          | 0.00/12.8k [00:00<?, ?B/s]

storing https://raw.githubusercontent.com/Adapter-Hub/Hub/master/dist/v2/all.json in cache at ~/.cache/torch/adapters/c797b993ce20003bc43c23b8485a092ae307a73e9a448e6a8733d7ef4afc3bab.34e3ac3db1913ce095c9dbfd04e08383971a5667bf115770a137d2c730a8505f
creating metadata file for ~/.cache/torch/adapters/c797b993ce20003bc43c23b8485a092ae307a73e9a448e6a8733d7ef4afc3bab.34e3ac3db1913ce095c9dbfd04e08383971a5667bf115770a137d2c730a8505f


Id: @ukp/bert-base-uncased-csqa_pfeiffer
Model name: bert-base-uncased
Uploaded by: ukp
Id: @ukp/bert-base-uncased_nli_rte_houlsby
Model name: bert-base-uncased
Uploaded by: ukp
Id: @ukp/bert-base-uncased_nli_rte_pfeiffer
Model name: bert-base-uncased
Uploaded by: ukp
Id: @ukp/bert-base-uncased_sts_qqp_pfeiffer
Model name: bert-base-uncased
Uploaded by: ukp
Id: @ukp/bert-base-uncased-sick_pfeiffer
Model name: bert-base-uncased
Uploaded by: ukp
Id: @ukp/bert-base-uncased_sts_qqp_houlsby
Model name: bert-base-uncased
Uploaded by: ukp
Id: @ukp/bert-base-uncased_qa_squad2_pfeiffer
Model name: bert-base-uncased
Uploaded by: ukp
Id: @ukp/bert-base-uncased-ner-pfeiffer
Model name: bert-base-uncased
Uploaded by: ukp
Id: @ukp/bert-base-uncased_nli_multinli_pfeiffer
Model name: bert-base-uncased
Uploaded by: ukp
Id: @ukp/bert-base-uncased-hellaswag_pfeiffer
Model name: bert-base-uncased
Uploaded by: ukp
Id: @ukp/bert-base-uncased_lingaccept_cola_houlsby
Model name: bert-base-uncased
Uploaded by:

# Modeling

## Define generic functions

In [None]:
from transformers import DataCollatorWithPadding , DistilBertConfig, AutoModelForSequenceClassification, DistilBertModelWithHeads
# Function to transform and tokenize train and test data
def transform_token(train,test):
  """Tokenize both splits and build a fresh DistilBERT model with heads.

  Args:
    train: training split; must provide a 'text' column and a batched ``map``.
    test: evaluation split with the same layout as ``train``.

  Returns:
    A tuple ``(train_tokenized, test_tokenized, model)`` where the two datasets
    are the inputs with the tokenizer outputs added, and ``model`` is a
    ``DistilBertModelWithHeads`` loaded from 'distilbert-base-uncased' whose
    config has ``num_labels`` set to ``len(label)`` (``label`` is read from
    notebook scope).
  """
  model_name = 'distilbert-base-uncased' # Used for comparison purpose with the baseline model
  tokenizer_bert = AutoTokenizer.from_pretrained(model_name)

  # Encode the data; no padding here, only truncation
  def tokenize(batch):
    return tokenizer_bert(batch['text'], truncation=True)

  train_tokenized = train.map(tokenize, batched=True)
  test_tokenized = test.map(tokenize, batched=True)

  # Configuration
  config = DistilBertConfig.from_pretrained(
    model_name,
    num_labels=len(label),
  )
  # Model
  model = DistilBertModelWithHeads.from_pretrained(
    model_name,
    config=config
  )
  return train_tokenized, test_tokenized, model

## Baseline model

In [None]:
# Create the tokenized train/test datasets and a fresh model for the baseline training
train_baseline, test_baseline, model_baseline = transform_token(train=train_ds, test=test_ds)

In [None]:
# Add a new adapter
from transformers import AdapterConfig
model_baseline.add_adapter('baseline')
# Attach a classification head named like the adapter, with one output per label
model_baseline.add_classification_head("baseline", num_labels=len(label), id2label=id_to_label)
# Activate the 'baseline' adapter for training
model_baseline.train_adapter("baseline")

Adding adapter 'baseline'.
Adding head 'baseline' with config {'head_type': 'classification', 'num_labels': 31, 'layers': 2, 'activation_function': 'tanh', 'label2id': {'Tenant_determine': 0, 'Remain_review_date': 1, 'Rent_Review_method': 2, 'Payable_period': 3, 'Rent_review_upward': 4, 'yield_up': 5, 'Full_reinstatement': 6, 'Tenant_repair_obligation': 7, 'Tenant_repair_decorate': 8, 'Structural_alteration': 9, 'non_structural_alteration': 10, 'Parties': 11, 'Premise': 12, 'Current Tenant': 13, 'Term_period': 14, 'Use': 15, 'Annual_rent': 16, 'Rent_Commence_date': 17, 'underlet_part': 18, 'underlet_whole': 19, 'Act_1954': 20, 'payable_deduction': 21, 'suspension_rent': 22, 'assignment': 23, 'Commence_date': 24, 'VAT': 25, 'Garantor': 26, 'service_proportion': 27, 'service_landlord': 28, 'uninsured_risk': 29, 'service_cap': 30}, 'use_pooler': False, 'bias': True}.


In [None]:
import numpy as np
from transformers import TrainingArguments, AdapterTrainer, EvalPrediction
import evaluate
# Create a function to calculate Weighted f1 scores
f1_metric = evaluate.load("f1")
def compute_metrics(eval_pred):
    """Return the weighted F1 score for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class for each
    row is the index of its largest logit along axis 1.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return f1_metric.compute(
        predictions=predicted_classes,
        references=references,
        average="weighted",
    )

# Define functions to train the adapters
def train_adapter(train,test,model,output_directory,
                  learning_rate=1e-4,num_train_epochs=20,batch_size=16,
                  weight_decay=0.0,report_to="neptune"):
  """Train ``model`` with ``AdapterTrainer``, evaluating and saving every epoch.

  The keyword parameters default to the values that were previously hard-coded,
  so existing four-argument calls behave as before.

  Args:
    train: tokenized training dataset.
    test: tokenized dataset used for the per-epoch evaluation.
    model: model with the adapter setup to train already activated.
    output_directory: directory passed to ``TrainingArguments(output_dir=...)``.
    learning_rate: optimizer learning rate.
    num_train_epochs: number of training epochs.
    batch_size: per-device batch size used for both training and evaluation.
    weight_decay: weight decay passed to ``TrainingArguments``.
    report_to: logging integration(s) passed to ``TrainingArguments``.

  Returns:
    None. ``load_best_model_at_end=True`` is set on the training arguments.
  """
  training_args = TrainingArguments(
    output_dir=output_directory,
    overwrite_output_dir=True,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    num_train_epochs=num_train_epochs,
    evaluation_strategy = 'epoch',
    save_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    load_best_model_at_end=True,
    report_to = report_to,
    )
  # The collator pads each batch to its longest sequence (dynamic padding)
  tokenizer_bert = AutoTokenizer.from_pretrained('distilbert-base-uncased')
  data_collator = DataCollatorWithPadding(tokenizer=tokenizer_bert)
  trainer = AdapterTrainer(
    model=model,
    args= training_args,
    train_dataset=train,
    eval_dataset=test,
    compute_metrics=compute_metrics,
    data_collator = data_collator,
    )
  trainer.train()

In [None]:
# Train the baseline adapter model; checkpoints are written to the baseline folder
train_adapter(
    train=train_baseline,
    test=test_baseline,
    model=model_baseline,
    output_directory='/content/drive/MyDrive/Dissertation/adapter_transformer/baseline',
)

In [None]:
# Write the 'baseline' adapter to disk so the fusion section can load it again
model_baseline.save_adapter(
    "/content/drive/MyDrive/Dissertation/adapter_transformer/baseline",
    "baseline",
)

## Adapter fusion

In [None]:
# Tokenized train/test datasets and a fresh model for the adapter-fusion experiment
train_fusion, test_fusion, model_fusion = transform_token(train=train_ds, test=test_ds)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.26.1",
  "vocab_size": 30522
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapsh

In [None]:
# Load the pretrained adapters and a pretrained text classification adapter
# Both are loaded without their prediction heads (with_head=False).
model_fusion.load_adapter('/content/drive/MyDrive/Dissertation/adapter_transformer/baseline', load_as='baseline', with_head=False)
# NOTE(review): this checkpoint is named for bert-base-uncased, while model_fusion is
# built from distilbert-base-uncased in transform_token. The captured log for this cell
# reports "Some module weights could not be found in loaded weights file" for the
# 'multirc' modules, so the pretrained weights are presumably not being applied.
# TODO confirm and switch to an adapter trained for DistilBERT.
model_fusion.load_adapter("AdapterHub/bert-base-uncased-pf-multirc", load_as="multirc",source="hf",with_head=False)

Loading module configuration from /content/drive/MyDrive/Dissertation/adapter_transformer/baseline/adapter_config.json
Adding adapter 'baseline'.
Loading module weights from /content/drive/MyDrive/Dissertation/adapter_transformer/baseline/pytorch_adapter.bin


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

Loading module configuration from /root/.cache/huggingface/hub/models--AdapterHub--bert-base-uncased-pf-multirc/snapshots/734a29c11b74b21f40a0c1abc6962def7b084184/adapter_config.json
Adding adapter 'multirc'.
Loading module weights from /root/.cache/huggingface/hub/models--AdapterHub--bert-base-uncased-pf-multirc/snapshots/734a29c11b74b21f40a0c1abc6962def7b084184/pytorch_adapter.bin
Some module weights could not be found in loaded weights file: transformer.layer.0.output_adapters.adapters.multirc.adapter_down.0.weight, transformer.layer.0.output_adapters.adapters.multirc.adapter_down.0.bias, transformer.layer.0.output_adapters.adapters.multirc.adapter_up.weight, transformer.layer.0.output_adapters.adapters.multirc.adapter_up.bias, transformer.layer.1.output_adapters.adapters.multirc.adapter_down.0.weight, transformer.layer.1.output_adapters.adapters.multirc.adapter_down.0.bias, transformer.layer.1.output_adapters.adapters.multirc.adapter_up.weight, transformer.layer.1.output_adapters.a

'multirc'

In [None]:
# Add a fusion layer for all loaded adapters
from transformers.adapters.composition import Fuse
# Fuse the two loaded adapters and make the fused setup the active one
fusion_setup = Fuse("multirc", "baseline")
model_fusion.add_adapter_fusion(fusion_setup)
model_fusion.set_active_adapters(fusion_setup)
# Classification head for the target task, one output per label
model_fusion.add_classification_head("Fusion", num_labels=len(label), id2label=id_to_label)

Adding AdapterFusion 'multirc,baseline'.
Could not identify valid prediction head(s) from setup 'Fuse[multirc, baseline]'.
Adding head 'Fusion' with config {'head_type': 'classification', 'num_labels': 31, 'layers': 2, 'activation_function': 'tanh', 'label2id': {'Tenant_determine': 0, 'Remain_review_date': 1, 'Rent_Review_method': 2, 'Payable_period': 3, 'Rent_review_upward': 4, 'yield_up': 5, 'Full_reinstatement': 6, 'Tenant_repair_obligation': 7, 'Tenant_repair_decorate': 8, 'Structural_alteration': 9, 'non_structural_alteration': 10, 'Parties': 11, 'Premise': 12, 'Current Tenant': 13, 'Term_period': 14, 'Use': 15, 'Annual_rent': 16, 'Rent_Commence_date': 17, 'underlet_part': 18, 'underlet_whole': 19, 'Act_1954': 20, 'payable_deduction': 21, 'suspension_rent': 22, 'assignment': 23, 'Commence_date': 24, 'VAT': 25, 'Garantor': 26, 'service_proportion': 27, 'service_landlord': 28, 'uninsured_risk': 29, 'service_cap': 30}, 'use_pooler': False, 'bias': True}.


In [None]:
# Unfreeze and activate fusion setup
# The setup names the same two adapters ("multirc", "baseline") that were fused earlier.
adapter_setup = Fuse("multirc", "baseline")
model_fusion.train_adapter_fusion(adapter_setup)

In [None]:
# Train the adapter-fusion model; checkpoints are written to the fusion folder
train_adapter(
    train=train_fusion,
    test=test_fusion,
    model=model_fusion,
    output_directory='/content/drive/MyDrive/Dissertation/adapter_transformer/fusion',
)

In [None]:
# Save fusion adapter
# First the fusion layer for the "multirc,baseline" pair, then every loaded adapter.
# No trainer object is saved here.
model_fusion.save_adapter_fusion("/content/drive/MyDrive/Dissertation/adapter_transformer/fusion", "multirc,baseline")
# NOTE(review): the target folder is 'fusionr', not 'fusion' — possibly a typo; confirm
# before relying on this path when reloading the adapters.
model_fusion.save_all_adapters("/content/drive/MyDrive/Dissertation/adapter_transformer/fusionr")

Configuration saved in /content/drive/MyDrive/Dissertation/adapter_transformer/fusion/adapter_fusion_config.json
Module weights saved in /content/drive/MyDrive/Dissertation/adapter_transformer/fusion/pytorch_model_adapter_fusion.bin
Configuration saved in /content/drive/MyDrive/Dissertation/adapter_transformer/fusionr/baseline/adapter_config.json
Module weights saved in /content/drive/MyDrive/Dissertation/adapter_transformer/fusionr/baseline/pytorch_adapter.bin
Configuration saved in /content/drive/MyDrive/Dissertation/adapter_transformer/fusionr/multirc/adapter_config.json
Module weights saved in /content/drive/MyDrive/Dissertation/adapter_transformer/fusionr/multirc/pytorch_adapter.bin


## IA3

In [None]:
# Load DistilBERT with a sequence-classification head sized to the label set,
# passing both directions of the label mapping
model_bert_IA3 = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label),
    id2label=id_to_label,
    label2id=label_to_id,
)

In [None]:
from transformers.adapters import IA3Config
# Build an IA3 adapter configuration with the library defaults and register it
# on the model under the name "ia3_adapter"
config = IA3Config()
model_bert_IA3.add_adapter("ia3_adapter", config=config)

# Make "ia3_adapter" the active adapter setup
model_bert_IA3.set_active_adapters("ia3_adapter")

Adding adapter 'ia3_adapter'.


In [None]:
# Select 'ia3_adapter' as the adapter to be trained
model_bert_IA3.train_adapter('ia3_adapter')

In [None]:
import numpy as np
from transformers import TrainingArguments, AdapterTrainer, EvalPrediction
from transformers import AutoTokenizer, DataCollatorWithPadding

# The tokenizer, the collator and the tokenized splits are created here explicitly.
# Elsewhere `tokenizer_bert` and `data_collator` only exist as locals inside
# functions, and `train_ds` / `test_ds` are never tokenized at notebook scope,
# so without this the cell fails on a fresh kernel.
tokenizer_bert = AutoTokenizer.from_pretrained("distilbert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer_bert)  # Dynamic padding


def _tokenize_text(batch):
    """Tokenize the 'text' column of a batch, truncating over-long inputs."""
    return tokenizer_bert(batch["text"], truncation=True)


train_IA3 = train_ds.map(_tokenize_text, batched=True)
test_IA3 = test_ds.map(_tokenize_text, batched=True)

# Evaluate and checkpoint once per epoch
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/Dissertation/adapter_transformer/IA3',
    overwrite_output_dir=True,
    learning_rate=1e-4,
    weight_decay=0.01,
    num_train_epochs=6,
    evaluation_strategy = 'epoch',
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    load_best_model_at_end=True)

trainer_IA3 = AdapterTrainer(
    model=model_bert_IA3,
    args=training_args,
    train_dataset=train_IA3,
    eval_dataset=test_IA3,
    data_collator=data_collator,
    tokenizer = tokenizer_bert,
    compute_metrics=compute_metrics)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Run the IA3 training; the returned TrainOutput is displayed as the cell result
trainer_IA3.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 261
  Num Epochs = 6
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 102
  Number of trainable parameters = 642079
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,No log,3.309322,0.032727
2,No log,3.233877,0.032727
3,No log,3.178425,0.032727
4,No log,3.138346,0.032727
5,No log,3.113499,0.032727
6,No log,3.106144,0.032727


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 66
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/Dissertation/adapter_transformer/IA3/checkpoint-17
Configuration saved in /content/drive/MyDrive/Dissertation/adapter_transformer/IA3/checkpoint-17/ia3_adapter/adapter_config.json
Module weights saved in /content/drive/MyDrive/Dissertation/adapter_transformer/IA3/checkpoint-17/ia3_adapter/pytorch_adapter.bin
Configuration saved in /content/drive/MyDrive/Dissertation/adapter_transformer/IA3/checkpoint-17/ia3_adapter/head_config.json
Module weights saved in /content/drive/MyDrive/Dissertation/adapter_transformer/IA3/checkpoint-17/ia3_adapter/pytorch_model_head.bin
tokenizer config file saved in /content/dr

TrainOutput(global_step=102, training_loss=3.193887149586397, metrics={'train_runtime': 220.6254, 'train_samples_per_second': 7.098, 'train_steps_per_second': 0.462, 'total_flos': 11787713431656.0, 'train_loss': 3.193887149586397, 'epoch': 6.0})