# PEFT on LongT5

Extractive-Abstractive Model.
- Extractive: KMeans clustering per study with 5 clusters, then choosing the sentence closest to each cluster's centroid (no overlapping).
- Abstractive: LongT5 model originally trained for summarization (BookSum). Applied parameter-efficient fine-tuning (LoRA: Low Rank Adaptation) with r = 16.

Note: this notebook contains code for using bitsandbytes, a library for 4-bit/8-bit quantization for memory-efficient training. This was ultimately NOT used in the final model, as we learned that bitsandbytes was not working well with LongT5 (and we simply don't have time to debug!)

Inspiration code for LoRA and QLoRA (LoRA + bit quantization):
- https://www.philschmid.de/fine-tune-flan-t5-peft
- https://blog.lancedb.com/optimizing-llms-a-step-by-step-guide-to-fine-tuning-with-peft-and-qlora-22eddd13d25b


<a href="https://colab.research.google.com/github/AmirMoazzami/266_final_proj/blob/mk%2Fkmeans-extraction-conclusion-only/experiment_5_peft/peft_kmeans_longt5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!nvidia-smi

Mon Dec  4 16:45:26 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0    44W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
!pip install -q --upgrade git+https://github.com/huggingface/transformers
!pip install -q --upgrade git+https://github.com/huggingface/peft.git
!pip install -q bitsandbytes accelerate datasets tensorboardX loralib

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━

In [1]:
# Purpose of notebook: fine-tune LongT5 on exctracted sentences from studies, but using LoRA and bitsandbytes quantization

import os
import pickle
from pprint import pprint
import gc

import pandas as pd
from datasets import Dataset, load_dataset
from transformers import (
    LongT5ForConditionalGeneration,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
import bitsandbytes as bnb
import torch
import numpy as np

if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS device.")
    os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = "0.0"
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA device.")
    max_split_size_mb = 256  # Set the max_split_size_mb value (e.g., 512 MB)
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = f"max_split_size_mb:{max_split_size_mb}"
    os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
else:
    device = torch.device("cpu")
    print("MPS/CUDA not available. Using CPU.")

Using CUDA device.


In [2]:
# Load tokenizer and model
model_id = 'pszemraj/long-t5-tglobal-base-16384-book-summary'
# output_dir = "training_history"
output_dir = "/content/drive/MyDrive/266 final project/notebooks/peft_training_history"  # Colab

# extracted_file_path = '../experiment_1/biobert_extractive_only_training_dataset.csv.gz'
extracted_file_path = '/content/drive/MyDrive/266 final project/notebooks/biobert_extractive_only_training_dataset.csv.gz'  # Colab

# source_data_path = "data"
source_data_path = "/content/drive/MyDrive/266 final project/notebooks/peft_training_history/data"  # Colab
# source_data_path = "/content/drive/MyDrive/266 final project/notebooks/peft_training_history/data_1024"  # Colab

# longT5 max token length is 16384, let's 1/2 that
max_input_token_length = 8192
# max_input_token_length = 1024

# -------- END CONFIG ----------

tokenizer = AutoTokenizer.from_pretrained(model_id)
# label_pad_token_id = tokenizer.pad_token_id
label_pad_token_id = -100  # special label token that gets ignored in loss calculations

train_data_path = os.path.join(source_data_path, 'train_tokenized_dataset')
val_data_path = os.path.join(source_data_path, 'val_tokenized_dataset')

if os.path.exists(train_data_path) and os.path.exists(val_data_path):
    train_dataset = Dataset.load_from_disk(train_data_path)
    val_dataset = Dataset.load_from_disk(val_data_path)

else:
    ms2_dataset = load_dataset("allenai/mslr2022", "ms2", split="train")

    # Load your CSV file
    df = pd.read_csv(extracted_file_path, compression='gzip')

    # # ---- if full extracted data is not available yet:
    # all_extracted_summaries = []
    # for fpath in os.listdir('../experiment_1/biobert_extractive_only_training_dataset'):
    #     all_extracted_summaries.append(
    #         pickle.load(open(os.path.join('../experiment_1/biobert_extractive_only_training_dataset', fpath), 'rb'))
    #     )
    # df = pd.DataFrame(all_extracted_summaries)
    # # ----

    target_texts = ms2_dataset['target']
    input_texts = [
        df[df['review_id'] == int(i)]['summary'].tolist()[0] for i in ms2_dataset['review_id']
    ]
    dataset = Dataset.from_dict({'input_text': input_texts, 'target_text': target_texts})

    # Tokenize data
    def tokenize_function(examples):
        model_inputs = tokenizer(examples['input_text'], padding='max_length', truncation=True, max_length=max_input_token_length)
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(text_target=examples['target_text'], padding='max_length', truncation=True, max_length=256)
            labels["input_ids"] = [
                [(l if l != tokenizer.pad_token_id else label_pad_token_id) for l in label] for label in labels["input_ids"]
            ]
        model_inputs['labels'] = labels['input_ids']
        return model_inputs

    tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["input_text", "target_text"])
    print(f"Keys of tokenized dataset: {list(tokenized_datasets.features)}")

    # Split the dataset
    shuffle_dataset = tokenized_datasets.shuffle(seed=42)
    shuffle_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    train_dataset = shuffle_dataset.select(range(len(tokenized_datasets) * 8 // 10))
    val_dataset = shuffle_dataset.select(range(len(tokenized_datasets) * 8 // 10, len(tokenized_datasets)))

    # save to disk for easy loading
    train_dataset.save_to_disk(train_data_path)
    val_dataset.save_to_disk(val_data_path)

print(train_dataset["input_ids"].shape)
print(val_dataset["input_ids"].shape)
type(train_dataset["input_ids"][0])

torch.Size([11350, 8192])
torch.Size([2838, 8192])


torch.Tensor

In [None]:
# ANALYSIS: what's the distribution of non-padding tokens in train_dataset["input_ids"]?
all_tokens = train_dataset["input_ids"].numpy()
non_pad_token_counts = np.array([len(np.where(tokens != 0)[0]) for tokens in all_tokens])
# distribution of non_pad_token_counts
display(pd.Series(non_pad_token_counts).describe())

# what's the 95% percentile?
print("95% percentile is", np.percentile(non_pad_token_counts, 95))

# which percentile is "8192 non-padding tokens" on?
print(
    "If we truncated input_ids to 8192, this is the percentile it'll be at (anything at a higher percentile could risk losing information):",
    (perc_8192 := pd.Series(non_pad_token_counts).rank(pct=True)[np.where(non_pad_token_counts <= 8192)[0]].max())
)
# confirm
print(np.percentile(non_pad_token_counts, perc_8192 * 100))

count    11350.000000
mean      3661.722291
std       2308.642882
min         71.000000
25%       1855.000000
50%       3050.000000
75%       5047.000000
max       8192.000000
dtype: float64

95% percentile is 8192.0
If we truncated input_ids to 8192, this is the percentile it'll be at (anything at a higher percentile could risk losing information): 0.9473568281938326
8192.0


In [3]:
# bitsandbytes
# Source notebooks:
# - https://colab.research.google.com/drive/1Vvju5kOyBsDr7RX_YAvp6ZsSOoSMjhKD?usp=sharing#scrollTo=E0Nl5mWL0k2T
# - https://colab.research.google.com/drive/1ge2F1QSK8Q7h0hn3YKuBCOAS0bK8E0wf?usp=sharing#scrollTo=HOWcL0LU3JYt

checkpoint_path = "longt5-qlora"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    # load_in_8bit=True,
)

base_model = LongT5ForConditionalGeneration.from_pretrained(model_id)
model = LongT5ForConditionalGeneration.from_pretrained(
    model_id,
    # quantization_config=bnb_config,  # enable when in CUDA
    # device_map="auto",
)

# # BUG: `model` has its embeddings reinitiated. Copy over from `base_model` but retain data type
# reinited_params = ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight']
# for param_name in reinited_params:
#     model_param = model.get_parameter(param_name)
#     base_model_param = base_model.get_parameter(param_name)
#     model_param.data = (
#         base_model_param.data
#         .to(model_param.dtype)  # or, comment out to remain in 32-bit for accuracy
#         .to(device)
#     )

# use PEFT LoRA

lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=16,
    lora_alpha=32,
    # target_modules=["q", "v", "k"],
    target_modules=["q", "v"],
    # target_modules=["q"],
    layers_to_transform=list(range(0, 12)),  # 11 is max layer
    lora_dropout=0.05,
    bias="none",
)
model.gradient_checkpointing_enable()
# model = prepare_model_for_kbit_training(model)  # enable for 4bit or 8bit quantization
model.enable_input_require_grads()
model = get_peft_model(model, lora_config)
# Fix from this GitHub issue: https://github.com/huggingface/peft/issues/522#issuecomment-1705989330
model.base_model.model.encoder.enable_input_require_grads()
model.base_model.model.decoder.enable_input_require_grads()

model.train()
model.print_trainable_parameters()

# Training arguments
logpath = os.path.join(output_dir, checkpoint_path, "logs")

training_args = Seq2SeqTrainingArguments(
    output_dir=os.path.join(output_dir, checkpoint_path),
    evaluation_strategy="steps",  # alternatively, "epoch"
    logging_strategy="steps",
    learning_rate=1e-3,
    logging_dir=logpath,
    report_to="tensorboard",
    save_strategy="steps",
    fp16=False,
    # predict_with_generate=True,

    # FOR REAL TRAINING
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # auto_find_batch_size=True,
    eval_steps=200,
    logging_steps=100,
    save_steps=100,
    log_level="info",

    # FOR DEBUGGING
    # num_train_epochs=1,
    # per_device_train_batch_size=1,
    # per_device_eval_batch_size=1,
    # max_steps=20,
    # eval_steps=2,
    # logging_steps=2,  # should match eval_steps
    # save_steps=4,  # includes train loss metric
    # log_level="debug",

    # FOR 4BIT OR 8BIT QUANTIZATION
    # fp16=True,
    # optim="paged_adamw_8bit",  # default: adamw_torch
)

print("Tensorboard log path:", logpath)
print("run this in terminal: tensorboard --logdir", logpath)

# Initialize Trainer
model.config.use_cache = False

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    # model=model_id,
    # label_pad_token_id=label_pad_token_id,
    # pad_to_multiple_of=8,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset.shuffle(seed=42).select(range(200)),
    # eval_dataset=val_dataset.select(range(10, 20)),  # for debugging
    data_collator=data_collator,
    tokenizer=tokenizer,
)


trainable params: 1,769,472 || all params: 249,356,928 || trainable%: 0.7096141319161583
Tensorboard log path: /content/drive/MyDrive/266 final project/notebooks/peft_training_history/longt5-qlora/logs
run this in terminal: tensorboard --logdir /content/drive/MyDrive/266 final project/notebooks/peft_training_history/longt5-qlora/logs


In [4]:
base_model.get_parameter("encoder.embed_tokens.weight")

Parameter containing:
tensor([[-0.5561,  0.4233,  0.8544,  ..., -0.9618,  0.6647,  0.9398],
        [ 0.4269,  1.6681,  4.5766,  ..., -2.2274, -0.5151,  2.1782],
        [-5.4195, -2.4177, -0.8740,  ..., -0.2788, -1.3139, -1.5880],
        ...,
        [ 1.5533,  0.5635,  1.6218,  ...,  1.9036,  0.7348,  0.1447],
        [ 0.2494,  0.8528, -0.6396,  ...,  0.1166, -1.1269,  0.8604],
        [ 0.8795, -0.3369, -1.7056,  ...,  0.4987,  1.2487,  0.6472]],
       requires_grad=True)

In [5]:
model.get_parameter("encoder.embed_tokens.weight")

Parameter containing:
tensor([[-0.5561,  0.4233,  0.8544,  ..., -0.9618,  0.6647,  0.9398],
        [ 0.4269,  1.6681,  4.5766,  ..., -2.2274, -0.5151,  2.1782],
        [-5.4195, -2.4177, -0.8740,  ..., -0.2788, -1.3139, -1.5880],
        ...,
        [ 1.5533,  0.5635,  1.6218,  ...,  1.9036,  0.7348,  0.1447],
        [ 0.2494,  0.8528, -0.6396,  ...,  0.1166, -1.1269,  0.8604],
        [ 0.8795, -0.3369, -1.7056,  ...,  0.4987,  1.2487,  0.6472]],
       device='cuda:0')

In [6]:
print(model.get_parameter("encoder.block.11.layer.0.TransientGlobalSelfAttention.q.lora_A.default.weight"))
print(model.get_parameter("encoder.block.11.layer.0.TransientGlobalSelfAttention.q.lora_A.default.weight").shape)

Parameter containing:
tensor([[ 0.0092,  0.0045,  0.0233,  ..., -0.0029, -0.0240, -0.0316],
        [-0.0245,  0.0008, -0.0296,  ...,  0.0352,  0.0221,  0.0048],
        [ 0.0348,  0.0140, -0.0356,  ..., -0.0293,  0.0241,  0.0042],
        ...,
        [-0.0112, -0.0082, -0.0025,  ..., -0.0095, -0.0045,  0.0197],
        [-0.0340,  0.0227, -0.0244,  ...,  0.0126, -0.0216, -0.0190],
        [-0.0286, -0.0319,  0.0334,  ...,  0.0159,  0.0300,  0.0041]],
       device='cuda:0', requires_grad=True)
torch.Size([16, 768])


In [7]:
print(model.get_parameter("encoder.block.11.layer.0.TransientGlobalSelfAttention.q.lora_B.default.weight"))
print(model.get_parameter("encoder.block.11.layer.0.TransientGlobalSelfAttention.q.lora_B.default.weight").shape)

Parameter containing:
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0', requires_grad=True)
torch.Size([768, 16])


In [16]:
print(train_dataset["labels"].device)
model.device

cpu


device(type='cuda', index=0)

In [28]:
# # to reset memory
# del train_dataset, val_dataset, tokenizer
# del model, base_model, data_collator, trainer
# gc.collect()
# torch.cuda.empty_cache()  # Colab
# # torch.mps.empty_cache()  # MPS

In [15]:
# try inferring for a single example
id_to_choose = 1
base_model = base_model.to(device)
inputs = train_dataset[id_to_choose: id_to_choose + 1]
output = base_model.generate(
    input_ids=inputs["input_ids"].to(device),
    attention_mask=inputs["attention_mask"].to(device),
    max_new_tokens=128, num_beams=4,
)
pprint(tokenizer.decode(output[0], skip_special_tokens=True))
pprint(tokenizer.decode(train_dataset[id_to_choose]['labels'][train_dataset[id_to_choose]['labels']!=label_pad_token_id], skip_special_tokens=True))

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "early_stopping": true,
  "encoder_no_repeat_ngram_size": 4,
  "eos_token_id": 1,
  "length_penalty": 0.8,
  "max_length": 512,
  "min_length": 8,
  "no_repeat_ngram_size": 3,
  "num_beams": 2,
  "pad_token_id": 0,
  "repetition_penalty": 3.5
}

  true_block_ends = torch.logical_and(block_ends, block_ids >= 0)
  local_attention_mask = torch.logical_and(_blocked_attention_mask, _3blocked_attention_mask)


('In this paper, the authors describe a study of safety and efficacy of two '
 'different local anesthesias in patients with hypertension. The aim of the '
 'study is to determine the effects of these two different anestheses on blood '
 'pressure before and after restorative tooth extraction. A total of sixty-two '
 'patients are included in this study. Sixty were assigned to receive either 2 '
 'p.Liocaine or 1 % Licocaine for local analization. After a single tooth '
 'extraction, both groups showed similar changes in heartbeat and blood '
 'pressure.')
('The most frequent complications in cardiovascular compromised patients after '
 'dental local anaesthesia with a vasoconstrictor agent were disclosed in ECG '
 'arrhythmias. Most of these disclosed arrhythmias were clinical ly '
 'insignificant. The use of  4 ampules of lignocaine with epinephrine 1:100000 '
 'as a dental anaesthetic seems to be relatively safe for cardiovascular '
 'compromised patients')


In [31]:
# try inferring for a single example
id_to_choose = 1
inputs = train_dataset[id_to_choose: id_to_choose + 1]
output = trainer.model.generate(
    input_ids=inputs["input_ids"].to(device),
    attention_mask=inputs["attention_mask"].to(device),
    max_new_tokens=128, num_beams=4,
)
pprint(tokenizer.decode(output[0], skip_special_tokens=True))
pprint(tokenizer.decode(train_dataset[id_to_choose]['labels'][train_dataset[id_to_choose]['labels']!=label_pad_token_id], skip_special_tokens=True))



('A prospective open-based, single-exaggerated study evaluating safety and '
 'effusiveness of local analin in the treatment of patients with severe '
 'hypertension. In this paper, blood pressures were measured after tooth '
 'extraction by a method employing two different concentrations of '
 'lignocenelilicous drugs: one at a 0.1ug / mL and the other as soon as the '
 'patient had been removed from the room. The results showed that there was no '
 'significant change in blood pressure or pulse rate between the two control '
 'groups.')
('The most frequent complications in cardiovascular compromised patients after '
 'dental local anaesthesia with a vasoconstrictor agent were disclosed in ECG '
 'arrhythmias. Most of these disclosed arrhythmias were clinical ly '
 'insignificant. The use of  4 ampules of lignocaine with epinephrine 1:100000 '
 'as a dental anaesthetic seems to be relatively safe for cardiovascular '
 'compromised patients')


In [78]:
call_outputs = model(
    inputs["input_ids"].to(device),
    attention_mask=inputs["attention_mask"].to(device),
    labels=train_dataset[id_to_choose]['labels'][train_dataset[id_to_choose]['labels']!=label_pad_token_id].unsqueeze(0).to(device),
)
print("Without padding tokens")
print(call_outputs.loss)
print(call_outputs.logits)

Without padding tokens
tensor(3.7706, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([[[-22.0120,  -7.3763,  -6.8542,  ..., -21.5278, -21.9906, -21.9159],
         [-28.0730,  -6.6089,  -6.8185,  ..., -27.5369, -28.1858, -27.8626],
         [-30.6885,  -6.5421, -10.9930,  ..., -30.0147, -30.4963, -30.3102],
         ...,
         [-28.4630,  -3.2069,  -7.9613,  ..., -27.7613, -28.3718, -28.3400],
         [-29.3955,  -1.9470,  -4.6935,  ..., -28.8900, -29.5536, -29.4921],
         [-29.7273,  -2.3630,  -7.3372,  ..., -29.1836, -29.8692, -29.5749]]],
       device='cuda:0', grad_fn=<UnsafeViewBackward0>)


In [79]:
call_outputs = model(
    inputs["input_ids"].to(device),
    attention_mask=inputs["attention_mask"].to(device),
    labels=train_dataset[id_to_choose]['labels'].unsqueeze(0).to(device),
)
print("With padding tokens in labels")
print(call_outputs.loss)
print(call_outputs.logits)

With padding tokens in labels
tensor(3.7639, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([[[-17.5048,  -3.3334,  -4.8966,  ..., -17.3239, -17.5362, -17.5236],
         [-29.3313,  -7.5968,  -6.0506,  ..., -28.8467, -29.3566, -29.1388],
         [-33.3023, -11.5635, -13.7315,  ..., -32.7689, -33.4021, -33.0907],
         ...,
         [-19.4768,   1.2223,  -2.6142,  ..., -19.1825, -19.5845, -19.5321],
         [-20.8250,  -2.5398,  -5.6711,  ..., -20.4851, -20.8054, -20.7826],
         [-15.5038,  -1.7199,  -3.8062,  ..., -15.2332, -15.5030, -15.4429]]],
       device='cuda:0', grad_fn=<UnsafeViewBackward0>)


### Some diagnostics

In [17]:
for name, param in trainer.model.named_parameters():
    print(name, param.requires_grad)

base_model.model.shared.weight False
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.q.base_layer.weight False
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.q.lora_A.default.weight True
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.q.lora_B.default.weight True
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.k.weight False
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.v.base_layer.weight False
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.v.lora_A.default.weight True
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.v.lora_B.default.weight True
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.o.weight False
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.relative_attention_bias.weight False
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.global_relative_attention_bi

In [18]:
trainer.model

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): LongT5ForConditionalGeneration(
      (shared): Embedding(32128, 768)
      (encoder): LongT5Stack(
        (embed_tokens): Embedding(32128, 768)
        (block): ModuleList(
          (0): LongT5Block(
            (layer): ModuleList(
              (0): LongT5LayerTransientGlobalSelfAttention(
                (TransientGlobalSelfAttention): LongT5TransientGlobalAttention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=16, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=16, out_features=768, bias=False)
                    )
   

### Training

In [8]:
# (If needed) Load model from checkpoint
latest_checkpoint = max([int(f.split('-')[1]) for f in os.listdir(os.path.join(output_dir, checkpoint_path)) if f.startswith('checkpoint')])
if latest_checkpoint:
    resume_from_checkpoint = os.path.join(output_dir, checkpoint_path, f"checkpoint-{latest_checkpoint}")
    print("Resuming from checkpoint:", resume_from_checkpoint)
else:
    resume_from_checkpoint = None

Resuming from checkpoint: /content/drive/MyDrive/266 final project/notebooks/peft_training_history/longt5-qlora/checkpoint-2750


In [9]:
# Train the model
trainer.train(resume_from_checkpoint=True)

Loading model from /content/drive/MyDrive/266 final project/notebooks/peft_training_history/longt5-qlora/checkpoint-2750.
***** Running training *****
  Num examples = 11,350
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5,676
  Number of trainable parameters = 1,769,472
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 1
  Continuing training from global step 2750
  Will skip the first 1 epochs then the first 1331 batches in the first epoch.
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  true_block_ends = torch.logical_and(block_ends, block_ids >= 0)
  local_attention_mask = torch.logical_and(_blocked_attention_m

Step,Training Loss,Validation Loss
3000,2.8431,2.524143
4000,2.8195,2.502765
5000,2.7737,2.488067


  true_block_ends = torch.logical_and(block_ends, block_ids >= 0)
  local_attention_mask = torch.logical_and(_blocked_attention_mask, _3blocked_attention_mask)
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/266 final project/notebooks/peft_training_history/longt5-qlora/checkpoint-3000
tokenizer config file saved in /content/drive/MyDrive/266 final project/notebooks/peft_training_history/longt5-qlora/checkpoint-3000/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/266 final project/notebooks/peft_training_history/longt5-qlora/checkpoint-3000/special_tokens_map.json
  true_block_ends = torch.logical_and(block_ends, block_ids >= 0)
  local_attention_mask = torch.logical_and(_blocked_attention_mask, _3blocked_attention_mask)
Saving model checkpoint to /content/drive/MyDrive/266 final project/notebooks/peft_training_history/longt5-qlora/checkpoint-3250
tokenizer config file saved in /content/dri

TrainOutput(global_step=5676, training_loss=1.4375211676717896, metrics={'train_runtime': 18559.6412, 'train_samples_per_second': 2.446, 'train_steps_per_second': 0.306, 'total_flos': 4.516809378103296e+17, 'train_loss': 1.4375211676717896, 'epoch': 4.0})

In [13]:
# evaluate on custom slice of train dataset
trainer.evaluate(train_dataset.select(range(0, 10)))

***** Running Evaluation *****
  Num examples = 10
  Batch size = 8
  true_block_ends = torch.logical_and(block_ends, block_ids >= 0)
  local_attention_mask = torch.logical_and(_blocked_attention_mask, _3blocked_attention_mask)


  true_block_ends = torch.logical_and(block_ends, block_ids >= 0)
  local_attention_mask = torch.logical_and(_blocked_attention_mask, _3blocked_attention_mask)


{'eval_loss': 2.6070263385772705,
 'eval_runtime': 2.2912,
 'eval_samples_per_second': 4.365,
 'eval_steps_per_second': 0.873,
 'epoch': 2.0}

In [47]:
# view results
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 10
  Batch size = 1


{'eval_loss': 4.012439727783203,
 'eval_runtime': 11.0355,
 'eval_samples_per_second': 0.906,
 'eval_steps_per_second': 0.906,
 'epoch': 0.0}

In [15]:
print(model.get_parameter("encoder.block.11.layer.0.TransientGlobalSelfAttention.q.lora_A.default.weight"))
print(model.get_parameter("encoder.block.11.layer.0.TransientGlobalSelfAttention.q.lora_A.default.weight").shape)

Parameter containing:
tensor([[-0.0197,  0.0322, -0.0018,  ...,  0.0370, -0.0552,  0.0419],
        [ 0.0526, -0.0724, -0.0003,  ...,  0.0095, -0.0972,  0.0021],
        [ 0.0086,  0.0318,  0.0109,  ...,  0.0422,  0.0820,  0.0055],
        ...,
        [-0.0266,  0.0364,  0.0284,  ...,  0.0088,  0.0610,  0.0288],
        [ 0.0816, -0.0670,  0.0038,  ...,  0.0564, -0.0580, -0.0146],
        [-0.0146,  0.0368, -0.0205,  ..., -0.0566,  0.0114, -0.0238]],
       device='cuda:0', requires_grad=True)
torch.Size([16, 768])


In [16]:
print(model.get_parameter("encoder.block.11.layer.0.TransientGlobalSelfAttention.q.lora_B.default.weight"))
print(model.get_parameter("encoder.block.11.layer.0.TransientGlobalSelfAttention.q.lora_B.default.weight").shape)

Parameter containing:
tensor([[-0.0539, -0.0352,  0.0256,  ...,  0.0116, -0.0168, -0.0118],
        [ 0.0160, -0.0318,  0.0142,  ...,  0.0199, -0.0460, -0.0028],
        [ 0.0271, -0.0188,  0.0189,  ...,  0.0334, -0.0603, -0.0007],
        ...,
        [ 0.0061,  0.0071,  0.0259,  ..., -0.0034,  0.0119,  0.0226],
        [-0.0341, -0.0029, -0.0074,  ..., -0.0272,  0.0215,  0.0012],
        [-0.0263, -0.0530, -0.0163,  ..., -0.0220,  0.0195,  0.0521]],
       device='cuda:0', requires_grad=True)
torch.Size([768, 16])


In [19]:
# try inferring for a single example
id_to_choose = 1
inputs = train_dataset[id_to_choose: id_to_choose + 1]
output = trainer.model.generate(
    input_ids=inputs["input_ids"].to(device),
    attention_mask=inputs["attention_mask"].to(device),
    max_new_tokens=128,
    num_beams=4,
)
pprint(tokenizer.decode(output[0], skip_special_tokens=True))
pprint(tokenizer.decode(train_dataset[id_to_choose]['labels'][train_dataset[id_to_choose]['labels']!=label_pad_token_id], skip_special_tokens=True))

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "early_stopping": true,
  "encoder_no_repeat_ngram_size": 4,
  "eos_token_id": 1,
  "length_penalty": 0.8,
  "max_length": 512,
  "min_length": 8,
  "no_repeat_ngram_size": 3,
  "num_beams": 2,
  "pad_token_id": 0,
  "repetition_penalty": 3.5,
  "use_cache": false
}

  true_block_ends = torch.logical_and(block_ends, block_ids >= 0)
  local_attention_mask = torch.logical_and(_blocked_attention_mask, _3blocked_attention_mask)


('There was no significant difference in blood pressure or pulse rate between '
 'the 2 groups. Conclusions This meta- analysis shows that there is no '
 'evidence to support the use of angiotenain 1:20,000 as a local anesthesia '
 'for tooth extraction')
('The most frequent complications in cardiovascular compromised patients after '
 'dental local anaesthesia with a vasoconstrictor agent were disclosed in ECG '
 'arrhythmias. Most of these disclosed arrhythmias were clinical ly '
 'insignificant. The use of  4 ampules of lignocaine with epinephrine 1:100000 '
 'as a dental anaesthetic seems to be relatively safe for cardiovascular '
 'compromised patients')


In [11]:
# try inferring for a single example
id_to_choose = 1
inputs = train_dataset[id_to_choose: id_to_choose + 1]
output = trainer.model.generate(
    input_ids=inputs["input_ids"].to(device),
    attention_mask=inputs["attention_mask"].to(device),
    max_new_tokens=128,
    num_beams=4,
)
pprint(tokenizer.decode(output[0], skip_special_tokens=True))
pprint(tokenizer.decode(train_dataset[id_to_choose]['labels'][train_dataset[id_to_choose]['labels']!=label_pad_token_id], skip_special_tokens=True))

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "early_stopping": true,
  "encoder_no_repeat_ngram_size": 4,
  "eos_token_id": 1,
  "length_penalty": 0.8,
  "max_length": 512,
  "min_length": 8,
  "no_repeat_ngram_size": 3,
  "num_beams": 2,
  "pad_token_id": 0,
  "repetition_penalty": 3.5,
  "use_cache": false
}

  true_block_ends = torch.logical_and(block_ends, block_ids >= 0)
  local_attention_mask = torch.logical_and(_blocked_attention_mask, _3blocked_attention_mask)


('The results of this meta- analysis showed that the use of a single dose of '
 'lidocoine was associated with an increase in heart rate. However, there were '
 'no significant differences between these two groups on pulse rates and blood '
 'pressure.')
('The most frequent complications in cardiovascular compromised patients after '
 'dental local anaesthesia with a vasoconstrictor agent were disclosed in ECG '
 'arrhythmias. Most of these disclosed arrhythmias were clinical ly '
 'insignificant. The use of  4 ampules of lignocaine with epinephrine 1:100000 '
 'as a dental anaesthetic seems to be relatively safe for cardiovascular '
 'compromised patients')


In [12]:
call_outputs = model(
    inputs["input_ids"].to(device),
    attention_mask=inputs["attention_mask"].to(device),
    # labels=train_dataset[id_to_choose]['labels'][train_dataset[id_to_choose]['labels']!=label_pad_token_id].unsqueeze(0),
    labels=train_dataset[id_to_choose]['labels'].unsqueeze(0).to(device),
)
print("With padding tokens in labels")
print(call_outputs.loss)
print(call_outputs.logits)

With padding tokens in labels
tensor(2.8820, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([[[-16.2487,  -0.6006,  -1.9300,  ..., -15.8971, -16.3333, -16.1956],
         [-19.4892,  -3.5078,  -2.8381,  ..., -19.1383, -19.5005, -19.3040],
         [-16.9491,  -3.9170,  -5.3435,  ..., -16.6350, -16.9932, -16.8125],
         ...,
         [-15.2111,   0.2993,  -1.5088,  ..., -14.8921, -15.2493, -15.1783],
         [ -9.2678,   1.8246,   0.2085,  ...,  -9.0547,  -9.3041,  -9.3049],
         [-13.5367,   0.1113,  -1.5535,  ..., -13.3507, -13.5380, -13.5523]]],
       device='cuda:0', grad_fn=<UnsafeViewBackward0>)


In [10]:
# Save model
# trainer.save_model(os.path.join(output_dir, "longt5-qlora-final"))
final_save_dir = "longt5-qlora-4-epochs-final"
trainer.model.save_pretrained(os.path.join(output_dir, final_save_dir))
tokenizer.save_pretrained(os.path.join(output_dir, final_save_dir))

tokenizer config file saved in /content/drive/MyDrive/266 final project/notebooks/peft_training_history/longt5-qlora-4-epochs-final/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/266 final project/notebooks/peft_training_history/longt5-qlora-4-epochs-final/special_tokens_map.json


('/content/drive/MyDrive/266 final project/notebooks/peft_training_history/longt5-qlora-4-epochs-final/tokenizer_config.json',
 '/content/drive/MyDrive/266 final project/notebooks/peft_training_history/longt5-qlora-4-epochs-final/special_tokens_map.json',
 '/content/drive/MyDrive/266 final project/notebooks/peft_training_history/longt5-qlora-4-epochs-final/tokenizer.json')