In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install -q peft bitsandbytes accelerate datasets tensorboardX loralib
!pip install -q --upgrade git+https://github.com/huggingface/transformers

In [9]:
# Purpose of notebook: fine-tune LongT5 on exctracted sentences from studies, but using LoRA and bitsandbytes quantization

import os
import pickle
from pprint import pprint
import gc

import pandas as pd
from datasets import Dataset, load_dataset
from transformers import (
    LongT5ForConditionalGeneration,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
import bitsandbytes as bnb
import torch
import numpy as np

if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS device.")
    os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = "0.0"
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA device.")
    max_split_size_mb = 256  # Set the max_split_size_mb value (e.g., 512 MB)
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = f"max_split_size_mb:{max_split_size_mb}"
    os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
else:
    device = torch.device("cpu")
    print("MPS/CUDA not available. Using CPU.")

Using MPS device.


In [14]:
# Load tokenizer and model
model_id = 'pszemraj/long-t5-tglobal-base-16384-book-summary'

# Local
output_dir = "training_history"
extracted_file_path = '../experiment_1/biobert_extractive_only_training_dataset.csv.gz'
source_data_path = "data"

# Colab
# output_dir = "/content/drive/MyDrive/266 final project/notebooks/peft_training_history"
# extracted_file_path = '/content/drive/MyDrive/266 final project/notebooks/biobert_extractive_only_training_dataset.csv.gz'
# source_data_path = "/content/drive/MyDrive/266 final project/notebooks/peft_training_history/data"

# longT5 max token length is 16384, let's 1/2 that
max_input_token_length = 8192

# END CONFIG

train_data_path = os.path.join(source_data_path, 'train_tokenized_dataset')
val_data_path = os.path.join(source_data_path, 'val_tokenized_dataset')

if os.path.exists(train_data_path) and os.path.exists(val_data_path):
    train_dataset = Dataset.load_from_disk(train_data_path)
    val_dataset = Dataset.load_from_disk(val_data_path)

else:
    ms2_dataset = load_dataset("allenai/mslr2022", "ms2", split="train")

    # Load your CSV file
    df = pd.read_csv(extracted_file_path, compression='gzip')

    # # ---- not available yet. in the meantime:
    # all_extracted_summaries = []
    # for fpath in os.listdir('../experiment_1/biobert_extractive_only_training_dataset'):
    #     all_extracted_summaries.append(
    #         pickle.load(open(os.path.join('../experiment_1/biobert_extractive_only_training_dataset', fpath), 'rb'))
    #     )
    # df = pd.DataFrame(all_extracted_summaries)
    # # ----

    target_texts = ms2_dataset['target']
    input_texts = [
        df[df['review_id'] == int(i)]['summary'].tolist()[0] for i in ms2_dataset['review_id']
    ]
    dataset = Dataset.from_dict({'input_text': input_texts, 'target_text': target_texts})

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Tokenize data
    def tokenize_function(examples):
        model_inputs = tokenizer(examples['input_text'], padding='max_length', truncation=True, max_length=max_input_token_length)
        labels = tokenizer(text_target=examples['target_text'], padding='max_length', truncation=True, max_length=256)
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]
        model_inputs['labels'] = labels['input_ids']
        return model_inputs

    tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["input_text", "target_text"])
    print(f"Keys of tokenized dataset: {list(tokenized_datasets.features)}")

    label_pad_token_id = -100
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model_id,
        label_pad_token_id=label_pad_token_id,
    )

    # Split the dataset
    shuffle_dataset = tokenized_datasets.shuffle(seed=42)
    shuffle_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    train_dataset = shuffle_dataset.select(range(len(tokenized_datasets) * 8 // 10))
    val_dataset = shuffle_dataset.select(range(len(tokenized_datasets) * 8 // 10, len(tokenized_datasets)))

    # save to disk for easy loading
    train_dataset.save_to_disk(train_data_path)
    val_dataset.save_to_disk(val_data_path)

print(train_dataset.num_rows)
print(val_dataset.num_rows)

type(shuffle_dataset["input_ids"][0])

  table = cls._concat_blocks(blocks, axis=0)


11350
2838


torch.Tensor

In [36]:
# ANALYSIS: what's the distribution of non-padding tokens in train_dataset["labels"]?
all_tokens = train_dataset["labels"].numpy()
non_pad_token_counts = np.array([len(np.where(tokens != label_pad_token_id)[0]) for tokens in all_tokens])
# distribution of non_pad_token_counts
display(pd.Series(non_pad_token_counts).describe())

# what's the 95% percentile?
print("95% percentile is", np.percentile(non_pad_token_counts, 95))

count    11350.000000
mean        86.599912
std         57.467615
min          9.000000
25%         44.000000
50%         70.000000
75%        114.000000
max        256.000000
dtype: float64

95% percentile is 214.0


In [37]:
# ANALYSIS: what's the distribution of non-padding tokens in train_dataset["input_ids"]?
all_tokens = train_dataset["input_ids"].numpy()
non_pad_token_counts = np.array([len(np.where(tokens != 0)[0]) for tokens in all_tokens])
# distribution of non_pad_token_counts
display(pd.Series(non_pad_token_counts).describe())

# what's the 95% percentile?
print("95% percentile is", np.percentile(non_pad_token_counts, 95))

# which percentile is "8192 non-padding tokens" on?
print(
    "If we truncated input_ids to 8192, this is the percentile it'll be at (anything at a higher percentile could risk losing information):",
    (perc_8192 := pd.Series(non_pad_token_counts).rank(pct=True)[np.where(non_pad_token_counts <= 8192)[0]].max())
)
# confirm
print(np.percentile(non_pad_token_counts, perc_8192 * 100))

count    11350.000000
mean      3661.722291
std       2308.642882
min         71.000000
25%       1855.000000
50%       3050.000000
75%       5047.000000
max       8192.000000
dtype: float64

95% percentile is 8192.0
If we truncated input_ids to 8192, this is the percentile it'll be at (anything at a higher percentile could risk losing information): 0.9473568281938326
8192.0


In [21]:
# bitsandbytes
# Source notebooks:
# - https://colab.research.google.com/drive/1Vvju5kOyBsDr7RX_YAvp6ZsSOoSMjhKD?usp=sharing#scrollTo=E0Nl5mWL0k2T
# - https://colab.research.google.com/drive/1ge2F1QSK8Q7h0hn3YKuBCOAS0bK8E0wf?usp=sharing#scrollTo=HOWcL0LU3JYt

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    # load_in_8bit=True,
)

base_model = LongT5ForConditionalGeneration.from_pretrained(model_id)
model = LongT5ForConditionalGeneration.from_pretrained(
    model_id,
    # quantization_config=bnb_config,  # enable when in CUDA
    # load_in_8bit=True,
    # device_map="auto",
)

# BUG: `model` has its embeddings reinitiated. Copy over from `base_model` but retain data type
# reinited_params = ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight']
# for param_name in reinited_params:
#     model_param = model.get_parameter(param_name)
#     base_model_param = base_model.get_parameter(param_name)
#     model_param.data = base_model_param.data.to(model_param.dtype)

# use PEFT LoRA

lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=16,
    lora_alpha=32,
    # target_modules=["q", "v", "k"],
    # target_modules=["q", "v"],
    target_modules=["q"],
    lora_dropout=0.05,
    bias="none",
)
model.gradient_checkpointing_enable()
# model = prepare_model_for_kbit_training(model)  # enable for 4bit or 8bit quantization
model.enable_input_require_grads()
model = get_peft_model(model, lora_config)
# Fix from this GitHub issue: https://github.com/huggingface/peft/issues/522#issuecomment-1705989330
model.base_model.model.encoder.enable_input_require_grads()
model.base_model.model.decoder.enable_input_require_grads()
model.train()
model.print_trainable_parameters()

# Training arguments
logpath = os.path.join(output_dir, "longt5-qlora", "logs")

# training_args = TrainingArguments(
#     output_dir=os.path.join(output_dir, "longt5-qlora"),
#     num_train_epochs=1,
#     per_device_train_batch_size=2,  # Adjust batch size according to memory constraints
#     evaluation_strategy="steps",  # or, "epoch" ?
#     save_steps=500,
#     eval_steps=500,
#     max_steps=100,  # For debugging
#     learning_rate=1e-4,
#     logging_dir=os.path.join(output_dir, "longt5-qlora", "logs"),
#     logging_steps=50,
#     # # for 4bit or 8bit quantization
#     # fp16=True,
#     # optim="paged_adamw_8bit",  # default: adamw_torch
# )

training_args = Seq2SeqTrainingArguments(
    output_dir=os.path.join(output_dir, "longt5-qlora"),
    evaluation_strategy="steps",  # alternatively, "epoch"
    learning_rate=1e-4,
    logging_dir=logpath,
    report_to="tensorboard",
    save_strategy="steps",

    # FOR REAL TRAINING
    # num_train_epochs=3,
    # auto_find_batch_size=True,
    # eval_steps=500,
    # logging_steps=100,
    # save_steps=200,

    # FOR DEBUGGING
    num_train_epochs=1,
    per_device_train_batch_size=2,
    eval_steps=2,
    max_steps=10,
    logging_steps=5,
    save_steps=5,

    # FOR 4BIT OR 8BIT QUANTIZATION
    # fp16=True,
    # optim="paged_adamw_8bit",  # default: adamw_torch
)

print("Tensorboard log path:", logpath)
print("run this in terminal: tensorboard --logdir", logpath)

# Initialize Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     tokenizer=tokenizer,
# )

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # eval_dataset=val_dataset,
    # select random subset of val_dataset for evaluation. shuffle deterministically
    eval_dataset=val_dataset.shuffle(seed=42).select(range(100)),
    tokenizer=tokenizer,
    data_collator=data_collator,
)

model.config.use_cache = False

trainable params: 884,736 || all params: 248,472,192 || trainable%: 0.3560704289999583
Tensorboard log path: training_history/longt5-qlora/logs
run this in terminal: tensorboard --logdir training_history/longt5-qlora/logs


In [None]:
# get dtype of all parameters, get unique types
param_dtypes = [param.dtype for param in model.parameters()]
unique_dtypes = np.unique(param_dtypes)

In [16]:
model.get_parameter("encoder.embed_tokens.weight")

Parameter containing:
tensor([[-0.5561,  0.4233,  0.8544,  ..., -0.9618,  0.6647,  0.9398],
        [ 0.4269,  1.6681,  4.5766,  ..., -2.2274, -0.5151,  2.1782],
        [-5.4195, -2.4177, -0.8740,  ..., -0.2788, -1.3139, -1.5880],
        ...,
        [ 1.5533,  0.5635,  1.6218,  ...,  1.9036,  0.7348,  0.1447],
        [ 0.2494,  0.8528, -0.6396,  ...,  0.1166, -1.1269,  0.8604],
        [ 0.8795, -0.3369, -1.7056,  ...,  0.4987,  1.2487,  0.6472]],
       device='mps:0')

In [17]:
model.get_parameter("encoder.block.0.layer.0.TransientGlobalSelfAttention.q.lora_B.default.weight")

Parameter containing:
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='mps:0', requires_grad=True)

In [45]:
# torch.cuda.empty_cache()  # Colab
torch.mps.empty_cache()  # MPS

model.device
# model.hf_device_map  # enable for 4bit or 8bit quantization

device(type='mps', index=0)

In [6]:
# try inferring for a single example
id_to_choose = 1
inputs = tokenizer(dataset[id_to_choose]['input_text'], return_tensors='pt').to(device)
labels = tokenizer(dataset[id_to_choose]['target_text'], return_tensors='pt').to(device)

In [None]:
# output = base_model.generate(**inputs, max_new_tokens=256, num_beams=4)
output = model.generate(**inputs, max_new_tokens=256, num_beams=4)
# output = trainer.model.generate(**inputs, max_new_tokens=256, num_beams=4)
pprint(tokenizer.decode(output[0], skip_special_tokens=True))
pprint(dataset[id_to_choose]["target_text"])

In [7]:
call_outputs = model(**inputs, labels=labels['input_ids'])



check_backward_validity
len(inputs) 11
tensor torch.Size([1, 1006, 768]) torch.float32 mps:0
tensor torch.Size([1, 1006]) torch.int64 mps:0
non-tensor NoneType
non-tensor NoneType
non-tensor NoneType
non-tensor NoneType
non-tensor NoneType
non-tensor NoneType
non-tensor NoneType
non-tensor bool
non-tensor bool
end check_backward_validity


  x = nn.functional.pad(x, pad=pad, mode="constant", value=pad_value)


check_backward_validity
len(inputs) 11
tensor torch.Size([1, 1006, 768]) torch.float32 mps:0
tensor torch.Size([1, 1006]) torch.int64 mps:0
tensor torch.Size([1, 8, 12, 128, 446]) torch.float32 mps:0
non-tensor NoneType
non-tensor NoneType
non-tensor NoneType
non-tensor NoneType
non-tensor NoneType
non-tensor NoneType
non-tensor bool
non-tensor bool
end check_backward_validity
check_backward_validity
len(inputs) 11
tensor torch.Size([1, 1006, 768]) torch.float32 mps:0
tensor torch.Size([1, 1006]) torch.int64 mps:0
tensor torch.Size([1, 8, 12, 128, 446]) torch.float32 mps:0
non-tensor NoneType
non-tensor NoneType
non-tensor NoneType
non-tensor NoneType
non-tensor NoneType
non-tensor NoneType
non-tensor bool
non-tensor bool
end check_backward_validity
check_backward_validity
len(inputs) 11
tensor torch.Size([1, 1006, 768]) torch.float32 mps:0
tensor torch.Size([1, 1006]) torch.int64 mps:0
tensor torch.Size([1, 8, 12, 128, 446]) torch.float32 mps:0
non-tensor NoneType
non-tensor NoneType




check_backward_validity
len(inputs) 11
tensor torch.Size([1, 228, 768]) torch.float32 mps:0
tensor torch.Size([1, 1, 228, 228]) torch.float32 mps:0
tensor torch.Size([1, 12, 228, 228]) torch.float32 mps:0
tensor torch.Size([1, 1006, 768]) torch.float32 mps:0
tensor torch.Size([1, 1, 1, 1006]) torch.float32 mps:0
tensor torch.Size([1, 12, 228, 1006]) torch.float32 mps:0
non-tensor NoneType
non-tensor NoneType
non-tensor NoneType
non-tensor bool
non-tensor bool
end check_backward_validity
check_backward_validity
len(inputs) 11
tensor torch.Size([1, 228, 768]) torch.float32 mps:0
tensor torch.Size([1, 1, 228, 228]) torch.float32 mps:0
tensor torch.Size([1, 12, 228, 228]) torch.float32 mps:0
tensor torch.Size([1, 1006, 768]) torch.float32 mps:0
tensor torch.Size([1, 1, 1, 1006]) torch.float32 mps:0
tensor torch.Size([1, 12, 228, 1006]) torch.float32 mps:0
non-tensor NoneType
non-tensor NoneType
non-tensor NoneType
non-tensor bool
non-tensor bool
end check_backward_validity
check_backward_v

In [8]:
call_outputs.loss

tensor(3.5540, device='mps:0')

In [147]:
one_sample_input_ids = train_dataset["input_ids"][:1].to(device)
one_sample_attention_mask = train_dataset["attention_mask"][:1].to(device)

torch.Size([1, 16384])

In [None]:
outputs = model(one_sample_input_ids, attention_mask=one_sample_attention_mask, return_dict=True)
logits = outputs.logits

In [6]:
for name, param in trainer.model.named_parameters():
    print(name, param.requires_grad)

base_model.model.shared.weight False
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.q.weight False
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.q.lora_A.default.weight True
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.q.lora_B.default.weight True
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.k.weight False
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.v.weight False
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.o.weight False
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.relative_attention_bias.weight False
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.global_relative_attention_bias.weight False
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.global_input_layer_norm.weight False
base_model.model.encoder.block.0.layer.0.layer_norm.weight False
base_model.model.encoder.block.0

In [9]:
def find_tensor_without_grad_fn(model):
    for name, param in model.named_parameters():
        if param.requires_grad and param.grad is None:
            print(f"Tensor without grad_fn: {name}")

find_tensor_without_grad_fn(trainer.model)

Tensor without grad_fn: base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.q.lora_A.default.weight
Tensor without grad_fn: base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.q.lora_B.default.weight
Tensor without grad_fn: base_model.model.encoder.block.1.layer.0.TransientGlobalSelfAttention.q.lora_A.default.weight
Tensor without grad_fn: base_model.model.encoder.block.1.layer.0.TransientGlobalSelfAttention.q.lora_B.default.weight
Tensor without grad_fn: base_model.model.encoder.block.2.layer.0.TransientGlobalSelfAttention.q.lora_A.default.weight
Tensor without grad_fn: base_model.model.encoder.block.2.layer.0.TransientGlobalSelfAttention.q.lora_B.default.weight
Tensor without grad_fn: base_model.model.encoder.block.3.layer.0.TransientGlobalSelfAttention.q.lora_A.default.weight
Tensor without grad_fn: base_model.model.encoder.block.3.layer.0.TransientGlobalSelfAttention.q.lora_B.default.weight
Tensor without grad_fn: base_model.model.encoder.block.4

In [None]:
# (If needed) Load model from checkpoint
latest_checkpoint = max([int(f.split('-')[1]) for f in os.listdir(os.path.join(output_dir, checkpoint_path)) if f.startswith('checkpoint')])
if latest_checkpoint:
    resume_from_checkpoint = os.path.join(output_dir, checkpoint_path, f"checkpoint-{latest_checkpoint}")
    print("Resuming from checkpoint:", resume_from_checkpoint)
else:
    resume_from_checkpoint = None

In [22]:
# Train the model
trainer.train()

  0%|          | 0/50 [00:00<?, ?it/s]



In [33]:
# view results
trainer.evaluate()



  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 11.186555862426758,
 'eval_runtime': 21.3928,
 'eval_samples_per_second': 1.215,
 'eval_steps_per_second': 0.187,
 'epoch': 3.0}

In [None]:
# Test the model on the same example
id_to_choose = 1
inputs = tokenizer(dataset[id_to_choose]['input_text'], return_tensors='pt').to(device)
output = model.generate(**inputs, max_new_tokens=256, num_beams=4)
pprint(tokenizer.decode(output[0], skip_special_tokens=True))
pprint(dataset[id_to_choose]["target_text"])

In [62]:
# Save model
trainer.save_model(os.path.join(output_dir, "longt5-qlora-final"))