In [None]:
!pip install -q peft bitsandbytes accelerate

In [10]:
# Purpose of notebook: fine-tune LongT5 on exctracted sentences from studies, but using LoRA and bitsandbytes quantization

import os
import pickle
from pprint import pprint

import pandas as pd
from datasets import Dataset, load_dataset
from transformers import (
    LongT5ForConditionalGeneration,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
)
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
import bitsandbytes as bnb
import torch
import numpy as np

if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS device.")
    os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = "0.0"
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA device.")
    max_split_size_mb = 256  # Set the max_split_size_mb value (e.g., 512 MB)
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = f"max_split_size_mb:{max_split_size_mb}"
    os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
else:
    device = torch.device("cpu")
    print("MPS/CUDA not available. Using CPU.")

Using MPS device.


In [53]:
# Load tokenizer and model
model_id = 'pszemraj/long-t5-tglobal-base-16384-book-summary'
output_dir = "training_history"

# bitsandbytes
# Source notebooks:
# - https://colab.research.google.com/drive/1Vvju5kOyBsDr7RX_YAvp6ZsSOoSMjhKD?usp=sharing#scrollTo=E0Nl5mWL0k2T
# - https://colab.research.google.com/drive/1ge2F1QSK8Q7h0hn3YKuBCOAS0bK8E0wf?usp=sharing#scrollTo=HOWcL0LU3JYt

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

base_model = LongT5ForConditionalGeneration.from_pretrained(
    model_id,
    # quantization_config=bnb_config,  # enable when in CUDA
).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# freeze the model
for param in base_model.parameters():
    param.requires_grad = False

# use PEFT

# Load the config
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.1,
    bias="none",
)
base_model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(base_model)
model = get_peft_model(base_model, peft_config)
model.print_trainable_parameters()

ms2_dataset = load_dataset("allenai/mslr2022", "ms2", split="train")

# Load your CSV file
df = pd.read_csv('../experiment_1/biobert_extractive_only_training_dataset.csv.gz', compression='gzip')

# # ---- not available yet. in the meantime:
# all_extracted_summaries = []
# for fpath in os.listdir('../experiment_1/biobert_extractive_only_training_dataset'):
#     all_extracted_summaries.append(
#         pickle.load(open(os.path.join('../experiment_1/biobert_extractive_only_training_dataset', fpath), 'rb'))
#     )
# df = pd.DataFrame(all_extracted_summaries)
# # ----

input_texts = df['summary'].tolist()

# target texts come from ms2 dataset. match on df's review_id for order
target_texts = [
    ms2_dataset[ms2_dataset['review_id'].index(str(i))]['target'] for i in df["review_id"]
]

# Tokenize data
def tokenize_function(examples):
    model_inputs = tokenizer(examples['input_text'], padding='max_length', truncation=True, max_length=512)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['target_text'], padding='max_length', truncation=True, max_length=128)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

dataset = Dataset.from_dict({'input_text': input_texts, 'target_text': target_texts})
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Split the dataset
shuffle_dataset = tokenized_datasets.shuffle(seed=42)
train_dataset = shuffle_dataset.select(range(len(tokenized_datasets) * 8 // 10))
val_dataset = shuffle_dataset.select(range(len(tokenized_datasets) * 8 // 10, len(tokenized_datasets)))

# Training arguments
training_args = TrainingArguments(
    output_dir=os.path.join(output_dir, "longt5-qlora"),
    num_train_epochs=3,
    per_device_train_batch_size=2,  # Adjust batch size according to memory constraints
    evaluation_strategy="steps",  # or, "epoch" ?
    save_steps=500,
    eval_steps=500,
    learning_rate=1e-4,
    logging_dir=os.path.join(output_dir, "longt5-qlora", "logs"),
    logging_steps=50,
    fp16=True,
    optim="paged_adamw_8bit",
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)
model.config.use_cache = False

trainable params: 884,736 || all params: 248,472,192 || trainable%: 0.3560704289999583


  table = cls._concat_blocks(blocks, axis=0)


In [58]:
model.device
# model.hf_device_map

device(type='mps', index=0)

In [55]:
# try inferring for a single example
id_to_choose = 1
inputs = tokenizer(dataset[id_to_choose]['input_text'], return_tensors='pt').to(device)
# output = base_model.generate(**inputs, max_new_tokens=256, num_beams=4)
output = model.generate(**inputs, max_new_tokens=256, num_beams=4)
# output = trainer.model.generate(**inputs, max_new_tokens=256, num_beams=4)
pprint(tokenizer.decode(output[0], skip_special_tokens=True))
pprint(dataset[id_to_choose]["target_text"])

('This study aims to assess the efficacy of an electric cigarette, or '
 'e-cigarette, in preventing and reducing smoking among adult men. The results '
 'suggest that this device may be useful for those who are not ready to quit '
 'but still want to reduce their cigarette consumption. In addition, it may '
 'help those who do not intend to quit because it can deliver large amounts of '
 'nicotine without causing side effects.')
('The use of the EC can reduce the number of cigarettes smoked and withdrawal '
 'symptoms , but the AEs reported are mainly related to a short period of use '
 '.')


In [31]:
# Train the model
trainer.train()

  0%|          | 0/156 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'train_runtime': 209.8985, 'train_samples_per_second': 1.486, 'train_steps_per_second': 0.743, 'train_loss': 13.549718612279648, 'epoch': 3.0}


TrainOutput(global_step=156, training_loss=13.549718612279648, metrics={'train_runtime': 209.8985, 'train_samples_per_second': 1.486, 'train_steps_per_second': 0.743, 'train_loss': 13.549718612279648, 'epoch': 3.0})

In [33]:
# view results
trainer.evaluate()



  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 11.186555862426758,
 'eval_runtime': 21.3928,
 'eval_samples_per_second': 1.215,
 'eval_steps_per_second': 0.187,
 'epoch': 3.0}

In [None]:
# Save model
trainer.save_model(os.path.join(output_dir, "longt5-qlora-final"))