In [None]:
!pip install transformers accelerate bitsandbytes datasets sentence_transformers datasets peft trl fire handler



In [None]:
import os
import sys
from typing import Tuple, Union

import fire
import torch
from datasets import load_dataset, concatenate_datasets, load_from_disk
from google.colab import drive
from google.colab import userdata
from huggingface_hub import login

# NOTE: prepare_model_for_int8_training is the deprecated predecessor of
# prepare_model_for_kbit_training; it is kept here only so existing references
# keep resolving.
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    prepare_model_for_kbit_training,
    PromptTuningConfig,
    TaskType,
    PromptTuningInit
)

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    LlamaForCausalLM,
    LlamaTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from transformers.trainer_utils import get_last_checkpoint

from trl import SFTTrainer

In [None]:
# log into huggingface hub
login(token=userdata.get('HF_KEY'))
!huggingface-cli whoami

In [None]:
# Mount Google Drive; later cells build the checkpoint and final-model
# directories underneath this mount point.
mountpoint = '/content/drive'
drive.mount(mountpoint, force_remount=False)

In [None]:
# Quantization settings: load the weights in 4-bit NF4 and run the matmuls in
# fp16, so the 7B model fits on smaller GPUs.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the Llama 2 base model with the quantization config applied;
# device_map='auto' lets accelerate decide where each layer is placed.
model_id = 'meta-llama/Llama-2-7b-hf'
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map='auto',
)

# Prepare the quantized model for training. The model is loaded in 4-bit, so use
# the k-bit helper; prepare_model_for_int8_training is its deprecated predecessor.
model = prepare_model_for_kbit_training(model)

In [None]:
# NOTE(review): pretraining_tp = 1 presumably selects the regular (non tensor-
# parallel-sliced) linear-layer path in the Llama implementation -- confirm
# against the transformers Llama config docs.
model.config.pretraining_tp = 1

# Build the tokenizer that matches the base checkpoint. No dedicated pad token
# is configured here, so the EOS token doubles as padding, applied on the right.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Load the training split of the fine-tuning dataset from the Hugging Face Hub.
# The dataset identifier is not hard-coded; it is read from the Colab secret
# named 'DATASET_URL'.
dataset_id = userdata.get('DATASET_URL')
dataset = load_dataset(dataset_id, split='train')

In [None]:
# LoRA adapter configuration for causal-LM fine-tuning:
#   r            - rank of the low-rank update matrices
#   lora_alpha   - scaling factor applied to the LoRA update
#   lora_dropout - dropout applied inside the adapter layers
#   bias="none"  - bias terms are left untrained
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

In [None]:
# Directory on Google Drive where training checkpoints are written. Uses the
# same 'MyDrive' folder name as the final-model path later in the notebook, so
# both artefacts live under one consistently spelled (space-free) Drive root.
output_dir = os.path.join(mountpoint, 'MyDrive', 'checkpoints')

In [None]:
# The KV cache is only useful for generation; turn it off for training.
model.config.use_cache=False

# Hyper-parameters for the supervised fine-tuning run.
training_args = TrainingArguments(
    # where checkpoints go and how often they are written
    output_dir=output_dir,
    save_steps=100,
    # batching: 4 samples x 4 accumulation steps = 16 samples per optimizer step (per device)
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    group_by_length=True,
    # optimisation
    optim='paged_adamw_32bit',
    learning_rate=2e-4,
    max_grad_norm=0.3,
    max_steps=5000,
    fp16=True,
    # NOTE(review): the plain "constant" schedule does not apply warmup, so
    # warmup_ratio has no effect as configured; switch to "constant_with_warmup"
    # if a warmup phase is actually intended.
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    # logging
    logging_steps=10,
    logging_first_step=True,
    report_to="tensorboard",
)

In [None]:
# The data collator assembles individual examples into batches, padding them to
# a common length. mlm=False selects causal (next-token) language modelling
# rather than masked-LM.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)

In [None]:
# Create an instance of the supervised fine-tuning trainer.
#   dataset_text_field - dataset column that holds the training text
#   max_seq_length     - sequence length cap applied when tokenizing
#   packing=False      - each example is kept as its own sequence
# The tokenizer configured earlier (EOS as pad token, right padding) is passed
# explicitly; otherwise SFTTrainer loads a fresh tokenizer of its own and the
# notebook's tokenizer settings would only reach the data collator.
trainer = SFTTrainer(
    model,
    args=training_args,
    train_dataset=dataset,
    dataset_text_field="input",
    max_seq_length=512,
    packing=False,
    peft_config=peft_config,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
# Run the trainer. If a previous (interrupted) session left checkpoints in the
# output directory, resume from the most recent one; otherwise start fresh.
# Passing resume_from_checkpoint=True unconditionally makes the very first run
# fail with "No valid checkpoint found", because nothing has been saved yet.
checkpoint_root = trainer.args.output_dir
last_checkpoint = (
    get_last_checkpoint(checkpoint_root) if os.path.isdir(checkpoint_root) else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)

In [None]:
# Persist the fine-tuned model to Google Drive so it outlives the Colab session.
model_dir = os.path.join(mountpoint, 'MyDrive', 'medai-model')
trainer.save_model(output_dir=model_dir)