In [1]:
!pip install torch
!pip install -q -U accelerate peft bitsandbytes transformers trl einops
!pip install -q -U deepeval



In [2]:
import os
import torch
from datasets import load_dataset
from datasets import load_from_disk
from peft import LoraConfig, prepare_model_for_kbit_training, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    pipeline,
    logging,
)
from tqdm import tqdm
from trl import SFTTrainer
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.benchmarks import TruthfulQA
from deepeval.benchmarks.tasks import TruthfulQATask
from deepeval.benchmarks.modes import TruthfulQAMode
import gc

In [3]:
# Model
base_model = "microsoft/phi-2"
new_model = "phi-2-medquad"

# Dataset
dataset = load_dataset("prsdm/MedQuad-phi2-1k", split="train")

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
tokenizer.pad_token=tokenizer.eos_token
tokenizer.padding_side="right"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Phi2 Model fine tuning

In [4]:
# Quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

In [5]:
# Load base moodel
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map={"": 0},
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
model.config.use_cache = False
model.config.pretraining_tp = 1
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

In [7]:
train_dataset = dataset.select(range(5))
test_dataset = dataset.select(range(990,1000))
epochs = 3

In [8]:
# Set training arguments
# rank = 1
training_arguments = TrainingArguments(
    output_dir = "./results",
    report_to="none",
    num_train_epochs = epochs,
    fp16 = False,
    bf16 = False,
    per_device_train_batch_size = 1,
    per_device_eval_batch_size = 1,
    gradient_accumulation_steps = 1,
    gradient_checkpointing = True,
    max_grad_norm = 0.3,
    learning_rate = 2e-4,
    weight_decay = 0.001,
    optim = "paged_adamw_32bit",
    lr_scheduler_type = "cosine",
    max_steps = -1,
    warmup_ratio = 0.03,
    group_by_length = True,
    save_strategy = "epoch",
    save_steps = 0,
    logging_steps = 50,
)

In [9]:
# LoRA configuration
peft_config = LoraConfig(
    r=1,                   #default=8
    lora_alpha= 16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules= ['fc2', 'v_proj','k_proj', 'q_proj', 'fc1']
)

In [10]:
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length= None,
    tokenizer=tokenizer,
    args=training_arguments,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


In [11]:
# Training the model and getting the training output
trainer.train()



Step,Training Loss




TrainOutput(global_step=15, training_loss=0.9169851938883463, metrics={'train_runtime': 179.8069, 'train_samples_per_second': 0.083, 'train_steps_per_second': 0.083, 'total_flos': 481374332559360.0, 'train_loss': 0.9169851938883463, 'epoch': 3.0})

## Evaluation

In [12]:
class Phi2(DeepEvalBaseLLM):
    def __init__(
        self,
        model,
        tokenizer
    ):
        self.model = model
        self.tokenizer = tokenizer

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        model = self.load_model()

        device = "cuda" # the device to load the model onto

        model_inputs = self.tokenizer([prompt], return_tensors="pt").to(device)
        # model.to(device)

        generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
        return self.tokenizer.batch_decode(generated_ids)[0]

    async def a_generate(self, prompt: str) -> str:
        return self.generate(prompt)

    # This is optional.
    def batch_generate(self, promtps):
        model = self.load_model()
        device = "cuda" # the device to load the model onto

        model_inputs = self.tokenizer(promtps, return_tensors="pt").to(device)
        # model.to(device)

        generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
        return self.tokenizer.batch_decode(generated_ids)

    def get_model_name(self):
        return "Phi2"

In [13]:
# Define benchmark with specific tasks and shots
benchmark = TruthfulQA(
    tasks=[TruthfulQATask.ADVERTISING],
    mode=TruthfulQAMode.MC2
    )
# available taks: https://docs.confident-ai.com/docs/benchmarks-truthful-qa

In [14]:
checkpoints = len(train_dataset)
for i in range(checkpoints, checkpoints * epochs + 1, checkpoints):
  model_checkpoint = AutoModelForCausalLM.from_pretrained(
    f"/content/results/checkpoint-{i}",
    torch_dtype=torch.bfloat16,
    device_map={"": 0},
  )
  benchmarking_model = Phi2(model=model_checkpoint,tokenizer=tokenizer)
  benchmark.evaluate(model=benchmarking_model)
  print("score: ",benchmark.overall_score)
  del model_checkpoint, benchmarking_model
  torch.cuda.empty_cache()
  gc.collect()



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Filter:   0%|          | 0/817 [00:00<?, ? examples/s]

Processing Advertising:   0%|          | 0/13 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Processing Advertising:   8%|▊         | 1/13 [00:01<00:14,  1.24s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Processing Advertising:  15%|█▌        | 2/13 [00:02<00:13,  1.24s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Processing Advertising:  23%|██▎       | 3/13 [00:10<00:45,  4.51s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Processing Advertising:  31%|███       | 4/13 [00:11<00:27,  3.10s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Processing Advertising:  38%|███▊      | 5/13 [00:14<00:23,  2.95s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Processing Advertising:  46%|████▌     | 6/13 [00:21<00:30,  4.40s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Processing Adv

TruthfulQA Task Accuracy (task=Advertising): 0.0
Overall TruthfulQA Accuracy: 0.0
score:  0.0




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Filter:   0%|          | 0/817 [00:00<?, ? examples/s]

Processing Advertising:   0%|          | 0/13 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Processing Advertising:   8%|▊         | 1/13 [00:08<01:37,  8.17s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Processing Advertising:  15%|█▌        | 2/13 [00:15<01:23,  7.56s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Processing Advertising:  23%|██▎       | 3/13 [00:23<01:18,  7.86s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Processing Advertising:  31%|███       | 4/13 [00:31<01:11,  7.90s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Processing Advertising:  38%|███▊      | 5/13 [00:38<01:01,  7.70s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Processing Advertising:  46%|████▌     | 6/13 [00:39<00:37,  5.40s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Processing Adv

TruthfulQA Task Accuracy (task=Advertising): 0.0
Overall TruthfulQA Accuracy: 0.0
score:  0.0


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Filter:   0%|          | 0/817 [00:00<?, ? examples/s]

Processing Advertising:   0%|          | 0/13 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Processing Advertising:   8%|▊         | 1/13 [00:00<00:10,  1.12it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Processing Advertising:  15%|█▌        | 2/13 [00:01<00:09,  1.21it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Processing Advertising:  23%|██▎       | 3/13 [00:03<00:14,  1.40s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Processing Advertising:  31%|███       | 4/13 [00:12<00:37,  4.12s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Processing Advertising:  38%|███▊      | 5/13 [00:14<00:28,  3.52s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Processing Advertising:  46%|████▌     | 6/13 [00:21<00:33,  4.84s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Processing Adv

TruthfulQA Task Accuracy (task=Advertising): 0.0
Overall TruthfulQA Accuracy: 0.0
score:  0.0
