In [1]:
# Install and import the necessary libraries
!pip install -q torch
!pip install -q -U accelerate peft bitsandbytes transformers trl einops

In [2]:
import os
import torch
from datasets import load_dataset
from datasets import load_from_disk
from peft import LoraConfig, prepare_model_for_kbit_training, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    pipeline,
    logging,
)

from trl import SFTTrainer

In [3]:
# Base checkpoint to fine-tune and the name chosen for the fine-tuned model
base_model = "microsoft/phi-2"
new_model = "phi-2-medquad"

# Fast tokenizer of the base checkpoint.
# The EOS token doubles as the padding token, and padding goes on the right.
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [4]:
from datasets import load_dataset, DatasetDict, Dataset
# Load the MedQuad medical Q&A dataset (columns: qtype, Question, Answer)
dataset = load_dataset("keivalya/MedQuad-MedicalQnADataset")
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Turn one raw Q&A record into a single training string
def format_example(example):
    """Add a ``text`` field holding the instruction/assistant prompt for one record.

    Reads ``example['Question']`` and ``example['Answer']``, stores the combined
    string under ``example['text']`` (mutating the mapping in place) and returns
    that same mapping.
    """
    template = "### Instruction: {} ### Assistant: {}"
    example["text"] = template.format(example["Question"], example["Answer"])
    return example

# Hold out 20% of the data for evaluation. The fixed seed keeps the split
# identical across kernel restarts, so results are comparable between runs.
train_test_split = dataset['train'].train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

# Apply the prompt template to the training split only; the test split keeps
# its raw qtype/Question/Answer columns for use at evaluation time.
train_dataset = train_dataset.map(format_example)

# Remove unnecessary columns and keep only the "text" column
train_dataset = train_dataset.remove_columns(["qtype", "Question", "Answer"])

# Preview the dataset
print(train_dataset)
print(test_dataset)

README.md:   0%|          | 0.00/233 [00:00<?, ?B/s]

medDataset_processed.csv:   0%|          | 0.00/22.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16407 [00:00<?, ? examples/s]

Map:   0%|          | 0/13125 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 13125
})
Dataset({
    features: ['qtype', 'Question', 'Answer'],
    num_rows: 3282
})


In [5]:
# 4-bit NF4 quantization settings (float16 compute, no double quantization)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the base model with the quantization settings applied
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    # the main version of Phi-2 doesn't support gradient checkpointing (while training this model)
    revision="refs/pr/23",
    quantization_config=bnb_config,
    device_map="auto",
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)

config.json:   0%|          | 0.00/755 [00:00<?, ?B/s]

configuration_phi.py:   0%|          | 0.00/2.03k [00:00<?, ?B/s]

modeling_phi.py:   0%|          | 0.00/33.7k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/577M [00:00<?, ?B/s]

PhiModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

In [6]:
# Turn off the model's generation cache for the training run
model.config.use_cache = False
# NOTE(review): pretraining_tp looks carried over from Llama fine-tuning recipes — confirm it has any effect on Phi-2
model.config.pretraining_tp = 1
# Prepare the quantized model for k-bit training with gradient checkpointing enabled
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.


In [10]:
# Hyperparameters for the fine-tuning run, grouped by concern
training_arguments = TrainingArguments(
    # Output and logging
    output_dir="./results",
    logging_steps=100,
    save_steps=0,
    # Run length: one epoch; max_steps=-1 leaves the step count uncapped
    num_train_epochs=1,
    max_steps=-1,
    # Batching
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # Optimizer and learning-rate schedule
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    max_grad_norm=0.3,
    # Precision and memory
    fp16=False,
    bf16=False,
    gradient_checkpointing=True,
)

# LoRA adapter settings for causal-LM fine-tuning
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    # Other candidate module sets: ["Wqkv", "fc1", "fc2"] or ["Wqkv", "out_proj", "fc1", "fc2"]
    target_modules=["Wqkv", "out_proj"],
    r=64,  # default=8
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)


In [11]:
#print_trainable_parameters(model)

# Build the supervised fine-tuning trainer: the prepared model plus the LoRA
# config, trained on the "text" column with sequences capped at 512 tokens.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=512,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.


Map:   0%|          | 0/13125 [00:00<?, ? examples/s]

In [12]:
# Train model: runs the trainer's full training loop; the returned TrainOutput
# (global step, average training loss, runtime metrics) is shown as the cell output.
trainer.train()

  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]


Step,Training Loss
100,1.7314
200,1.3981
300,1.2573
400,1.2279
500,1.275
600,1.2082
700,1.2649
800,1.1373
900,1.1666
1000,1.1746


config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

You are using a model of type phi to instantiate a model of type phi-msft. This is not supported for all configurations of models and can yield errors.


TrainOutput(global_step=6563, training_loss=1.138123814078451, metrics={'train_runtime': 14511.9133, 'train_samples_per_second': 0.904, 'train_steps_per_second': 0.452, 'total_flos': 5.171432429205504e+16, 'train_loss': 1.138123814078451, 'epoch': 1.0})

In [13]:
# Save the trained model held by the trainer to a directory relative to the working directory.
# NOTE(review): presumably only the LoRA adapter weights are written, since the trainer was built with a peft_config — confirm.
trainer.model.save_pretrained("./phi2_finetuned_subjective/final_model")

You are using a model of type phi to instantiate a model of type phi-msft. This is not supported for all configurations of models and can yield errors.


In [14]:
from peft import PeftModel

# Reload the adapter from the same relative directory it was saved to, so the
# cell also works outside Kaggle (the path was previously hard-coded to
# /kaggle/working), then fold the adapter weights into the base model.
f_model = PeftModel.from_pretrained(model, "./phi2_finetuned_subjective/final_model")
f_model = f_model.merge_and_unload()



In [15]:
# Inspect one held-out example; the test split still has its raw qtype/Question/Answer columns
print(test_dataset[0])

{'qtype': 'treatment', 'Question': 'What are the treatments for Developmental Dyspraxia ?', 'Answer': 'Treatment is symptomatic and supportive and may include occupational and speech therapy, and "cueing" or other forms of communication such as using pictures and hand gestures. Many children with the disorder require special education.'}


In [None]:
from tqdm import tqdm

# Generate one answer per question of the held-out test split.
model.eval()  # Set the model to evaluation mode
predictions = []  # generated answer text only, one entry per test question
all_preds = []  # kept for optional chunked collection of predictions; currently unused
device = next(model.parameters()).device  # Ensure device compatibility
batch_size = 1  # questions are generated one at a time; currently unused

for question in tqdm(test_dataset, desc="Generating predictions", unit="question"):
    # Prompt with the same template the model was fine-tuned on, stopping right
    # after the assistant tag so the model continues with the answer.
    prompt = f"### Instruction: {question['Question']} ### Assistant:"

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_length = inputs["input_ids"].shape[1]

    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        # Budget applies to generated tokens only; max_length would also count
        # the prompt and leave long questions with little or no answer.
        max_new_tokens=70,
        # The tokenizer reuses EOS as its padding token; pass it explicitly.
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens so the stored prediction is the
    # answer itself and can be compared directly with the reference answer.
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    # Store the response
    predictions.append(response)


In [None]:
# print(predictions)

In [None]:
# !pip -q install evaluate
# !pip -q install rouge-score
# # Load ROUGE for evaluation
# import evaluate

# # Load ROUGE for evaluation
# rouge = evaluate.load("rouge")

# # Evaluate predictions


# # Prepare references (ground-truth answers)
# references = test_dataset['Answer']

# # Evaluate predictions



In [None]:
# scores = rouge.compute(predictions=predictions, references=references)
# print(scores)

In [None]:
# bleu = evaluate.load("bleu")
# scores = bleu.compute(predictions=predictions, references=references)
# print(scores)

In [None]:
# !pip install nltk
# import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# meteor = evaluate.load("meteor")
# scores = meteor.compute(predictions=predictions, references=references)
# print(scores)

In [None]:
# !pip -q install bert_score
# from bert_score import score

# P, R, F1 = score(predictions, references, lang="en")
# print(f"BERTScore F1: {F1.mean().item()}")