In [None]:
!pip install --no-deps packaging ninja einops xformers flash-attn trl peft accelerate bitsandbytes
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install bert-score
!pip install bitsandbytes peft

In [None]:
from huggingface_hub import notebook_login

# Log in to Hugging Face: opens the interactive notebook login prompt.
notebook_login()

In [None]:
import bitsandbytes as bnb
# Print the installed bitsandbytes version as a quick sanity check; the model in
# this notebook is loaded through a BitsAndBytesConfig.
print(bnb.__version__)

In [None]:
!pip install wandb

In [None]:
import os

from huggingface_hub import login

# Authenticate with the Hugging Face Hub.
# If HF_TOKEN is set in the environment it is used directly, so the notebook can
# run end-to-end without manual input. Otherwise `token=None` falls back to the
# interactive prompt, exactly like a bare `login()`.
# (`or None` turns an empty HF_TOKEN into "not provided".)
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
from datasets import load_dataset, DatasetDict
from peft import get_peft_model, LoraConfig, TaskType
import torch
from transformers import (
    Trainer,
    TrainingArguments,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
from transformers import DataCollatorForLanguageModeling
from transformers import DataCollatorForSeq2Seq

In [None]:
# Central run configuration: Hub identity, model names, dataset location and
# training hyper-parameters. Other cells read their settings from this dict.
config = {
    "hugging_face_username": "Apurva3509",
    # Base checkpoint, name of the fine-tuned artefact, and tokenization limit.
    "model_config": {
        "base_model": "distilgpt2",
        "finetuned_model": "distilgpt2-medical-finetuned",
        "max_seq_length": 512,
    },
    # Where the training text comes from and which column holds it.
    "training_dataset": {
        "name": "data/medical_llama3_instruct_dataset_short",
        "split": "train",
        "input_field": "prompt",
    },
    # Optimisation settings.
    "training_config": {
        "per_device_train_batch_size": 4,
        "gradient_accumulation_steps": 4,
        "warmup_steps": 500,
        "num_train_epochs": 5,
        "learning_rate": 2e-4,
        "fp16": True,
        "logging_steps": 1,
        "optim": "adamw_8bit",          # optimizer name
        "weight_decay": 0.01,           # weight-decay coefficient
        "lr_scheduler_type": "linear",  # learning-rate schedule
        "seed": 42,                     # random seed
        "output_dir": "./medical_model_output",
    },
}

# Name of the base checkpoint, taken from the run configuration.
model_name = config["model_config"]["base_model"]

# Tokenizer for the base checkpoint. When the tokenizer defines no padding
# token, the end-of-sequence token is reused for padding.
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the base model with 8-bit weights and automatic device placement.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
)

def preprocess_function(examples):
    """Tokenize a batch of raw examples for causal-LM training.

    Args:
        examples: Batch dict as passed by ``Dataset.map(..., batched=True)``;
            the text is read from the column named by
            ``config["training_dataset"]["input_field"]``.

    Returns:
        The tokenizer output for the batch, truncated to
        ``config["model_config"]["max_seq_length"]`` tokens and left unpadded.
    """
    return tokenizer(
        examples[config["training_dataset"]["input_field"]],
        # No padding here: inside a batched `map`, padding would stretch every
        # row to the longest example of the whole map chunk. The data collator
        # pads each training batch to its own longest sequence instead.
        padding=False,
        truncation=True,
        max_length=config["model_config"]["max_seq_length"],
    )

# --- Load the corpus and carve out fixed-size train / validation subsets ---
TRAIN_SIZE = 1800  # number of examples used for training
EVAL_SIZE = 200    # number of held-out examples used for evaluation

raw_dataset = load_dataset(config["training_dataset"]["name"])
source_split = raw_dataset[config["training_dataset"]["split"]]

# Fail early with a readable message instead of an index error inside `select`.
if len(source_split) < TRAIN_SIZE + EVAL_SIZE:
    raise ValueError(
        f"Dataset split has {len(source_split)} rows but "
        f"{TRAIN_SIZE + EVAL_SIZE} are required for the train/validation split."
    )

# Shuffle once with the configured seed, then slice two disjoint ranges.
shuffled_split = source_split.shuffle(seed=config["training_config"]["seed"])
dataset = DatasetDict({
    "train": shuffled_split.select(range(TRAIN_SIZE)),
    "validation": shuffled_split.select(range(TRAIN_SIZE, TRAIN_SIZE + EVAL_SIZE)),
})

tokenized_train = dataset["train"].map(preprocess_function, batched=True)
tokenized_eval = dataset["validation"].map(preprocess_function, batched=True)

# Causal-LM collator (mlm=False): pads each batch and adds a `labels` tensor
# copied from `input_ids` with padding positions set to -100. Having `labels`
# in the batch is what lets the Trainer report an evaluation loss; the
# seq2seq collator used before never produced labels for this dataset.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Tip: to list the module names that can be used in `target_modules`, run
#     for name, module in model.named_modules(): print(name)

# LoRA adapter settings for causal language modelling.
# NOTE(review): `target_modules` names one fully-qualified module path
# ("transformer.h.0.attn.c_attn"), so only that module is matched by this
# entry. Confirm that adapting a single module is intended.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["transformer.h.0.attn.c_attn"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

# Wrap the base model with the adapter. Re-running this cell wraps the already
# wrapped model again, so reload the base model first if you need to re-run it.
model = get_peft_model(model, lora_config)

class CustomTrainer(Trainer):
    """Trainer that derives causal-LM labels from the batch's ``input_ids``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the language-modelling loss for one batch.

        Labels are a copy of ``input_ids`` in which every position masked out
        by ``attention_mask`` is set to -100, so padding does not contribute to
        the loss. Without an attention mask the labels equal ``input_ids``.

        Args:
            model: The model to run the forward pass on.
            inputs: Batch dict; must contain ``input_ids`` and may contain
                ``attention_mask``. A ``labels`` entry is written into it.
            return_outputs: When true, also return the model outputs.
            num_items_in_batch: Accepted for signature compatibility; unused.

        Returns:
            ``loss`` or ``(loss, outputs)`` depending on ``return_outputs``.
        """
        # Clone so that masking the labels never modifies the model's inputs.
        labels = inputs["input_ids"].clone()
        attention_mask = inputs.get("attention_mask")
        if attention_mask is not None:
            # -100 is the label value the loss ignores; the pad token is the
            # EOS token here, so masking by attention_mask (not by token id)
            # is what separates real tokens from padding.
            labels = labels.masked_fill(attention_mask == 0, -100)
        inputs["labels"] = labels

        outputs = model(**inputs)
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss

# All tunable hyper-parameters come from config["training_config"].
train_cfg = config["training_config"]

training_args = TrainingArguments(
    output_dir=train_cfg["output_dir"],
    per_device_train_batch_size=train_cfg["per_device_train_batch_size"],
    gradient_accumulation_steps=train_cfg["gradient_accumulation_steps"],
    warmup_steps=train_cfg["warmup_steps"],
    num_train_epochs=train_cfg["num_train_epochs"],
    learning_rate=train_cfg["learning_rate"],
    fp16=train_cfg["fp16"],
    # These four were defined in the config but never forwarded, so the
    # Trainer silently fell back to its own defaults.
    optim=train_cfg["optim"],
    weight_decay=train_cfg["weight_decay"],
    lr_scheduler_type=train_cfg["lr_scheduler_type"],
    seed=train_cfg["seed"],
    logging_dir="./logs",
    logging_steps=train_cfg["logging_steps"],
    # Checkpoint and evaluate on the same cadence; keep only the last two checkpoints.
    save_steps=100,
    save_total_limit=2,
    eval_strategy="steps",
    eval_steps=100,
    push_to_hub=False,
    run_name="medical_model_finetuning",
    report_to="wandb"
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
)

trainer.train()

# Persist the trained adapter together with the tokenizer so the output
# directory can be loaded on its own.
model.save_pretrained(train_cfg["output_dir"])
tokenizer.save_pretrained(train_cfg["output_dir"])
print(f"Model saved to {config['training_config']['output_dir']}")

# [REDACTED] A 40-character hexadecimal string was pasted here; it looked like an API key. If it was a credential, revoke and rotate it.

In [None]:
# Hub repository identifier in "<username>/<fine-tuned model name>" form.
repo_name = "/".join(
    (config["hugging_face_username"], config["model_config"]["finetuned_model"])
)

In [None]:
import os
import tempfile

from huggingface_hub import HfApi

# Target repository on the Hub: "<username>/<fine-tuned model name>".
repo_id = f"{config['hugging_face_username']}/{config['model_config']['finetuned_model']}"

# SECURITY: an access token used to be hardcoded in this cell. It has been
# removed from the code; because it was committed in plain text it must be
# revoked in the Hugging Face account settings and replaced.
# The token is now read from HF_TOKEN; when that is unset, `None` makes the
# client fall back to the credentials stored by the login cells.
hf_token = os.environ.get("HF_TOKEN") or None
api = HfApi(token=hf_token)

# Serialize model and tokenizer into a throw-away directory and upload it as
# a whole; the directory is deleted when the `with` block exits.
with tempfile.TemporaryDirectory() as tmp_dir:
    model.save_pretrained(
        tmp_dir,
        safe_serialization=True
    )
    tokenizer.save_pretrained(tmp_dir)

    api.upload_folder(
        folder_path=tmp_dir,
        repo_id=repo_id,
        repo_type="model",
    )

In [None]:
# Free-text clinical vignette used as a qualitative check of the fine-tuned model.
complex_input = """
Patient Information:
A 45-year-old male presents with persistent cough, weight loss, fatigue, and night sweats.
History of smoking 20 pack-years. No fever reported.

Additional Context:
Recently returned from a region with high tuberculosis prevalence.
Basic blood work shows elevated ESR but normal WBC count.

Question:
What are the potential diagnoses, reasons for them, and suggested next steps?
"""

# Switch to inference mode so dropout (including the LoRA dropout) is disabled
# while generating.
model.eval()

# Place the inputs on the model's own device instead of assuming "cuda".
inputs = tokenizer([complex_input], return_tensors="pt").to(model.device)

response = model.generate(
    **inputs,
    max_new_tokens=512,
    do_sample=True,  # Enable sampling
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.2,  # Add repetition penalty
    no_repeat_ngram_size=3,  # Prevent repetition of n-grams
    # `early_stopping` was dropped: it only controls beam search and has no
    # effect on single-beam sampling.
    # Passing the padding id explicitly avoids the "Setting `pad_token_id` to
    # `eos_token_id`" warning.
    pad_token_id=tokenizer.eos_token_id,
)
# `response[0]` holds the prompt tokens followed by the generated tokens, so
# the decoded text starts with the prompt.
decoded_response = tokenizer.decode(response[0], skip_special_tokens=True)
print(f"Generated Response:\n{decoded_response}")


### Sample Answer

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generated Response:

Patient Information:
A 45-year-old male presents with persistent cough, weight loss, fatigue, and night sweats.
History of smoking 20 pack-years. No fever reported.

Additional Context:
Recently returned from a region with high tuberculosis prevalence.
Basic blood work shows elevated ESR but normal WBC count.

Question:
What are the potential diagnoses, reasons for them, and suggested next steps?
This is an open question (and it's not just about one). A person has to be hospitalized or diagnosed with chronic disease if they have had severe illness prior to their life in this country as well. It must also come at the same time that people living under conditions where there will be no infection because those who live near these areas get sick."I've been asked how many times I can see some cases on my own without having any symptoms until now," says Mr Naira said.

In [None]:
from bert_score import score

reference_responses = ["The symptoms suggest tuberculosis due to the patient's travel history and elevated ESR."]

# `decoded_response` starts with the prompt that was fed to the model. Score
# only the model's continuation, otherwise the metric mostly measures the
# prompt. If the prompt prefix is not found, or nothing was generated after
# it, the full text is scored as before.
candidate_response = decoded_response
if candidate_response.startswith(complex_input):
    candidate_response = candidate_response[len(complex_input):] or decoded_response

P, R, F1 = score([candidate_response], reference_responses, lang="en", verbose=True)

print(f"BERTScore - Precision: {P.mean().item()}, Recall: {R.mean().item()}, F1: {F1.mean().item()}")
