# End-to-End QLoRA Fine-tuning: Qwen2.5-0.5B-Instruct on ChatDoctor-HealthCareMagic-100k

This notebook demonstrates a complete QLoRA (Quantized Low-Rank Adaptation) fine-tuning pipeline for the **Qwen2.5-0.5B-Instruct** model on the **ChatDoctor-HealthCareMagic-100k** dataset.

---
## 1. Environment Setup

In [None]:
# Install required packages
!pip install -q transformers datasets accelerate peft bitsandbytes trl evaluate rouge_score huggingface_hub wandb

In [None]:
# Set up Weights & Biases for experiment tracking
import wandb

# Authenticate with W&B (this will prompt for an API key if not already logged in)
wandb.login()

# W&B configuration
WANDB_PROJECT = "qwen-chatdoctor-qlora"  # Project that groups related runs
WANDB_RUN_NAME = "qwen2.5-0.5b-chatdoctor-run1"  # Descriptive run name

# Start the run. The hyperparameters recorded below are literal copies of the
# values assigned in the "Configuration Constants" cell; keep the two in sync
# when changing either, otherwise the dashboard will report stale settings.
wandb.init(
    project=WANDB_PROJECT,
    name=WANDB_RUN_NAME,
    config={
        "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
        "dataset": "lavita/ChatDoctor-HealthCareMagic-100k",
        "lora_r": 16,
        "lora_alpha": 32,
        "lora_dropout": 0.05,
        "learning_rate": 2e-4,
        "batch_size": 4,
        "gradient_accumulation_steps": 4,
        "num_epochs": 1,
        "max_seq_length": 512,
        "optimizer": "paged_adamw_8bit",
    },
    tags=["qlora", "qwen2.5", "medical", "chatdoctor"],
)

# Use the `url` property; the `get_url()` method is deprecated in the W&B SDK.
print(f"W&B Run URL: {wandb.run.url}")

In [None]:
# Import required libraries
import os
import torch
import warnings
warnings.filterwarnings('ignore')

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    PeftModel,
)
from trl import SFTTrainer, SFTConfig
import evaluate
from tqdm import tqdm
from huggingface_hub import login, HfApi

# Report the PyTorch build and, when a CUDA device is visible, its name and total memory.
print(f"PyTorch version: {torch.__version__}")
cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    gpu_name = torch.cuda.get_device_name(0)
    gpu_total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"CUDA device: {gpu_name}")
    print(f"CUDA memory: {gpu_total_bytes / 1e9:.2f} GB")

In [None]:
# ---------------------------------------------------------------------------
# Run configuration: every value a reader is likely to tune lives here.
# ---------------------------------------------------------------------------

# Identifiers: base checkpoint, source dataset, local output directory and the
# repository name used when the adapter is published to the Hub.
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
DATASET_NAME = "lavita/ChatDoctor-HealthCareMagic-100k"
OUTPUT_DIR = "./qwen-chatdoctor-qlora"
ADAPTER_NAME = "qwen2.5-0.5b-chatdoctor-qlora"

# Data budget. TRAIN_SAMPLES is the size of the shuffled subset taken from the
# dataset (None keeps everything); EVAL_SAMPLES rows are then held out of it.
TRAIN_SAMPLES, EVAL_SAMPLES = 10000, 500
MAX_SEQ_LENGTH = 512

# Optimisation schedule
NUM_EPOCHS = 1
BATCH_SIZE, GRADIENT_ACCUMULATION_STEPS = 4, 4
LEARNING_RATE, WARMUP_RATIO = 2e-4, 0.03

# LoRA adapter shape: rank, scaling factor and dropout
LORA_R, LORA_ALPHA, LORA_DROPOUT = 16, 32, 0.05

print("Configuration loaded successfully!")

---
## 2. Dataset Loading & Preprocessing

The ChatDoctor-HealthCareMagic-100k dataset contains medical Q&A conversations. We'll format these into a chat template suitable for instruction fine-tuning.

In [None]:
# Fetch the dataset named by DATASET_NAME and show its layout plus the first training row
print("Loading ChatDoctor-HealthCareMagic-100k dataset...")
dataset = load_dataset(DATASET_NAME)

print("\nDataset structure:")
print(dataset)

first_train_row = dataset["train"][0]
print("\nSample data point:")
print(first_train_row)

In [None]:
# List the columns, then preview the first three rows with long values cut to 200 characters
train_split = dataset["train"]
print("Column names:", train_split.column_names)
print("\nFirst 3 examples:")
for example_number in (1, 2, 3):
    print(f"\n--- Example {example_number} ---")
    row = train_split[example_number - 1]
    for column, cell in row.items():
        cell_text = str(cell)
        if len(cell_text) > 200:
            print(f"{column}: {cell_text[:200]}...")
        else:
            print(f"{column}: {cell}")

In [None]:
# The tokenizer supplies the chat template used to format the training text
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Fall back to the EOS token for padding when the checkpoint defines no pad token
missing_pad_token = tokenizer.pad_token is None
if missing_pad_token:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Pad on the right for causal LM training
tokenizer.padding_side = "right"

print(f"Tokenizer loaded. Vocab size: {tokenizer.vocab_size}")
for label, token in (("Pad token", tokenizer.pad_token), ("EOS token", tokenizer.eos_token)):
    print(f"{label}: {token}")

In [None]:
def format_chat_template(example):
    """
    Render one dataset row as a single chat-formatted training string.

    Args:
        example: Mapping with an 'input' entry (used as the user turn) and an
            'output' entry (used as the assistant turn).

    Returns:
        A dict with one key, "text", holding the conversation rendered by the
        module-level tokenizer's chat template without a generation prompt.
    """
    # (role, content) pairs in conversation order: medical system prompt first
    turns = (
        ("system", "You are a helpful and professional medical assistant. Provide accurate and empathetic medical advice based on the patient's symptoms and concerns."),
        ("user", example['input']),
        ("assistant", example['output']),
    )
    conversation = [{"role": role, "content": content} for role, content in turns]

    rendered = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": rendered}

# Smoke-test the formatter on the first training row and show the rendered text
sample = dataset["train"][0]
formatted_sample = format_chat_template(sample)
print("Formatted sample:", formatted_sample["text"], sep="\n")

In [None]:
# Build the train / eval splits and convert both to chat-formatted text
print("Preparing dataset splits...")

# Deterministic shuffle, optionally capped at TRAIN_SAMPLES rows
shuffled_rows = dataset["train"].shuffle(seed=42)
if TRAIN_SAMPLES:
    subset_size = min(TRAIN_SAMPLES, len(shuffled_rows))
    shuffled_rows = shuffled_rows.select(range(subset_size))

# Hold out EVAL_SAMPLES rows of the subset for evaluation
train_test_split = shuffled_rows.train_test_split(test_size=EVAL_SAMPLES, seed=42)
train_dataset, eval_dataset = train_test_split["train"], train_test_split["test"]

print(f"Training samples: {len(train_dataset)}")
print(f"Evaluation samples: {len(eval_dataset)}")


def _to_text_only(split):
    """Replace every original column of `split` with the formatted "text" column."""
    return split.map(format_chat_template, remove_columns=split.column_names)


train_dataset = _to_text_only(train_dataset)
eval_dataset = _to_text_only(eval_dataset)

print("\nDataset preparation complete!")
first_text = train_dataset[0]["text"]
print(f"Sample formatted text length: {len(first_text)} characters")

---
## 3. Model Configuration with 4-bit Quantization

We'll load the Qwen2.5-0.5B-Instruct model with 4-bit quantization using BitsAndBytes for memory efficiency.

In [None]:
# 4-bit quantization settings passed to BitsAndBytesConfig
quantization_settings = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",              # Normalized Float 4-bit
    "bnb_4bit_compute_dtype": torch.bfloat16,  # Compute in bfloat16
    "bnb_4bit_use_double_quant": True,         # Nested (double) quantization
}
bnb_config = BitsAndBytesConfig(**quantization_settings)

print("BitsAndBytes configuration:")
for summary_line in (
    "4-bit quantization: Enabled",
    "Quantization type: NF4 (Normalized Float 4)",
    "Compute dtype: bfloat16",
    "Double quantization: Enabled",
):
    print(f"  - {summary_line}")

In [None]:
# Load the base checkpoint using the quantization settings in bnb_config
print(f"Loading {MODEL_ID} with 4-bit quantization...")

model_load_options = dict(
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_load_options)

# The KV cache is switched off for training
model.config.use_cache = False
# NOTE(review): confirm this architecture actually reads `pretraining_tp`
model.config.pretraining_tp = 1

print("\nModel loaded successfully!")
for label, value in (("Model dtype", model.dtype), ("Model device", model.device)):
    print(f"{label}: {value}")

# Calculate model memory footprint
def get_model_memory(model):
    """Return the combined size of a model's parameters and buffers in MB (1 MB = 1e6 bytes)."""
    def _total_bytes(tensors):
        # element count times bytes per element, summed over all tensors
        return sum(t.nelement() * t.element_size() for t in tensors)

    param_bytes = _total_bytes(model.parameters())
    buffer_bytes = _total_bytes(model.buffers())
    return (param_bytes + buffer_bytes) / 1e6

model_memory_mb = get_model_memory(model)
print(f"Approximate model memory: {model_memory_mb:.2f} MB")

In [None]:
# Run PEFT's k-bit preparation helper on the quantized model before adapters are attached
model = prepare_model_for_kbit_training(model=model)

print("Model prepared for QLoRA training!")

---
## 4. QLoRA Configuration

Configure LoRA adapters that will be trained on top of the frozen quantized base model.

In [None]:
# LoRA adapter settings, collected in one mapping and expanded into LoraConfig
lora_settings = {
    "r": LORA_R,                    # Rank of the update matrices
    "lora_alpha": LORA_ALPHA,       # Scaling factor
    "lora_dropout": LORA_DROPOUT,   # Dropout probability
    "bias": "none",                 # Bias training strategy
    "task_type": "CAUSAL_LM",       # Task type for the model
    "target_modules": [             # Modules to apply LoRA to
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
}
lora_config = LoraConfig(**lora_settings)

print("LoRA Configuration:")
for label, value in (
    ("Rank (r)", LORA_R),
    ("Alpha", LORA_ALPHA),
    ("Dropout", LORA_DROPOUT),
    ("Target modules", lora_config.target_modules),
):
    print(f"  - {label}: {value}")

In [None]:
# Wrap the prepared model with the adapters described by lora_config
model = get_peft_model(model=model, peft_config=lora_config)

# Print trainable parameters
def print_trainable_parameters(model):
    """Print the number and share of parameters that require gradients, then the total count."""
    # One (element count, requires_grad) pair per named parameter
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_params = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    trainable_share = 100 * trainable_params / all_params
    print(f"Trainable parameters: {trainable_params:,} ({trainable_share:.2f}%)")
    print(f"Total parameters: {all_params:,}")

# Summarise how much of the adapter-wrapped model will be updated
print_trainable_parameters(model=model)

---
## 5. Training with SFTTrainer

Use TRL's SFTTrainer for supervised fine-tuning with our configured QLoRA setup.

In [None]:
# Configure training arguments using SFTConfig.
#
# TRL renamed SFTConfig's sequence-length field from `max_seq_length` to
# `max_length`. Because the install cell does not pin a TRL version, select
# whichever field the installed SFTConfig dataclass actually declares so the
# cell works on either side of the rename.
_sft_length_field = (
    "max_length" if "max_length" in SFTConfig.__dataclass_fields__ else "max_seq_length"
)

training_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    fp16=False,
    bf16=True,
    max_grad_norm=0.3,
    warmup_ratio=WARMUP_RATIO,
    lr_scheduler_type="cosine",
    logging_steps=25,
    # save_steps and eval_steps are kept equal so that every evaluated step
    # also has a checkpoint available for load_best_model_at_end.
    save_steps=500,
    eval_strategy="steps",
    eval_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="wandb",  # Metrics go to the W&B run started in the setup cell
    run_name=WANDB_RUN_NAME,
    packing=False,  # Disable packing for chat format
    dataset_text_field="text",
    **{_sft_length_field: MAX_SEQ_LENGTH},
)

print("Training configuration:")
print(f"  - Epochs: {NUM_EPOCHS}")
print(f"  - Batch size: {BATCH_SIZE}")
print(f"  - Gradient accumulation: {GRADIENT_ACCUMULATION_STEPS}")
print(f"  - Effective batch size: {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}")
print(f"  - Learning rate: {LEARNING_RATE}")
print(f"  - Max sequence length: {MAX_SEQ_LENGTH}")

In [None]:
# Assemble the trainer from the adapter-wrapped model, the two text datasets and the tokenizer
trainer_inputs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": eval_dataset,
    "processing_class": tokenizer,
}
trainer = SFTTrainer(**trainer_inputs)

print("SFTTrainer initialized successfully!")

In [None]:
# Run the fine-tuning loop
banner = "=" * 50
print("Starting QLoRA fine-tuning...")
print(banner)

# Release cached CUDA memory before training starts
if torch.cuda.is_available():
    torch.cuda.empty_cache()

train_result = trainer.train()

print("\n" + banner)
print("Training completed!")

In [None]:
# Summarise the metrics returned by trainer.train()
metrics = train_result.metrics
print("Training Metrics:")
runtime_seconds = metrics['train_runtime']
print(f"  - Total training time: {runtime_seconds:.2f} seconds")
samples_per_second = metrics['train_samples_per_second']
print(f"  - Training samples per second: {samples_per_second:.2f}")
steps_per_second = metrics['train_steps_per_second']
print(f"  - Training steps per second: {steps_per_second:.2f}")
final_loss = metrics['train_loss']
print(f"  - Final training loss: {final_loss:.4f}")

In [None]:
# Persist the trained adapter weights and the tokenizer side by side in OUTPUT_DIR
print(f"Saving LoRA adapters to {OUTPUT_DIR}...")
trainer.save_model(output_dir=OUTPUT_DIR)
tokenizer.save_pretrained(save_directory=OUTPUT_DIR)

print("Model and tokenizer saved successfully!")

---
## 6. Model Evaluation with ROUGE Metrics

Evaluate the fine-tuned model using ROUGE-1, ROUGE-2, and ROUGE-L metrics to measure the quality of generated responses.

In [None]:
# Fetch the ROUGE scorer from the `evaluate` library
rouge = evaluate.load("rouge")

print("ROUGE metric loaded successfully!")

In [None]:
# Rebuild the held-out split with its original input/output columns.
# The formatted eval_dataset only keeps the "text" column, so the raw rows are
# recreated here by repeating the steps of the split-preparation cell exactly:
# same shuffle seed, same optional subset selection, same train/test split seed.
print("Preparing evaluation data...")

eval_raw_dataset = load_dataset(DATASET_NAME, split='train').shuffle(seed=42)
if TRAIN_SAMPLES:
    # Cap at the dataset size, as the training cell does, so a TRAIN_SAMPLES
    # larger than the dataset does not raise an out-of-range index error.
    eval_raw_dataset = eval_raw_dataset.select(
        range(min(TRAIN_SAMPLES, len(eval_raw_dataset)))
    )
eval_raw_split = eval_raw_dataset.train_test_split(test_size=EVAL_SAMPLES, seed=42)
eval_raw_dataset = eval_raw_split['test']

# Use a smaller subset for faster evaluation
EVAL_SUBSET_SIZE = 100  # Evaluate on 100 samples for speed
eval_subset = eval_raw_dataset.select(range(min(EVAL_SUBSET_SIZE, len(eval_raw_dataset))))

print(f"Evaluation subset size: {len(eval_subset)}")

In [None]:
def generate_response(model, tokenizer, input_text, max_new_tokens=256):
    """
    Generate a sampled assistant reply for a single user message.

    The prompt consists of a fixed medical system message, the user message and
    the chat template's generation prompt. If the rendered prompt is longer
    than MAX_SEQ_LENGTH tokens, the *user message* is shortened before
    rendering; the rendered prompt itself is never cut, because truncating it
    on the right would remove the trailing generation prompt and the model
    would continue the user's text instead of answering it.

    Args:
        model: Model exposing `generate` and `device`.
        tokenizer: Tokenizer exposing `apply_chat_template`, `__call__` and `decode`.
        input_text: The user message to answer.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation (prompt tokens excluded), stripped of
        surrounding whitespace.
    """
    system_message = "You are a helpful and professional medical assistant. Provide accurate and empathetic medical advice based on the patient's symptoms and concerns."

    def build_prompt(user_text):
        # Conversation without an assistant turn, ending in the generation prompt
        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_text},
        ]
        return tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

    inputs = tokenizer(build_prompt(input_text), return_tensors="pt")

    if inputs["input_ids"].shape[1] > MAX_SEQ_LENGTH:
        # Token cost of everything except the user text (system turn, role
        # markers, generation prompt), measured by rendering an empty user turn.
        template_tokens = len(tokenizer(build_prompt(""))["input_ids"])
        user_budget = max(MAX_SEQ_LENGTH - template_tokens, 1)
        user_ids = tokenizer(input_text, add_special_tokens=False)["input_ids"][:user_budget]
        # Re-tokenizing decoded text can shift the count by a few tokens; that
        # is acceptable because the limit only mirrors the training length.
        inputs = tokenizer(build_prompt(tokenizer.decode(user_ids)), return_tensors="pt")

    inputs = inputs.to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # Sample a continuation without tracking gradients
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the tokens produced after the prompt
    generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    return generated_text.strip()

In [None]:
# Set model to evaluation mode
model.eval()

# generate_response samples (do_sample=True), so fix the torch RNG first;
# without a seed the ROUGE scores change on every run of this cell.
torch.manual_seed(42)

# Generate predictions
print("Generating predictions for ROUGE evaluation...")

predictions = []
references = []

for example in tqdm(eval_subset, desc="Evaluating"):
    # Generate a reply for the patient question and keep the doctor's answer as reference
    prediction = generate_response(model, tokenizer, example['input'])
    predictions.append(prediction)
    references.append(example['output'])

print(f"\nGenerated {len(predictions)} predictions.")

In [None]:
# Print up to three input / reference / prediction triples, each cut to 300 characters
wide_rule = "=" * 80
print("\n" + wide_rule)
print("Sample Predictions vs References")
print(wide_rule)

examples_to_show = min(3, len(predictions))
for idx in range(examples_to_show):
    question = eval_subset[idx]['input']
    print(f"\n--- Example {idx+1} ---")
    print(f"\nInput: {question[:300]}...")
    print(f"\nReference: {references[idx][:300]}...")
    print(f"\nPrediction: {predictions[idx][:300]}...")
    print("-" * 80)

In [None]:
# Score the fine-tuned model's predictions against the reference answers
print("Computing ROUGE metrics...")

rouge_results = rouge.compute(
    predictions=predictions,
    references=references,
    use_aggregator=True,
)

divider = "=" * 50
print("\n" + divider)
print("ROUGE Evaluation Results (Fine-tuned Model)")
print(divider)
for label, key in (
    ("ROUGE-1", "rouge1"),
    ("ROUGE-2", "rouge2"),
    ("ROUGE-L", "rougeL"),
    ("ROUGE-Lsum", "rougeLsum"),
):
    print(f"{label}: {rouge_results[key]:.4f}")
print(divider)

In [None]:
# Compare with base model performance
# This helps demonstrate the improvement from fine-tuning

print("\n" + "=" * 50)
print("Loading base model for comparison...")
print("=" * 50)

# Load base model (without LoRA), quantized the same way as the training model
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
base_model.eval()

# Generate predictions with base model
print("Generating predictions with base model...")

# generate_response samples (do_sample=True); seed the torch RNG so the
# baseline scores are repeatable from run to run.
torch.manual_seed(42)

base_predictions = []
for example in tqdm(eval_subset, desc="Base Model Evaluation"):
    prediction = generate_response(base_model, tokenizer, example['input'])
    base_predictions.append(prediction)

# Compute ROUGE for base model against the same references as the fine-tuned run
base_rouge_results = rouge.compute(
    predictions=base_predictions,
    references=references,
    use_aggregator=True
)

print("\n" + "=" * 50)
print("ROUGE Evaluation Comparison")
print("=" * 50)
print(f"{'Metric':<15} {'Base Model':<15} {'Fine-tuned':<15} {'Improvement':<15}")
print("-" * 60)
for metric in ['rouge1', 'rouge2', 'rougeL']:
    base_score = base_rouge_results[metric]
    ft_score = rouge_results[metric]
    improvement = ft_score - base_score  # positive means fine-tuning helped
    print(f"{metric.upper():<15} {base_score:<15.4f} {ft_score:<15.4f} {improvement:+.4f}")
print("=" * 50)

# Clean up base model so its memory can be reclaimed
del base_model
if torch.cuda.is_available():
    torch.cuda.empty_cache()

---
## 7. Push Adapters to HuggingFace Hub

Upload the trained LoRA adapters to your HuggingFace account for sharing and deployment.

In [None]:
# Log in to the Hugging Face Hub; the cells below call whoami() and
# push_to_hub(), which need these credentials.
login()

In [None]:
# Look up the logged-in account and derive the target repository id from it
api = HfApi()
user_info = api.whoami()
username = user_info["name"]
print(f"Logged in as: {username}")

# Repository id has the form "<username>/<adapter name>"
repo_name = "/".join((username, ADAPTER_NAME))
print(f"Repository name: {repo_name}")

In [None]:
# Create model card content.
# The card is an f-string: single braces interpolate notebook values, doubled
# braces ({{ }}) emit literal braces inside the usage snippet.
model_card = f"""---
base_model: {MODEL_ID}
library_name: peft
license: apache-2.0
tags:
- qwen2
- qlora
- medical
- healthcare
- chatdoctor
- peft
- lora
datasets:
- lavita/ChatDoctor-HealthCareMagic-100k
language:
- en
---

# {ADAPTER_NAME}

This is a QLoRA fine-tuned adapter for **{MODEL_ID}** trained on the **ChatDoctor-HealthCareMagic-100k** medical Q&A dataset.

## Model Details

- **Base Model**: {MODEL_ID}
- **Fine-tuning Method**: QLoRA (4-bit quantization + LoRA)
- **Dataset**: [ChatDoctor-HealthCareMagic-100k](https://huggingface.co/datasets/lavita/ChatDoctor-HealthCareMagic-100k)
- **Training Samples**: {len(train_dataset):,}
- **Evaluation Samples**: {len(eval_dataset):,}

## LoRA Configuration

- **Rank (r)**: {LORA_R}
- **Alpha**: {LORA_ALPHA}
- **Dropout**: {LORA_DROPOUT}
- **Target Modules**: q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj

## Training Configuration

- **Epochs**: {NUM_EPOCHS}
- **Batch Size**: {BATCH_SIZE}
- **Gradient Accumulation Steps**: {GRADIENT_ACCUMULATION_STEPS}
- **Learning Rate**: {LEARNING_RATE}
- **Max Sequence Length**: {MAX_SEQ_LENGTH}

## Evaluation Results (ROUGE Metrics)

Scores were computed on {len(predictions):,} held-out samples using sampled generations.

| Metric | Score |
|--------|-------|
| ROUGE-1 | {rouge_results['rouge1']:.4f} |
| ROUGE-2 | {rouge_results['rouge2']:.4f} |
| ROUGE-L | {rouge_results['rougeL']:.4f} |

## Usage

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load base model and tokenizer
base_model = AutoModelForCausalLM.from_pretrained(
    "{MODEL_ID}",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained("{MODEL_ID}")

# Load LoRA adapter
model = PeftModel.from_pretrained(base_model, "{repo_name}")

# Generate response
messages = [
    {{"role": "system", "content": "You are a helpful medical assistant."}},
    {{"role": "user", "content": "What are the symptoms of diabetes?"}},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

## Training Framework

- **transformers**: HuggingFace Transformers
- **peft**: Parameter-Efficient Fine-Tuning
- **trl**: Transformer Reinforcement Learning (SFTTrainer)
- **bitsandbytes**: 4-bit quantization

## Disclaimer

This model is for educational and research purposes only. It should not be used as a substitute for professional medical advice, diagnosis, or treatment. Always seek the advice of a qualified healthcare provider.
"""

# Save model card next to the adapter files. The encoding is explicit so the
# file content does not depend on the platform's default locale.
os.makedirs(OUTPUT_DIR, exist_ok=True)
with open(os.path.join(OUTPUT_DIR, "README.md"), "w", encoding="utf-8") as f:
    f.write(model_card)

print("Model card created!")

In [None]:
# Push the model to HuggingFace Hub
print(f"Pushing adapters to HuggingFace Hub: {repo_name}")
print("This may take a few minutes...")

# Push model and tokenizer. `token=True` uses the credentials stored by
# login(); it replaces the deprecated `use_auth_token` argument.
model.push_to_hub(
    repo_name,
    token=True,
    commit_message="Upload QLoRA adapters for Qwen2.5-0.5B-Instruct fine-tuned on ChatDoctor"
)

tokenizer.push_to_hub(
    repo_name,
    token=True,
    commit_message="Upload tokenizer"
)

# The two pushes above serialize the in-memory objects, so the model card that
# was written to OUTPUT_DIR is not part of them. Upload it explicitly, and do
# so last so it replaces any README generated during the model push.
readme_path = os.path.join(OUTPUT_DIR, "README.md")
if os.path.isfile(readme_path):
    api.upload_file(
        path_or_fileobj=readme_path,
        path_in_repo="README.md",
        repo_id=repo_name,
        commit_message="Upload model card",
    )

print(f"\n" + "=" * 50)
print("SUCCESS! Adapters pushed to HuggingFace Hub!")
print(f"Repository URL: https://huggingface.co/{repo_name}")
print("=" * 50)