In [1]:

# Install Unsloth
!pip install unsloth
!pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers trl peft accelerate bitsandbytes








Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-w4r1alq1/unsloth_59085995f84d4d57857520ceab931c4b
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-w4r1alq1/unsloth_59085995f84d4d57857520ceab931c4b
  Resolved https://github.com/unslothai/unsloth.git to commit 910385afc3691cdee2420b4878f2e8a12fe226e5
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
#Load Model & Configure LoRA

from unsloth import FastLanguageModel
import torch

max_seq_length = 2048
dtype = None  # Auto-detect
load_in_4bit = True

# Load model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-v0.3-bnb-4bit",
    # Other great options:
    # "unsloth/llama-3.2-3b-bnb-4bit"
    # "unsloth/Phi-3.5-mini-instruct"
    # "unsloth/gemma-2-9b-bnb-4bit"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Add LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

print("‚úÖ Model loaded and LoRA configured!")

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2026.2.1: Fast Mistral patching. Transformers: 4.57.6.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.563 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.34. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.14G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/157 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Unsloth 2026.2.1 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


‚úÖ Model loaded and LoRA configured!


In [6]:
from datasets import load_dataset

# Load ChatDoctor healthcare dataset
dataset = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k", split="train")

# Define medical prompt template
medical_prompt = """Below is a medical question from a patient. Provide a helpful, accurate medical response.

### Question:
{}

### Patient Context:
{}

### Medical Response:
{}"""

EOS_TOKEN = tokenizer.eos_token

# Format the dataset
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]
    texts = []
    for instruction, input_text, output in zip(instructions, inputs, outputs):
        # Format with the medical prompt template
        text = medical_prompt.format(instruction, input_text, output) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

# Apply formatting
dataset = dataset.map(formatting_prompts_func, batched=True)

# Split into train/eval
dataset = dataset.train_test_split(test_size=0.05, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

print(f"‚úÖ Dataset prepared!")
print(f"üìä Train examples: {len(train_dataset)}")
print(f"üìä Eval examples: {len(eval_dataset)}")
print(f"\nüìã Sample:\n{train_dataset[0]['text'][:300]}...")

README.md:   0%|          | 0.00/542 [00:00<?, ?B/s]

data/train-00000-of-00001-5e7cb295b9cff0(‚Ä¶):   0%|          | 0.00/70.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/112165 [00:00<?, ? examples/s]

Map:   0%|          | 0/112165 [00:00<?, ? examples/s]

‚úÖ Dataset prepared!
üìä Train examples: 106556
üìä Eval examples: 5609

üìã Sample:
Below is a medical question from a patient. Provide a helpful, accurate medical response.

### Question:
If you are a doctor, please answer the medical questions based on the patient's description.

### Patient Context:
Hi, my son who is 5 months old seemed to have what looked like a headband on his...


In [8]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq

# Training arguments (CORRECTED)
training_args = TrainingArguments(
    output_dir = "./medical_lora_model",
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    max_steps = 500,  # Increase for better results (1000-2000)
    # num_train_epochs = 1,  # Alternative to max_steps
    learning_rate = 2e-4,
    fp16 = not torch.cuda.is_bf16_supported(),
    bf16 = torch.cuda.is_bf16_supported(),
    logging_steps = 10,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    save_strategy = "steps",
    save_steps = 100,
    eval_strategy = "steps",  # CHANGED: was evaluation_strategy
    eval_steps = 100,
    load_best_model_at_end = True,
)

# Initialize trainer
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False,
    args = training_args,
)

print("‚úÖ Trainer configured!")

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/106556 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/5609 [00:00<?, ? examples/s]

‚úÖ Trainer configured!


In [9]:
# Show GPU memory before training
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"üîß GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"üíæ {start_gpu_memory} GB of memory reserved.")

# Start training
print("\nüöÄ Starting training...")
trainer_stats = trainer.train()

# Show training results
print("\n‚úÖ Training complete!")
print(f"‚è±Ô∏è  Training time: {trainer_stats.metrics['train_runtime']:.2f} seconds")
print(f"üìà Final loss: {trainer_stats.metrics['train_loss']:.4f}")

# Show GPU memory after training
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"üíæ Peak reserved memory = {used_memory} GB ({used_percentage}%)")
print(f"üíæ Memory used for LoRA = {used_memory_for_lora} GB ({lora_percentage}%)")

üîß GPU = Tesla T4. Max memory = 14.563 GB.
üíæ 10.174 GB of memory reserved.

üöÄ Starting training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 106,556 | Num Epochs = 1 | Total steps = 500
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 7,289,966,592 (0.58% trained)
wandb: (1) Create a W&B account
wandb: (2) Use an existing W&B account
wandb: (3) Don't visualize my results
wandb: Enter your choice:wandb: You chose "Don't visualize my results"
wandb: Using W&B in offline mode.
wandb: W&B API key is configured. Use `wandb login --relogin` to force relogin


wandb: Detected [huggingface_hub.inference, openai] in use.
wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/


Step,Training Loss,Validation Loss
100,1.8651,1.821332


Unsloth: Not an error, but MistralForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


Step,Training Loss,Validation Loss
100,1.8651,1.821332
200,1.8279,1.791295
300,1.7667,1.764812
400,1.7611,1.749286
500,1.7743,1.740648




0,1
eval/loss,‚ñà‚ñÖ‚ñÉ‚ñÇ‚ñÅ
eval/runtime,‚ñà‚ñÉ‚ñÇ‚ñà‚ñÅ
eval/samples_per_second,‚ñÅ‚ñà‚ñà‚ñÅ‚ñà
eval/steps_per_second,‚ñÅ‚ñà‚ñà‚ñÅ‚ñà
train/epoch,‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà
train/global_step,‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà‚ñà
train/grad_norm,‚ñá‚ñÇ‚ñÜ‚ñÇ‚ñÖ‚ñÑ‚ñÉ‚ñÖ‚ñÑ‚ñÉ‚ñÑ‚ñÜ‚ñÅ‚ñÉ‚ñÖ‚ñÖ‚ñÑ‚ñÉ‚ñÖ‚ñÖ‚ñÉ‚ñá‚ñÑ‚ñÑ‚ñà‚ñà‚ñá‚ñÑ‚ñÖ‚ñá‚ñÑ‚ñÖ‚ñÑ‚ñÖ‚ñÉ‚ñÉ‚ñÖ‚ñÑ‚ñÖ‚ñÉ
train/learning_rate,‚ñà‚ñà‚ñà‚ñà‚ñá‚ñá‚ñá‚ñá‚ñá‚ñá‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÅ
train/loss,‚ñà‚ñÑ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÉ‚ñÇ‚ñÇ‚ñÉ‚ñÇ‚ñÉ‚ñÉ‚ñÇ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ

0,1
eval/loss,1.74065
eval/runtime,2230.8982
eval/samples_per_second,2.514
eval/steps_per_second,1.257
total_flos,6.088480370132582e+16
train/epoch,0.03754
train/global_step,500
train/grad_norm,0.58704
train/learning_rate,0.0
train/loss,1.7743



‚úÖ Training complete!
‚è±Ô∏è  Training time: 14492.66 seconds
üìà Final loss: 1.8195
üíæ Peak reserved memory = 12.57 GB (86.315%)
üíæ Memory used for LoRA = 2.396 GB (16.453%)


In [10]:
# Enable fast inference mode
FastLanguageModel.for_inference(model)

# Test with a medical question
def test_model(question, context=""):
    prompt = medical_prompt.format(question, context, "")
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        use_cache=True
    )

    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    # Extract just the response part
    response = response.split("### Medical Response:")[-1].strip()
    return response

# Test examples
test_questions = [
    "What are the common symptoms of Type 2 diabetes?",
    "How can I manage high blood pressure naturally?",
    "What should I do if I have a persistent headache for 3 days?"
]

print("\nüß™ Testing the fine-tuned model:\n")
for i, question in enumerate(test_questions, 1):
    print(f"Question {i}: {question}")
    response = test_model(question)
    print(f"Response: {response}\n")
    print("-" * 80 + "\n")


üß™ Testing the fine-tuned model:

Question 1: What are the common symptoms of Type 2 diabetes?
Response: Hello, I understand your concern. Type 2 diabetes is a condition in which your blood glucose levels are too high. In the long run, it can cause serious problems such as heart disease, stroke, kidney failure, nerve damage, blindness, and amputation of the limbs. You should know that it can be prevented by doing regular physical activity, maintaining a healthy weight and eating healthy. If you have a family history of diabetes, you should be careful. I hope my answer was helpful. If you have further questions, I will be glad to help you. Kind regards!

--------------------------------------------------------------------------------

Question 2: How can I manage high blood pressure naturally?
Response: Hello, Thanks for the query. You can manage the high blood pressure with these measures

--------------------------------------------------------------------------------

Question 3: 

In [11]:
# Option 1: Save as merged model (recommended for Ollama)
print("üíæ Saving merged model for Ollama...")
model.save_pretrained_merged(
    "medical_mistral_merged",
    tokenizer,
    save_method = "merged_16bit",  # or "merged_4bit" for smaller size
)

print("‚úÖ Model saved to: medical_mistral_merged/")

# Option 2: Save LoRA adapters only (smaller, but needs base model)
print("\nüíæ Saving LoRA adapters...")
model.save_pretrained("medical_lora_adapters")
tokenizer.save_pretrained("medical_lora_adapters")
print("‚úÖ LoRA adapters saved to: medical_lora_adapters/")

üíæ Saving merged model for Ollama...


config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Checking cache directory for required files...
Cache check failed: model-00001-of-00003.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  33%|‚ñà‚ñà‚ñà‚ñé      | 1/3 [01:39<03:19, 99.63s/it]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 2/3 [04:44<02:30, 150.04s/it]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [05:48<00:00, 116.23s/it]
Unsloth: Merging weights into 16bit: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [02:49<00:00, 56.54s/it]


Unsloth: Merge process complete. Saved to `/content/medical_mistral_merged`
‚úÖ Model saved to: medical_mistral_merged/

üíæ Saving LoRA adapters...
‚úÖ LoRA adapters saved to: medical_lora_adapters/


In [12]:
# Convert to GGUF format (Ollama's preferred format)
print("\nüîÑ Converting to GGUF format for Ollama...")

# Quantization options (choose one):
# - "q4_k_m" - 4-bit, good balance (recommended)
# - "q5_k_m" - 5-bit, better quality
# - "q8_0" - 8-bit, best quality but larger

model.save_pretrained_gguf(
    "medical_mistral_gguf",
    tokenizer,
    quantization_method = "q4_k_m",  # 4-bit quantization
)

print("‚úÖ GGUF model saved to: medical_mistral_gguf/")
print("\nüì¶ Files ready for Ollama:")
print("   - medical_mistral_gguf/unsloth.Q4_K_M.gguf")


üîÑ Converting to GGUF format for Ollama...
Unsloth: Merging model weights to 16-bit format...
Found HuggingFace hub cache directory: /root/.cache/huggingface/hub


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Checking cache directory for required files...
Cache check failed: model-00001-of-00003.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  33%|‚ñà‚ñà‚ñà‚ñé      | 1/3 [01:54<03:49, 114.58s/it]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 2/3 [03:09<01:31, 91.22s/it] 

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [04:11<00:00, 83.90s/it]
Unsloth: Merging weights into 16bit: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [03:29<00:00, 69.84s/it]


Unsloth: Merge process complete. Saved to `/content/medical_mistral_gguf`
Unsloth: Converting to GGUF format...
==((====))==  Unsloth: Conversion from HF to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF f16 might take 3 minutes.
\        /    [2] Converting GGUF f16 to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: Updating system package directories
Unsloth: All required system packages already installed!
Unsloth: Install llama.cpp and building - please wait 1 to 3 minutes
Unsloth: Cloning llama.cpp repository
Unsloth: Install GGUF and other packages
Unsloth: Successfully installed llama.cpp!
Unsloth: Preparing converter script...




Unsloth: [1] Converting model into f16 GGUF format.
This might take 3 minutes...
Unsloth: Initial conversion completed! Files: ['medical_mistral_gguf_gguf/mistral-7b-v0.3.F16.gguf']
Unsloth: [2] Converting GGUF f16 into q4_k_m. This might take 10 minutes...
Unsloth: Model files cleanup...
Unsloth: All GGUF conversions completed successfully!
Generated files: ['medical_mistral_gguf_gguf/mistral-7b-v0.3.Q4_K_M.gguf']
Unsloth: No Ollama template mapping found for model 'unsloth/mistral-7b-v0.3'. Skipping Ollama Modelfile
Unsloth: example usage for text only LLMs: llama.cpp/llama-cli --model medical_mistral_gguf_gguf/mistral-7b-v0.3.Q4_K_M.gguf -p "why is the sky blue?"
‚úÖ GGUF model saved to: medical_mistral_gguf/

üì¶ Files ready for Ollama:
   - medical_mistral_gguf/unsloth.Q4_K_M.gguf


In [None]:
print("DONE")