In [1]:
!pip install unsloth fastapi uvicorn bitsandbytes accelerate



In [2]:
# from unsloth import FastLanguageModel

# model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name=model_name,
#     max_seq_length=2048,
#     dtype="float16",
#     load_in_4bit=True,      # T4 Friendly
# )



from unsloth import FastLanguageModel
import torch
import os
os.environ["ACCELERATE_DISABLE_DEVICE_MAP"] = "1"




ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!


In [3]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="llama3-medical-lora",
    load_in_4bit=True,
    max_seq_length=2048,
    dtype=torch.float16,
)


==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.11.3 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [4]:

model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
)

Unsloth: Already have LoRA adapters! We shall skip this step.


In [5]:
import json

input_file = "medical_data.jsonl"
output_file = "medical_fixed.jsonl"

with open(input_file, "r") as f_in, open(output_file, "w") as f_out:
    for line in f_in:
        # replace single quotes with double quotes
        line = line.replace("'", '"')
        # parse to ensure valid JSON, then write
        data = json.loads(line)
        f_out.write(json.dumps(data) + "\n")

print("Fixed file saved as medical_fixed.jsonl")


Fixed file saved as medical_fixed.jsonl


In [6]:
from datasets import load_dataset

from datasets import load_dataset

train_data = load_dataset("json", data_files="medical_data.jsonl")["train"]
print(train_data.column_names)

['instruction', 'input', 'output']


In [7]:
def format_fn(example):
    system_prompt = "You are a medical assistant. Provide clear, safe information."

    instruction = example["instruction"]
    input_text = example["input"]
    output = example["output"]

    if input_text.strip() == "":
        prompt = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n{instruction} [/INST] {output}</s>"
    else:
        prompt = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n{instruction}\n{input_text} [/INST] {output}</s>"

    return {"text": prompt}


In [8]:
print(train_data[0])

{'instruction': 'fever', 'input': '', 'output': 'Follow-up question: What symptoms do you have besides fever (e.g., cough, headache, chills, sore throat)? Possible causes: Viral infections, bacterial infections, heat-related illness. What you should do: Rest, hydrate, monitor temperature, take paracetamol if appropriate. Avoid aspirin in children. Red flags: >40Â°C fever, difficulty breathing, confusion, seizure. Disclaimer: General info, not medical advice.'}


In [9]:
# def format_fn(example):
#     instruction = example["instruction"]
#     input_text = example["input"]
#     output = example["output"]

#     if input_text.strip() == "":
#         prompt = f"### Instruction:\n{instruction}\n\n### Response:\n{output}"
#     else:
#         prompt = (
#             f"### Instruction:\n{instruction}\n\n"
#             f"### Input:\n{input_text}\n\n"
#             f"### Response:\n{output}"
#         )

#     return {"text": prompt}


In [10]:
train_data = train_data.map(format_fn)


In [11]:
train_data = train_data.map(
    format_fn,
    remove_columns=train_data.column_names
)


In [12]:
from trl import SFTTrainer
from transformers import TrainingArguments

FastLanguageModel.for_training(model)

training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    logging_steps=50,
    save_steps=500,
    output_dir="llama3-medical-lora",
    fp16=True, # Recommended for T4
    report_to="none", # Disable wandb for now to avoid potential login issues
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    dataset_text_field="text",
    max_seq_length=2048,
    args=training_args,
)

trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 25 | Num Epochs = 3 | Total steps = 12
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 6,815,744 of 8,037,076,992 (0.08% trained)


Step,Training Loss


TrainOutput(global_step=12, training_loss=1.2396705945332844, metrics={'train_runtime': 63.1988, 'train_samples_per_second': 1.187, 'train_steps_per_second': 0.19, 'total_flos': 308642390212608.0, 'train_loss': 1.2396705945332844, 'epoch': 3.0})

In [13]:
model.save_pretrained("llama3-medical-lora")
tokenizer.save_pretrained("llama3-medical-lora")


('llama3-medical-lora/tokenizer_config.json',
 'llama3-medical-lora/special_tokens_map.json',
 'llama3-medical-lora/chat_template.jinja',
 'llama3-medical-lora/tokenizer.json')

In [14]:
model.save_pretrained(
    "llama3-medical-lora",
    save_base_model = True,    # <-- REQUIRED
)
tokenizer.save_pretrained("llama3-medical-lora")


('llama3-medical-lora/tokenizer_config.json',
 'llama3-medical-lora/special_tokens_map.json',
 'llama3-medical-lora/chat_template.jinja',
 'llama3-medical-lora/tokenizer.json')

In [15]:
from unsloth import FastLanguageModel
import torch

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="llama3-medical-lora",
    load_in_4bit=True,
)

FastLanguageModel.for_inference(model)

prompt = "fever"

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
output = model.generate(**inputs, max_new_tokens=200)
print(tokenizer.decode(output[0], skip_special_tokens=True))


==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
fever, vomiting, lethargy, abdominal pain, dehydration, weight loss, jaundice.
Follow-up: Duration of fever, any blood in vomit, dark urine. Possible causes: Viral gastroenteritis, urinary tract infection, biliary colic, hepatitis. What to do: Hydrate, monitor urine output. Red flags: Fever >39Â°C, blood in vomit, persistent vomiting. Disclaimer. Disclaimer: General information only. Possible causes include serious conditions. Do NOT delay. Medical attention. Follow-up. What to do. Disclaimer. Disclaimer. Disclaimer. Disclaimer. Disclaim

In [16]:
!ls llama3-medical-lora


adapter_config.json	   checkpoint-12	    tokenizer_config.json
adapter_model.safetensors  README.md		    tokenizer.json
chat_template.jinja	   special_tokens_map.json
