In [1]:
# 1. Install Unsloth & Core ML Dependencies with specific patches
#    `--no-deps` makes pip install only the packages named on the line and
#    skip their dependency trees; xformers and trl are pinned to exact versions.
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.12.0 triton cut_cross_entropy unsloth_zoo
# Supporting libraries, installed with normal dependency resolution
# (datasets has a minimum version, requests an exact pin).
!pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer requests==2.32.4
# Unsloth itself, again without pulling in its dependencies.
!pip install --no-deps unsloth

# 2. Install RAG & LangChain Ecosystem
#    `-U` upgrades any of these packages that are already installed.
#    faiss-cpu and pypdf back the FAISS index and PDF loader used by the indexing cell.
!pip install -U langchain langchain-community langchain-text-splitters langchain-huggingface faiss-cpu pypdf

Collecting bitsandbytes
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting xformers==0.0.29.post3
  Downloading xformers-0.0.29.post3-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Collecting trl==0.12.0
  Downloading trl-0.12.0-py3-none-any.whl.metadata (10 kB)
Collecting cut_cross_entropy
  Downloading cut_cross_entropy-25.1.1-py3-none-any.whl.metadata (9.3 kB)
Collecting unsloth_zoo
  Downloading unsloth_zoo-2026.1.4-py3-none-any.whl.metadata (32 kB)
Downloading xformers-0.0.29.post3-cp312-cp312-manylinux_2_28_x86_64.whl (43.4 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m43.4/43.4 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.12.0-py3-none-any.whl (310 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m310.2/310

In [5]:
# --- STEP 0: MUST BE FIRST ---
import unsloth
from unsloth import FastLanguageModel

# --- STEP 1: Other Imports ---
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import torch
import os

# --- STEP 2: Load the Quantized Model ---
# Loader options gathered in one place: the 4-bit Llama 3.2 3B Instruct
# checkpoint, a 2048-token sequence limit, and dtype left as None.
load_options = dict(
    model_name="unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

# Hand the loaded model to Unsloth's for_inference before any generation.
FastLanguageModel.for_inference(model)

# --- STEP 3: Index the Document ---
file_path = "medical_guide.pdf"

# Fail fast when the guide is missing. Only printing a message would leave
# `vector_db` undefined (or stale from an earlier run), so the question cells
# would break later with an unrelated-looking error.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"‚ùå Error: '{file_path}' not found in the sidebar.")

# Read the PDF and cut it into overlapping chunks for retrieval
# (chunk_size=700 with chunk_overlap=100).
loader = PyPDFLoader(file_path)
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=100)
docs = text_splitter.split_documents(documents)

# A PDF without extractable text produces no chunks; stop with a clear
# message instead of passing an empty list to the vector store.
if not docs:
    raise ValueError(f"No text chunks could be extracted from '{file_path}'.")

# Lightweight embeddings for faster performance in Colab
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_db = FAISS.from_documents(docs, embeddings)
print(f"‚úÖ Successfully indexed {len(docs)} medical knowledge chunks!")

==((====))==  Unsloth 2026.1.4: Fast Llama patching. Transformers: 4.57.6.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

‚úÖ Successfully indexed 231 medical knowledge chunks!


In [6]:
def ask_medical_assistant(question, k=3, max_new_tokens=256):
    """Answer a question using passages retrieved from the indexed guide.

    Relies on the notebook-level `vector_db`, `tokenizer` and `model` objects
    created by the earlier cells.

    Args:
        question: The user's question as plain text.
        k: Number of chunks to retrieve from the vector store (default 3).
        max_new_tokens: Upper bound on the number of generated tokens
            (default 256).

    Returns:
        The generated answer text with surrounding whitespace removed.
    """
    # 1. Retrieve the top-k most relevant chunks from your PDF
    retrieved_docs = vector_db.similarity_search(question, k=k)
    context = "\n".join(d.page_content for d in retrieved_docs)

    # 2. Construct the RAG Prompt
    #    (the indentation inside the template is part of the text sent to the model)
    prompt = f"""### System:
    You are a professional medical assistant. Use the provided context from the medical guide to answer the question.
    If the answer isn't in the context, state that you don't know based on the guide.

    ### Context:
    {context}

    ### Question:
    {question}

    ### Grounded Response:
    """

    # 3. Generate response using the 4-bit quantized model.
    #    Inputs are moved to whatever device the model lives on instead of a
    #    hard-coded "cuda".
    inputs = tokenizer([prompt], return_tensors = "pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens = max_new_tokens, repetition_penalty=1.2)

    # generate() returns the prompt tokens followed by the continuation.
    # Decode only the continuation, so the answer does not depend on finding
    # a text marker that the model itself might repeat.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = outputs[0][prompt_length:]
    response = tokenizer.decode(new_tokens, skip_special_tokens = True).strip()

    return response

# Smoke test: send one sample question through the pipeline and print the answer.
sample_question = "What are the first steps for treating a severe burn?"
print(ask_medical_assistant(sample_question))


     I do not know based on this guide. The guidance only provides general information about minor burns, but does not specify what should be done when dealing with more serious cases such as those described here.


In [7]:
# Install OpenAI Whisper and audio processing tools
# (Whisper is used by the final cell to transcribe recorded questions).
!pip install openai-whisper ffmpeg-python
# The import comes after the install line so the package is present when it runs.
import whisper
# Pre-load the model here so the final cell is faster:
# the "base" model is loaded once and reused through `whisper_model`.
whisper_model = whisper.load_model("base")

Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[?25l     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/803.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[91m‚ï∏[0m [32m798.7/803.2 kB[0m [31m30.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m803.2/803.2 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Downloading ffmpeg_python-0.2.0-py3-none-any.whl (2

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 139M/139M [00:01<00:00, 114MiB/s]


In [8]:
from IPython.display import display, HTML
from google.colab.output import eval_js
from base64 import b64decode

# UI with both Audio and Textbox.
# The markup offers a record button and a textarea. On submit, the script
# stores {text, audio} in window.finalOutput, where `audio` is a base64 data
# URL of the recording or null if nothing was recorded.
UI_HTML = """
<div style="border: 2px solid #4CAF50; padding: 20px; border-radius: 15px; width: 420px; background-color: #f9f9f9; font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; box-shadow: 0 4px 8px rgba(0,0,0,0.1);">
    <h3 style="color: #2e7d32; margin-top: 0; text-align: center;">üè• Medical Assistant</h3>

    <div style="text-align: center; margin-bottom: 15px;">
        <div id="recording-status" style="color: #666; font-size: 14px; margin-bottom: 5px;">Microphone: Ready</div>
        <button id="record-btn" style="padding: 10px 20px; cursor: pointer; background-color: #4CAF50; color: white; border: none; border-radius: 25px; font-weight: bold; transition: 0.3s;">üé§ Start Recording</button>
    </div>

    <div style="display: flex; align-items: center; margin: 15px 0;">
        <hr style="flex: 1; border: 0; border-top: 1px solid #ccc;">
        <span style="padding: 0 10px; color: #888; font-size: 12px;">OR</span>
        <hr style="flex: 1; border: 0; border-top: 1px solid #ccc;">
    </div>

    <label style="font-weight: bold; display: block; margin-bottom: 8px; color: #333;">Type your medical question:</label>
    <textarea id="text-input" rows="3" style="width: 100%; border-radius: 8px; border: 1px solid #ccc; padding: 10px; box-sizing: border-box; resize: none;" placeholder="e.g., First aid for a burn?"></textarea>

    <button id="submit-btn" style="margin-top: 15px; width: 100%; padding: 12px; cursor: pointer; background-color: #2196F3; color: white; border: none; border-radius: 8px; font-weight: bold; font-size: 16px;">üîç Get Answer</button>
</div>

<script>
(function () {
  // Everything lives inside this function: a global variable named "status"
  // would collide with the browser's built-in window.status string, and the
  // element reference would be lost.
  var recordBtn = document.getElementById('record-btn');
  var statusEl = document.getElementById('recording-status');
  var submitBtn = document.getElementById('submit-btn');
  var textInput = document.getElementById('text-input');
  var recorder, gumStream;
  var audioBase64 = null;
  // True from the start of a recording until its base64 encoding has finished.
  var audioPending = false;

  recordBtn.onclick = async () => {
    if (!recorder || recorder.state === "inactive") {
      try {
        gumStream = await navigator.mediaDevices.getUserMedia({ audio: true });
      } catch (err) {
        statusEl.innerText = "Microphone unavailable: " + err.message;
        return;
      }
      recorder = new MediaRecorder(gumStream);
      var chunks = [];
      audioBase64 = null;
      audioPending = true;
      recorder.ondataavailable = (e) => chunks.push(e.data);
      recorder.onstop = () => {
        var reader = new FileReader();
        reader.onloadend = () => {
          audioBase64 = reader.result;
          audioPending = false;
          statusEl.innerText = "Voice ready to process";
        };
        reader.readAsDataURL(new Blob(chunks));
      };
      recorder.start();
      recordBtn.innerText = "üõë Stop Recording";
      recordBtn.style.backgroundColor = "#f44336";
      statusEl.innerText = "üî¥ RECORDING...";
    } else {
      recorder.stop();
      gumStream.getAudioTracks()[0].stop();
      recordBtn.innerText = "‚úÖ Voice Captured";
      recordBtn.style.backgroundColor = "#4CAF50";
      statusEl.innerText = "Saving recording...";
    }
  };

  submitBtn.onclick = () => {
    var typed = textInput.value;
    // Without typed text the answer depends on the audio, so wait until the
    // recording has been stopped and encoded before handing it over.
    if (!typed.trim() && audioPending) {
      statusEl.innerText = "Recording not finished yet - stop it, then submit again.";
      return;
    }
    window.finalOutput = {
      text: typed,
      audio: audioBase64
    };
  };
})();
</script>
"""

def process_input(poll_interval=0.5):
    """Render the input widget and block until the user presses submit.

    Args:
        poll_interval: Seconds to wait between checks of the browser-side
            `window.finalOutput` value (default 0.5).

    Returns:
        The submitted object as returned by `eval_js`; the page script fills
        it with `text` and `audio` entries.
    """
    import time  # local import keeps this cell self-contained

    display(HTML(UI_HTML))

    while True:
        data = eval_js("window.finalOutput || null")
        if data:
            # Clear the slot so a later poll does not see the same submission again.
            eval_js("window.finalOutput = null")
            return data
        # Pause between polls rather than issuing eval_js calls back-to-back.
        time.sleep(poll_interval)

# --- EXECUTION FLOW ---
# Show the widget, wait for a submission, then route it: typed text wins over
# recorded audio, and a non-empty query is passed to the RAG assistant.
user_data = process_input()
user_query = ""

# 1. Check Text Input First
typed_text = user_data['text'].strip()
if typed_text:
    user_query = typed_text
    print("‚úçÔ∏è Processing Text Input...")
# 2. If no text, check Audio Input
elif user_data['audio']:
    print("üéôÔ∏è Processing Voice Input...")
    # The browser sends a data URL; the base64 payload follows the first comma.
    audio_bytes = b64decode(user_data['audio'].split(',', 1)[1])
    with open('query.wav', 'wb') as f:
        f.write(audio_bytes)

    # Use the whisper_model pre-loaded in the Whisper setup cell
    result = whisper_model.transcribe("query.wav")
    # Strip the transcript so a whitespace-only result (e.g. a silent
    # recording) is not sent on as a question.
    user_query = result["text"].strip()
    if user_query:
        print(f"üó£Ô∏è Transcription: \"{user_query}\"")
    else:
        print("No speech detected in the recording. Please try again.")
else:
    print("‚ö†Ô∏è No input detected. Please record or type a question.")

# 3. Final RAG Output
if user_query:
    print("üìã Retrieving grounded response...")
    response = ask_medical_assistant(user_query)
    print("\n" + "="*50)
    print(f"üè• MEDICAL ASSISTANT:\n{response}")
    print("="*50)

‚úçÔ∏è Processing Text Input...
üìã Retrieving grounded response...

üè• MEDICAL ASSISTANT:

     In case of severe bleeding from injury, prioritize stopping the bleeding over other first-aid actions. This includes applying pressure to the affected area using gauze or a cloth to control the flow of blood. However, do not attempt to stop the bleeding by tying a tourniquet around the affected limb unless instructed to do so by trained personnel. 

      After controlling the bleeding, maintain the casualty's airway, ensure adequate circulation, and monitor vital signs. Then, call 999 or 112 for emergency assistance while keeping the casualty calm and comfortable. Ensure proper positioning of the casualty to minimize discomfort during transportation. Follow established protocols for transporting casualties requiring advanced care.

      If available, use equipment designed specifically for traumatic bleeding management, following manufacturer guidelines and local regulations. Always ke