In [1]:
import warnings

warnings.filterwarnings('ignore')

In [2]:
import subprocess, sys

def pip_install(*pkgs):
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q"] + list(pkgs))

pip_install(
    "fastapi==0.115.0", "uvicorn[standard]==0.30.0", "pydantic>=2.0",
    "numpy>=1.24", "sherpa-onnx>=1.12.0", "soundfile>=0.12.1",
    "pyngrok==7.1.6", "torchaudio>=2.0", "huggingface_hub",
)

# llama-cpp-python >= 0.3.8 needed for Gemma 3 architecture
subprocess.check_call([
    sys.executable, "-m", "pip", "install", "-q",
    "llama-cpp-python>=0.3.8",
    "--extra-index-url",
    "https://abetlen.github.io/llama-cpp-python/whl/cu124",
])

# Verify
import llama_cpp
lib = getattr(llama_cpp, "llama_cpp", None)
gpu_ok = lib.llama_supports_gpu_offload() if lib else False
print(f"✅ llama-cpp-python {llama_cpp.__version__}, GPU offload: {gpu_ok}")

ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    yes
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 2 CUDA devices:
  Device 0: Tesla T4, compute capability 7.5, VMM: yes
  Device 1: Tesla T4, compute capability 7.5, VMM: yes


✅ llama-cpp-python 0.3.16, GPU offload: True


In [3]:
import os,subprocess,sys
from pathlib import Path

REPO_DIR = Path("/kaggle/working/OR-Simulation")
if not REPO_DIR.exists():
    subprocess.check_call(["git", "clone",
        "https://github.com/Aditya-Lingam-9000/OR-Simulation.git",
        str(REPO_DIR)])
else:
    subprocess.check_call(["git", "-C", str(REPO_DIR), "pull"])

os.chdir(REPO_DIR)
sys.path.insert(0, str(REPO_DIR))
print(f"✅ Repo ready at {REPO_DIR}")

Already up to date.
✅ Repo ready at /kaggle/working/OR-Simulation


In [4]:
from huggingface_hub import hf_hub_download

# --- MedASR (ONNX int8, ~150 MB) ---
ONNX_DIR = REPO_DIR / "onnx_models" / "medasr"
ONNX_DIR.mkdir(parents=True, exist_ok=True)

hf_hub_download(
    repo_id="csukuangfj/sherpa-onnx-medasr-ctc-en-int8-2025-12-25",
    filename="model.int8.onnx",
    local_dir=str(ONNX_DIR),
)
hf_hub_download(
    repo_id="csukuangfj/sherpa-onnx-medasr-ctc-en-int8-2025-12-25",
    filename="tokens.txt",
    local_dir=str(ONNX_DIR),
)
print("✅ MedASR model downloaded")

# --- MedGemma (GGUF Q3_K_M, ~1.8 GB) ---
GGUF_DIR = REPO_DIR / "onnx_models" / "medgemma"
GGUF_DIR.mkdir(parents=True, exist_ok=True)

hf_hub_download(
    repo_id="unsloth/medgemma-4b-it-GGUF",
    filename="medgemma-4b-it-Q3_K_M.gguf",
    local_dir=str(GGUF_DIR),
)
print("✅ MedGemma model downloaded")



✅ MedASR model downloaded
✅ MedGemma model downloaded


In [5]:
##### DIAGNOSTIC CELL — run this after Cell 3 (download) to check what's happening
import os
from pathlib import Path

REPO_DIR = Path("/kaggle/working/OR-Simulation")

# 1. Check where the GGUF file actually is
print("=== Searching for .gguf files ===")
for root, dirs, files in os.walk("/kaggle/working"):
    for f in files:
        if f.endswith(".gguf"):
            full = os.path.join(root, f)
            sz = os.path.getsize(full)
            print(f"  {full}  ({sz / 1e9:.2f} GB)")

# 2. Check expected path
expected = REPO_DIR / "onnx_models" / "medgemma" / "medgemma-4b-it-Q3_K_M.gguf"
print(f"\nExpected path: {expected}")
print(f"  Exists: {expected.exists()}")
if expected.exists():
    print(f"  Size: {expected.stat().st_size / 1e9:.2f} GB")

# 3. Check llama-cpp GPU support
import llama_cpp
lib = getattr(llama_cpp, "llama_cpp", None)
gpu_ok = lib.llama_supports_gpu_offload() if lib else False
print(f"\nllama-cpp GPU offload: {gpu_ok}")

# 4. Try loading with 0 GPU layers
if expected.exists():
    print("\nAttempting load with n_gpu_layers=-1...")
    try:
        from llama_cpp import Llama
        m = Llama(model_path=str(expected), n_ctx=512, n_gpu_layers=-1, verbose=False)
        print("✅ Load succeeded (GPU mode)")
        # del m
    except Exception as e:
        print(f"❌ Load failed: {e}")

=== Searching for .gguf files ===
  /kaggle/working/OR-Simulation/onnx_models/medgemma/medgemma-4b-it-Q3_K_M.gguf  (2.10 GB)

Expected path: /kaggle/working/OR-Simulation/onnx_models/medgemma/medgemma-4b-it-Q3_K_M.gguf
  Exists: True
  Size: 2.10 GB

llama-cpp GPU offload: True

Attempting load with n_gpu_layers=-1...


llama_context: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_kv_cache_unified_iswa: using full-size SWA cache (ref: https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)


✅ Load succeeded (GPU mode)


In [6]:
import os
os.environ["LLAMA_LOG_LEVEL"] = "ERROR"

In [7]:
import threading, time, uvicorn, urllib.request

LLM_MODEL = GGUF_DIR / "medgemma-4b-it-Q3_K_M.gguf"
ASR_MODEL = ONNX_DIR / "model.int8.onnx"

os.environ["LLM_STUB"] = "0" if LLM_MODEL.exists() else "1"
os.environ["LLM_REAL"] = "1" if LLM_MODEL.exists() else "0"

from src.api.app import app

PORT = 8000
threading.Thread(target=lambda: uvicorn.run(app, host="0.0.0.0", port=PORT, log_level="info"), daemon=True).start()
time.sleep(8)

resp = urllib.request.urlopen(f"http://127.0.0.1:{PORT}/health")
print(f"✅ Server running: {resp.read().decode()}")




INFO:     Started server process [290]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     127.0.0.1:59386 - "GET /health HTTP/1.1" 200 OK
✅ Server running: {"status":"ok","timestamp":"2026-02-22T17:21:03.042490+00:00","surgery_loaded":"PCNL","disclaimer":"OR-Symphony is a SIMULATION and RESEARCH system only. It does NOT control real medical devices. All outputs are suggestions requiring human confirmation."}


In [8]:
from pyngrok import ngrok
from kaggle_secrets import UserSecretsClient

secrets = UserSecretsClient()
ngrok.set_auth_token(secrets.get_secret("NGROK_TOKEN"))
tunnel = ngrok.connect(PORT, "http")
PUBLIC_URL = tunnel.public_url

print("=" * 60)
print(f"  BACKEND URL: {PUBLIC_URL}")
print(f"  FRONTEND:    http://localhost:3000/?backend={PUBLIC_URL}")
print("=" * 60)

  BACKEND URL: https://unhurting-nonmediative-deegan.ngrok-free.dev
  FRONTEND:    http://localhost:3000/?backend=https://unhurting-nonmediative-deegan.ngrok-free.dev


In [9]:
import urllib.request
try:
    while True:
        time.sleep(30)
        try:
            urllib.request.urlopen(f"http://127.0.0.1:{PORT}/health", timeout=5)
        except: pass
except KeyboardInterrupt:
    ngrok.disconnect(PUBLIC_URL)

INFO:     2409:408c:1b8b:234c:5050:a6fd:91e:2e9c:0 - "GET /health HTTP/1.1" 200 OK


INFO:     ('2409:408c:1b8b:234c:5050:a6fd:91e:2e9c', 0) - "WebSocket /ws/state" [accepted]
INFO:     connection open


INFO:     2409:408c:1b8b:234c:5050:a6fd:91e:2e9c:0 - "GET /health HTTP/1.1" 200 OK
INFO:     2409:408c:1b8b:234c:5050:a6fd:91e:2e9c:0 - "GET /health HTTP/1.1" 200 OK


INFO:     ('2409:408c:1b8b:234c:5050:a6fd:91e:2e9c', 0) - "WebSocket /ws/state" [accepted]
INFO:     connection open
INFO:     ('157.48.152.254', 0) - "WebSocket /ws/state" [accepted]
INFO:     connection open


INFO:     127.0.0.1:40356 - "GET /health HTTP/1.1" 200 OK


INFO:     connection closed
INFO:     connection closed
INFO:     connection closed


INFO:     2409:408c:1b8b:234c:5050:a6fd:91e:2e9c:0 - "GET /machines HTTP/1.1" 200 OK
INFO:     2409:408c:1b8b:234c:5050:a6fd:91e:2e9c:0 - "GET /surgeries HTTP/1.1" 200 OK
INFO:     2409:408c:1b8b:234c:5050:a6fd:91e:2e9c:0 - "GET /health HTTP/1.1" 200 OK
INFO:     2409:408c:1b8b:234c:5050:a6fd:91e:2e9c:0 - "GET /surgeries HTTP/1.1" 200 OK
INFO:     2409:408c:1b8b:234c:5050:a6fd:91e:2e9c:0 - "GET /machines HTTP/1.1" 200 OK


INFO:     ('2409:408c:1b8b:234c:5050:a6fd:91e:2e9c', 0) - "WebSocket /ws/state" [accepted]
INFO:     connection open
INFO:     ('2409:408c:1b8b:234c:5050:a6fd:91e:2e9c', 0) - "WebSocket /ws/audio" [accepted]
INFO:     connection open
Window decode failed (Invalid rank for input: mask Got: 1 Expected: 2 Please fix either the inputs/outputs or the model.), skipping
Window decode failed (Invalid rank for input: mask Got: 1 Expected: 2 Please fix either the inputs/outputs or the model.), skipping
Window decode failed (Invalid rank for input: mask Got: 1 Expected: 2 Please fix either the inputs/outputs or the model.), skipping
Window decode failed (Invalid rank for input: mask Got: 1 Expected: 2 Please fix either the inputs/outputs or the model.), skipping
Window decode failed (Invalid rank for input: mask Got: 1 Expected: 2 Please fix either the inputs/outputs or the model.), skipping
Window decode failed (Invalid rank for input: mask Got: 1 Expected: 2 Please fix either the inputs/outputs

PyngrokNgrokURLError: ngrok client exception, URLError: [Errno 111] Connection refused