In [1]:
import warnings

warnings.filterwarnings('ignore')

In [None]:
import subprocess, sys, os

# Suppress llama.cpp C-level logging BEFORE any import
os.environ["LLAMA_LOG_LEVEL"] = "ERROR"
os.environ["GGML_LOG_LEVEL"] = "error"

def pip_install(*pkgs):
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q"] + list(pkgs))

pip_install(
    "fastapi==0.115.0", "uvicorn[standard]==0.30.0", "pydantic>=2.0",
    "numpy>=1.24", "faster-whisper>=1.0.0", "soundfile>=0.12.1",
    "pyngrok==7.1.6", "huggingface_hub",
)
print("‚úÖ Core dependencies installed")

# llama-cpp-python >= 0.3.8 needed for Gemma 3 architecture
subprocess.check_call([
    sys.executable, "-m", "pip", "install", "-q",
    "llama-cpp-python>=0.3.8",
    "--extra-index-url",
    "https://abetlen.github.io/llama-cpp-python/whl/cu124",
])

# Verify GPU offload
import llama_cpp
lib = getattr(llama_cpp, "llama_cpp", None)
gpu_ok = lib.llama_supports_gpu_offload() if lib else False
print(f"‚úÖ llama-cpp-python {llama_cpp.__version__}, GPU offload: {gpu_ok}")
if not gpu_ok:
    print("‚ö†Ô∏è  No GPU offload ‚Äî LLM will be slow (CPU only)")

ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    yes
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 2 CUDA devices:
  Device 0: Tesla T4, compute capability 7.5, VMM: yes
  Device 1: Tesla T4, compute capability 7.5, VMM: yes


‚úÖ llama-cpp-python 0.3.16, GPU offload: True


In [3]:
import os,subprocess,sys
from pathlib import Path

REPO_DIR = Path("/kaggle/working/OR-Simulation")
if not REPO_DIR.exists():
    subprocess.check_call(["git", "clone",
        "https://github.com/Aditya-Lingam-9000/OR-Simulation.git",
        str(REPO_DIR)])
else:
    subprocess.check_call(["git", "-C", str(REPO_DIR), "pull"])

os.chdir(REPO_DIR)
sys.path.insert(0, str(REPO_DIR))
print(f"‚úÖ Repo ready at {REPO_DIR}")

Already up to date.
‚úÖ Repo ready at /kaggle/working/OR-Simulation


In [None]:
from huggingface_hub import hf_hub_download
from pathlib import Path

REPO_DIR = Path("/kaggle/working/OR-Simulation")

# --- ASR: faster-whisper base.en ---
# No manual download needed ‚Äî faster-whisper auto-downloads from
# HuggingFace on first transcription (~140 MB, cached in ~/.cache/)
print("‚úÖ ASR: faster-whisper base.en (auto-downloads on first use)")

# --- MedGemma LLM (GGUF Q3_K_M, ~1.8 GB) ---
GGUF_DIR = REPO_DIR / "onnx_models" / "medgemma"
GGUF_DIR.mkdir(parents=True, exist_ok=True)

LLM_MODEL = GGUF_DIR / "medgemma-4b-it-Q3_K_M.gguf"
if not LLM_MODEL.exists():
    print("‚¨áÔ∏è  Downloading MedGemma GGUF (~1.8 GB)...")
    downloaded = hf_hub_download(
        repo_id="unsloth/medgemma-4b-it-GGUF",
        filename="medgemma-4b-it-Q3_K_M.gguf",
        local_dir=str(GGUF_DIR),
    )
    # Verify it landed in the right place
    import shutil
    if not LLM_MODEL.exists() and Path(downloaded).exists():
        shutil.copy2(downloaded, LLM_MODEL)

if LLM_MODEL.exists():
    sz = LLM_MODEL.stat().st_size
    print(f"‚úÖ MedGemma: {sz / 1e9:.2f} GB")
    if sz < 100_000_000:
        print(f"‚ö†Ô∏è  File suspiciously small ({sz} bytes) ‚Äî may be corrupt")
else:
    print("‚ùå MedGemma download failed ‚Äî check logs above")



‚úÖ MedASR model downloaded
‚úÖ MedGemma model downloaded


In [None]:
import threading, time, uvicorn, urllib.request

os.environ["LLM_STUB"] = "0" if LLM_MODEL.exists() else "1"
os.environ["LLM_REAL"] = "1" if LLM_MODEL.exists() else "0"

from src.api.app import app

PORT = 8000

threading.Thread(
    target=lambda: uvicorn.run(app, host="0.0.0.0", port=PORT, log_level="info"),
    daemon=True,
).start()
print(f"‚è≥ Starting server on port {PORT}...")
time.sleep(8)

try:
    resp = urllib.request.urlopen(f"http://127.0.0.1:{PORT}/health")
    print(f"‚úÖ Server running: {resp.read().decode()}")
except Exception as e:
    print(f"‚ùå Server health check failed: {e}")

INFO:     Started server process [290]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     127.0.0.1:59386 - "GET /health HTTP/1.1" 200 OK
‚úÖ Server running: {"status":"ok","timestamp":"2026-02-22T17:21:03.042490+00:00","surgery_loaded":"PCNL","disclaimer":"OR-Symphony is a SIMULATION and RESEARCH system only. It does NOT control real medical devices. All outputs are suggestions requiring human confirmation."}


In [8]:
from pyngrok import ngrok
from kaggle_secrets import UserSecretsClient

secrets = UserSecretsClient()
ngrok.set_auth_token(secrets.get_secret("NGROK_TOKEN"))
tunnel = ngrok.connect(PORT, "http")
PUBLIC_URL = tunnel.public_url

print("=" * 60)
print(f"  BACKEND URL: {PUBLIC_URL}")
print(f"  FRONTEND:    http://localhost:3000/?backend={PUBLIC_URL}")
print("=" * 60)

  BACKEND URL: https://unhurting-nonmediative-deegan.ngrok-free.dev
  FRONTEND:    http://localhost:3000/?backend=https://unhurting-nonmediative-deegan.ngrok-free.dev


In [None]:
print("üîÑ Server running. Keep this tab open.")
print(f"   Frontend: http://localhost:3000/?backend={PUBLIC_URL}\n")

try:
    while True:
        time.sleep(30)
        try:
            resp = urllib.request.urlopen(f"http://127.0.0.1:{PORT}/health", timeout=5)
            status = "‚úÖ alive"
        except Exception:
            status = "‚ö†Ô∏è check"
        print(f"[{time.strftime('%H:%M:%S')}] {status} | {PUBLIC_URL}")
except KeyboardInterrupt:
    print("\nüõë Shutting down...")
    ngrok.disconnect(PUBLIC_URL)
    print("Tunnel closed.")

INFO:     2409:408c:1b8b:234c:5050:a6fd:91e:2e9c:0 - "GET /health HTTP/1.1" 200 OK


INFO:     ('2409:408c:1b8b:234c:5050:a6fd:91e:2e9c', 0) - "WebSocket /ws/state" [accepted]
INFO:     connection open


INFO:     2409:408c:1b8b:234c:5050:a6fd:91e:2e9c:0 - "GET /health HTTP/1.1" 200 OK
INFO:     2409:408c:1b8b:234c:5050:a6fd:91e:2e9c:0 - "GET /health HTTP/1.1" 200 OK


INFO:     ('2409:408c:1b8b:234c:5050:a6fd:91e:2e9c', 0) - "WebSocket /ws/state" [accepted]
INFO:     connection open
INFO:     ('157.48.152.254', 0) - "WebSocket /ws/state" [accepted]
INFO:     connection open


INFO:     127.0.0.1:40356 - "GET /health HTTP/1.1" 200 OK


INFO:     connection closed
INFO:     connection closed
INFO:     connection closed


INFO:     2409:408c:1b8b:234c:5050:a6fd:91e:2e9c:0 - "GET /machines HTTP/1.1" 200 OK
INFO:     2409:408c:1b8b:234c:5050:a6fd:91e:2e9c:0 - "GET /surgeries HTTP/1.1" 200 OK
INFO:     2409:408c:1b8b:234c:5050:a6fd:91e:2e9c:0 - "GET /health HTTP/1.1" 200 OK
INFO:     2409:408c:1b8b:234c:5050:a6fd:91e:2e9c:0 - "GET /surgeries HTTP/1.1" 200 OK
INFO:     2409:408c:1b8b:234c:5050:a6fd:91e:2e9c:0 - "GET /machines HTTP/1.1" 200 OK


INFO:     ('2409:408c:1b8b:234c:5050:a6fd:91e:2e9c', 0) - "WebSocket /ws/state" [accepted]
INFO:     connection open
INFO:     ('2409:408c:1b8b:234c:5050:a6fd:91e:2e9c', 0) - "WebSocket /ws/audio" [accepted]
INFO:     connection open
Window decode failed (Invalid rank for input: mask Got: 1 Expected: 2 Please fix either the inputs/outputs or the model.), skipping
Window decode failed (Invalid rank for input: mask Got: 1 Expected: 2 Please fix either the inputs/outputs or the model.), skipping
Window decode failed (Invalid rank for input: mask Got: 1 Expected: 2 Please fix either the inputs/outputs or the model.), skipping
Window decode failed (Invalid rank for input: mask Got: 1 Expected: 2 Please fix either the inputs/outputs or the model.), skipping
Window decode failed (Invalid rank for input: mask Got: 1 Expected: 2 Please fix either the inputs/outputs or the model.), skipping
Window decode failed (Invalid rank for input: mask Got: 1 Expected: 2 Please fix either the inputs/outputs

PyngrokNgrokURLError: ngrok client exception, URLError: [Errno 111] Connection refused