From d6477926fe93c58fdb7bdbdfb59dce44f665792d Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Thu, 19 Mar 2026 14:11:40 +0000 Subject: [PATCH 1/2] refactor: deduplicate KMS auth helpers between onboard and upgrade_authority Move shared helper functions (dstack_client, app_attest, pad64, ensure_self_kms_allowed, ensure_kms_allowed) into upgrade_authority.rs and reuse them from onboard_service.rs. Remove the no-op ensure_remote_kms_allowed wrapper. --- kms/src/main_service/upgrade_authority.rs | 40 ++++++++++++-- kms/src/onboard_service.rs | 63 ++--------------------- 2 files changed, 40 insertions(+), 63 deletions(-) diff --git a/kms/src/main_service/upgrade_authority.rs b/kms/src/main_service/upgrade_authority.rs index d2b64016..6b7ace06 100644 --- a/kms/src/main_service/upgrade_authority.rs +++ b/kms/src/main_service/upgrade_authority.rs @@ -2,7 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 -use crate::config::AuthApi; +use crate::config::{AuthApi, KmsConfig}; use anyhow::{bail, Context, Result}; use dstack_guest_agent_rpc::{ dstack_guest_client::DstackGuestClient, AttestResponse, RawQuoteArgs, @@ -188,19 +188,51 @@ fn url_join(url: &str, path: &str) -> String { url } -fn dstack_client() -> DstackGuestClient { +pub(crate) fn dstack_client() -> DstackGuestClient { let address = dstack_types::dstack_agent_address(); let http_client = PrpcClient::new(address); DstackGuestClient::new(http_client) } -async fn app_attest(report_data: Vec) -> Result { +pub(crate) async fn app_attest(report_data: Vec) -> Result { dstack_client().attest(RawQuoteArgs { report_data }).await } -fn pad64(hash: [u8; 32]) -> Vec { +pub(crate) fn pad64(hash: [u8; 32]) -> Vec { let mut padded = Vec::with_capacity(64); padded.extend_from_slice(&hash); padded.resize(64, 0); padded } + +pub(crate) async fn ensure_self_kms_allowed(cfg: &KmsConfig) -> Result<()> { + let boot_info = local_kms_boot_info(cfg.pccs_url.as_deref()) + .await + .context("failed to build local KMS boot info")?; + let 
response = cfg + .auth_api + .is_app_allowed(&boot_info, true) + .await + .context("failed to call KMS auth check")?; + if !response.is_allowed { + bail!("boot denied: {}", response.reason); + } + Ok(()) +} + +pub(crate) async fn ensure_kms_allowed( + cfg: &KmsConfig, + attestation: &VerifiedAttestation, +) -> Result<()> { + let boot_info = build_boot_info(attestation, false, "") + .context("failed to build KMS boot info from attestation")?; + let response = cfg + .auth_api + .is_app_allowed(&boot_info, true) + .await + .context("failed to call KMS auth check")?; + if !response.is_allowed { + bail!("boot denied: {}", response.reason); + } + Ok(()) +} diff --git a/kms/src/onboard_service.rs b/kms/src/onboard_service.rs index 93eeb562..64b2390b 100644 --- a/kms/src/onboard_service.rs +++ b/kms/src/onboard_service.rs @@ -5,9 +5,6 @@ use std::sync::{Arc, Mutex}; use anyhow::{bail, Context, Result}; -use dstack_guest_agent_rpc::{ - dstack_guest_client::DstackGuestClient, AttestResponse, RawQuoteArgs, -}; use dstack_kms_rpc::{ kms_client::KmsClient, onboard_server::{OnboardRpc, OnboardServer}, @@ -15,7 +12,6 @@ use dstack_kms_rpc::{ OnboardResponse, }; use fs_err as fs; -use http_client::prpc::PrpcClient; use k256::ecdsa::SigningKey; use ra_rpc::{ client::{CertInfo, RaClient, RaClientConfig}, @@ -30,7 +26,9 @@ use safe_write::safe_write; use crate::{ config::KmsConfig, - main_service::upgrade_authority::{build_boot_info, local_kms_boot_info}, + main_service::upgrade_authority::{ + app_attest, dstack_client, ensure_kms_allowed, ensure_self_kms_allowed, pad64, + }, }; #[derive(Clone)] @@ -260,7 +258,7 @@ impl Keys { .map_err(|_| anyhow::anyhow!("source attestation mutex poisoned"))? 
.clone() .context("Missing source KMS attestation")?; - ensure_remote_kms_allowed(cfg, &source_attestation) + ensure_kms_allowed(cfg, &source_attestation) .await .context("Source KMS is not allowed for onboarding")?; @@ -349,52 +347,6 @@ pub(crate) async fn bootstrap_keys(cfg: &KmsConfig) -> Result<()> { Ok(()) } -fn dstack_client() -> DstackGuestClient { - let address = dstack_types::dstack_agent_address(); - let http_client = PrpcClient::new(address); - DstackGuestClient::new(http_client) -} - -async fn app_attest(report_data: Vec) -> Result { - dstack_client().attest(RawQuoteArgs { report_data }).await -} - -async fn ensure_self_kms_allowed(cfg: &KmsConfig) -> Result<()> { - let boot_info = local_kms_boot_info(cfg.pccs_url.as_deref()) - .await - .context("Failed to build local KMS boot info")?; - let response = cfg - .auth_api - .is_app_allowed(&boot_info, true) - .await - .context("Failed to call KMS auth check")?; - if !response.is_allowed { - bail!("Boot denied: {}", response.reason); - } - Ok(()) -} - -async fn ensure_remote_kms_allowed( - cfg: &KmsConfig, - attestation: &VerifiedAttestation, -) -> Result<()> { - ensure_kms_allowed(cfg, attestation).await -} - -async fn ensure_kms_allowed(cfg: &KmsConfig, attestation: &VerifiedAttestation) -> Result<()> { - let boot_info = build_boot_info(attestation, false, "") - .context("Failed to build KMS boot info from attestation")?; - let response = cfg - .auth_api - .is_app_allowed(&boot_info, true) - .await - .context("Failed to call KMS auth check")?; - if !response.is_allowed { - bail!("Boot denied: {}", response.reason); - } - Ok(()) -} - async fn attest_keys(p256_pubkey: &[u8], k256_pubkey: &[u8]) -> Result> { let p256_hex = hex::encode(p256_pubkey); let k256_hex = hex::encode(k256_pubkey); @@ -412,13 +364,6 @@ fn keccak256(msg: &[u8]) -> [u8; 32] { hasher.finalize().into() } -fn pad64(hash: [u8; 32]) -> Vec { - let mut padded = Vec::with_capacity(64); - padded.extend_from_slice(&hash); - padded.resize(64, 0); 
- padded -} - async fn gen_ra_cert(ca_cert_pem: String, ca_key_pem: String) -> Result<(String, String)> { use ra_tls::cert::CertRequest; use ra_tls::rcgen::{KeyPair, PKCS_ECDSA_P256_SHA256}; From c561a7f459737f9d57a9a8ee3c51f7bd4b440e9e Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Thu, 19 Mar 2026 15:18:51 +0000 Subject: [PATCH 2/2] docs: update KMS test guides with findings from integration run - Add QEMU user-mode networking note (host at 10.0.2.2 from CVM) - Document empty osImageHash in remote KMS attestation and the need for "0x" in osImages for receiver-side onboard checks - Recommend port forwarding over gateway for simpler test setup - Note that source_url must be CVM-reachable, not 127.0.0.1 - Update auth config templates with "0x" in osImages --- tests/docs/kms-bootstrap-onboard.md | 8 +++- tests/docs/kms-self-authorization.md | 55 ++++++++++++++++++---------- 2 files changed, 41 insertions(+), 22 deletions(-) diff --git a/tests/docs/kms-bootstrap-onboard.md b/tests/docs/kms-bootstrap-onboard.md index 9b810160..3c242186 100644 --- a/tests/docs/kms-bootstrap-onboard.md +++ b/tests/docs/kms-bootstrap-onboard.md @@ -75,8 +75,10 @@ Operational notes: 1. Prefer a **prebuilt KMS image**. 2. `Boot Progress: done` does **not** guarantee the onboard endpoint is ready. 3. The onboarding completion endpoint is **GET `/finish`**. -4. On teepod, onboard mode usually uses the `-8000` URL, while runtime TLS KMS RPC usually uses the `-8000s` URL. +4. On teepod with gateway, onboard mode usually uses the `-8000` URL, while runtime TLS KMS RPC usually uses the `-8000s` URL. **Port forwarding** (`--port tcp:0.0.0.0:<port>:8000`) is simpler than gateway for testing, because gateway requires the auth API to return a `gatewayAppId` at boot time. 5. If you use a very small custom webhook instead of the real auth service, `KMS.GetMeta` may fail because `auth_api.get_info()` expects extra chain / contract metadata fields.
In that case, use `GetTempCaCert` as the runtime readiness probe. +6. dstack CVMs use QEMU user-mode networking — the host is reachable at **`10.0.2.2`** from inside the CVM. The `source_url` in `Onboard.Onboard` must use a CVM-reachable address (e.g., `https://10.0.2.2:<port>/prpc`), not `127.0.0.1`. +7. **Remote KMS attestation has an empty `osImageHash`.** When the receiver verifies the source KMS during onboard, the `osImageHash` is empty because `vm_config` is unavailable for remote attestation. Auth configs for receiver-side checks must include `"0x"` in the `osImages` array. --- @@ -99,7 +101,7 @@ Use two independently controllable auth services: They can be: -1. host-local if reachable by CVMs +1. **Preferred:** host-local, accessed from CVMs via `http://10.0.2.2:<port>` (QEMU host gateway) 2. public services 3. sidecars inside each KMS deployment @@ -107,6 +109,8 @@ At minimum, both policies must allow the KMS instance they serve. During onboard For `auth-simple`, `kms.mrAggregated = []` is a deny-all policy for KMS. Add the current KMS MR values explicitly when switching a test from deny to allow. +Include `"0x"` in the `osImages` array for configs used in receiver-side onboard checks (see operational note 7 above). + ### 4.3 Deploy `kms-src` and `kms-dst` Deploy both KMS instances in onboard mode with: diff --git a/tests/docs/kms-self-authorization.md b/tests/docs/kms-self-authorization.md index 45448463..91c40a6f 100644 --- a/tests/docs/kms-self-authorization.md +++ b/tests/docs/kms-self-authorization.md @@ -11,18 +11,22 @@ The goal is to validate the following behaviors without depending on `kms/e2e/` This guide is written as a deployment-and-test runbook so an AI agent can follow it end-to-end. -> **Execution notes from a real run on teepod2 (2026-03-19):** +> **Execution notes from real runs on teepod2 (2026-03-19):** > > 1. Do **not** assume a host-local `auth-simple` instance is reachable from a CVM.
In practice, the auth API must be: > - publicly reachable by the CVM, or > - deployed as a sidecar/internal service inside the same test environment. -> 2. For PR validation, prefer a **prebuilt KMS test image**. The run documented here used `cr.kvin.wang/dstack-kms:kms-auth-checks-157ad4ba`. +> - hosted on the operator machine and addressed as **`10.0.2.2`** — dstack CVMs use QEMU user-mode networking, so the host is reachable at that address from inside the CVM. +> 2. For PR validation, prefer a **prebuilt KMS test image**. > 3. `Boot Progress: done` only means the VM guest boot finished. It does **not** guarantee the KMS onboard endpoint is already ready. > 4. If you inject helper scripts through `docker-compose.yaml`, prefer inline `configs.content` over `configs.file` unless you have confirmed the extra files are copied into the deployment bundle. > 5. The onboard completion endpoint is **GET `/finish`**, not POST. > 6. Do **not** reuse a previously captured `mr_aggregated` across redeploys. Auth policies must be generated from the attestation of the **current** VM under test. > 7. KMS now always requires quote/attestation. For local development without TDX hardware, use `sdk/simulator` instead of trying to run a no-attestation KMS flow. > 8. For `auth-simple`, `kms.mrAggregated = []` is a deny-all policy for KMS. Use that as the baseline deny configuration, then add the measured KMS MR values for allow cases. +> 9. **Port forwarding is simpler than gateway for testing.** Using `--gateway` requires the auth API to return a valid `gatewayAppId`, which adds unnecessary complexity. Use `--port tcp:0.0.0.0:<port>:8000` instead. +> 10. **Remote KMS attestation has an empty `osImageHash`.** When the receiver verifies the source KMS during onboard, the `osImageHash` field in the attestation is empty (because `vm_config` is not available for the remote attestation). Auth configs for receiver-side checks must include `"0x"` in the `osImages` array to match this empty hash. +> 11.
The `source_url` in the `Onboard.Onboard` request must use an address **reachable from inside the CVM** (e.g., `https://10.0.2.2:<port>/prpc`), not `127.0.0.1`, which is the CVM's own loopback. --- @@ -119,10 +123,10 @@ Strong recommendation for this manual test: Using a prebuilt image significantly reduces ambiguity when a failure happens: you can focus on KMS authorization logic rather than image build or registry behavior. -Teepod/gateway URL convention observed during a real run: +If you use teepod gateway instead of port forwarding: - **onboard mode:** use the `-8000` style URL - **runtime TLS KMS RPC after bootstrap/onboard:** use the `-8000s` style URL +- **onboard mode:** use the `-8000` style URL (plain HTTP) +- **runtime TLS KMS RPC after bootstrap/onboard:** use the `-8000s` style URL (TLS passthrough) Do not assume the same external URL works before and after onboarding is finished. @@ -144,9 +148,9 @@ The original plan was to run two host-local `auth-simple` processes. In practice Choose one of these options: -1. **Preferred:** deploy the auth API as a separate public service or CVM -2. **Also fine:** run the auth API as a sidecar in the same KMS test deployment -3. **Only if reachable:** run `auth-simple` on the operator host and point KMS at that reachable host/IP +1. **Preferred:** run `auth-simple` on the operator host and point KMS at `http://10.0.2.2:<port>` (QEMU host gateway). This is the simplest option if the CVMs use QEMU user-mode networking. +2. **Also fine:** deploy the auth API as a separate public service or CVM +3. **Sidecar:** run the auth API as a sidecar in the same KMS test deployment If you use the sidecar/public-service pattern, keep the same logical split: @@ -224,12 +228,17 @@ Requirements for **both** VMs: - `core.onboard.auto_bootstrap_domain = ""` - `core.auth_api.type = "webhook"` -Point them at different auth services or sidecars: +Point them at different auth services.
If using host-local `auth-simple` with QEMU user-mode networking: -- `kms-src` → `http://:3101` -- `kms-dst` → `http://:3102` +- `kms-src` → `http://10.0.2.2:3101` +- `kms-dst` → `http://10.0.2.2:3102` -If you use sidecars instead of host-local auth servers, replace those URLs with the sidecar/internal service addresses. +**Recommended deploy method:** use port forwarding (`--port`) instead of gateway. Gateway requires the auth API to return a `gatewayAppId` at boot, which makes testing harder. With port forwarding, the KMS onboard and runtime endpoints are directly accessible on the host: + +```bash +vmm-cli.py deploy --name kms-src ... --port tcp:0.0.0.0:9301:8000 +vmm-cli.py deploy --name kms-dst ... --port tcp:0.0.0.0:9302:8000 +``` If you need an example deployment template, adapt the flow in: @@ -238,14 +247,18 @@ If you need an example deployment template, adapt the flow in: Record these values: ```bash -export KMS_SRC_ONBOARD='https:///' -export KMS_DST_ONBOARD='https:///' +# With port forwarding: +export KMS_SRC_ONBOARD='http://127.0.0.1:9301' +export KMS_DST_ONBOARD='http://127.0.0.1:9302' +export KMS_SRC_RUNTIME='https://127.0.0.1:9301' +export KMS_DST_RUNTIME='https://127.0.0.1:9302' ``` Notes: -- The onboard endpoint is plain onboarding mode, so use `Onboard.*` -- The runtime KMS endpoint is available only after bootstrap/onboard and `/finish` +- The onboard endpoint serves plain HTTP, so use `http://` for `KMS_*_ONBOARD` +- After bootstrap/onboard + `/finish`, the KMS restarts with TLS — use `https://` for `KMS_*_RUNTIME` +- The `source_url` in `Onboard.Onboard` must be reachable from inside the CVM (e.g., `https://10.0.2.2:9301/prpc`) Wait until the onboard endpoint is actually ready before continuing. 
A simple probe loop is recommended: @@ -300,12 +313,14 @@ All three values above are expected to be hex strings **without** the `0x` prefi #### Deny-by-MR config -Use a wrong `mrAggregated` value while allowing the observed OS image: +Use a wrong `mrAggregated` value while allowing the observed OS image. + +> **Important:** include `"0x"` in `osImages` to handle remote KMS attestation during onboard receiver-side checks, where `osImageHash` is empty because `vm_config` is unavailable for the remote attestation. ```bash cat > /tmp/kms-self-auth/deny-by-mr.json <<'EOF' { - "osImages": ["0xREPLACE_OS"], + "osImages": ["0xREPLACE_OS", "0x"], "gatewayAppId": "any", "kms": { "mrAggregated": ["0x0000000000000000000000000000000000000000000000000000000000000000"], @@ -322,7 +337,7 @@ EOF ```bash cat > /tmp/kms-self-auth/allow-single.json <<'EOF' { - "osImages": ["0xREPLACE_OS"], + "osImages": ["0xREPLACE_OS", "0x"], "gatewayAppId": "any", "kms": { "mrAggregated": ["0xREPLACE_MR"], @@ -339,7 +354,7 @@ EOF ```bash cat > /tmp/kms-self-auth/allow-src-and-dst.json <<'EOF' { - "osImages": ["0xREPLACE_SRC_OS", "0xREPLACE_DST_OS"], + "osImages": ["0xREPLACE_SRC_OS", "0xREPLACE_DST_OS", "0x"], "gatewayAppId": "any", "kms": { "mrAggregated": ["0xREPLACE_SRC_MR", "0xREPLACE_DST_MR"],