diff --git a/.gitignore b/.gitignore index a57cf7a..4cdba61 100755 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ pdfs/ docs/rendered/ discussions/ .DS_Store +.gstack/ diff --git a/Cargo.lock b/Cargo.lock index 3c50a51..e6d205c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -861,6 +861,7 @@ dependencies = [ "extenddb-server", "extenddb-storage", "extenddb-storage-postgres", + "extenddb-storage-tidb", "libc", "rand 0.9.4", "rcgen", @@ -1003,6 +1004,35 @@ dependencies = [ "zeroize", ] +[[package]] +name = "extenddb-storage-tidb" +version = "0.1.0" +dependencies = [ + "aes-gcm", + "anyhow", + "async-trait", + "base64 0.22.1", + "bcrypt", + "crc32fast", + "extenddb-auth", + "extenddb-core", + "extenddb-storage", + "futures", + "inventory", + "percent-encoding", + "rand 0.9.4", + "serde", + "serde_json", + "sqlx", + "time", + "tokio", + "toml", + "tracing", + "url", + "uuid", + "zeroize", +] + [[package]] name = "find-msvc-tools" version = "0.1.9" diff --git a/Cargo.toml b/Cargo.toml index b558112..c9dfd07 100755 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ members = [ "crates/engine", "crates/storage", "crates/storage-postgres", + "crates/storage-tidb", "crates/auth", "crates/server", "crates/bin", @@ -24,6 +25,7 @@ extenddb-core = { path = "crates/core" } extenddb-engine = { path = "crates/engine" } extenddb-storage = { path = "crates/storage" } extenddb-storage-postgres = { path = "crates/storage-postgres" } +extenddb-storage-tidb = { path = "crates/storage-tidb" } extenddb-auth = { path = "crates/auth" } extenddb-server = { path = "crates/server" } @@ -51,7 +53,7 @@ hyper = { version = "1" } inventory = "0.3" # Database -sqlx = { version = "0.8", features = ["runtime-tokio-rustls", "postgres", "json", "time", "uuid", "bigdecimal"] } +sqlx = { version = "0.8", features = ["runtime-tokio-rustls", "postgres", "mysql", "json", "time", "uuid", "bigdecimal"] } # Crypto & checksums crc32fast = "1" @@ -85,6 +87,8 @@ metrics-exporter-prometheus = "0.16" # Config clap 
= { version = "4", features = ["derive"] } config = "0.14" +percent-encoding = "2" +url = "2" # Security zeroize = { version = "1.8", features = ["derive"] } diff --git a/README.md b/README.md index 8831a1d..61b1523 100755 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ A DynamoDB-compatible API adapter, ExtendDB speaks the DynamoDB wire protocol - **Local development** — run DynamoDB workloads on your laptop with zero cloud dependency - **CI/CD pipelines** — deterministic integration tests against a DynamoDB-compatible backend - **Self-hosted deployments** — run DynamoDB workloads on your own infrastructure (on-premises, private cloud, edge) -- **Multi-cloud** — use DynamoDB semantics on any cloud that runs PostgreSQL +- **Multi-cloud** — use DynamoDB semantics on any cloud that runs a supported storage backend - **Air-gapped environments** — DynamoDB functionality with no internet connectivity ## Features @@ -21,7 +21,7 @@ A DynamoDB-compatible API adapter, ExtendDB speaks the DynamoDB wire protocol - CSRF protection, security headers, session management - Prometheus-compatible metrics endpoint - Daemon mode with syslog logging -- PostgreSQL storage — use standard backup, replication, and HA tools +- Pluggable storage backends — PostgreSQL by default, TiDB as an optional in-tree backend ## Quick Start @@ -50,7 +50,7 @@ scripts/install-macos.sh # macOS ## Prerequisites - Rust 1.85+ (`rustup update`) -- PostgreSQL 14+ (see `docs/local-postgres-setup.md`) +- A supported storage backend: PostgreSQL 14+ by default, or TiDB when building with the `tidb` feature - Python 3.10+ (for test suites and documentation) ### Python Environment @@ -168,6 +168,7 @@ crates/ engine/ — operation handlers storage/ — storage trait definitions storage-postgres/ — PostgreSQL backend + storage-tidb/ — TiDB backend auth/ — SigV4 verification, IAM policy engine server/ — HTTP server, management API, web console bin/ — CLI, config, daemon lifecycle diff --git a/crates/bin/Cargo.toml 
b/crates/bin/Cargo.toml index 2a2c646..15ef8dd 100755 --- a/crates/bin/Cargo.toml +++ b/crates/bin/Cargo.toml @@ -14,12 +14,14 @@ path = "src/main.rs" [features] default = ["postgres"] postgres = ["extenddb-storage-postgres"] +tidb = ["extenddb-storage-tidb"] [dependencies] extenddb-core = { workspace = true } extenddb-engine = { workspace = true } extenddb-storage = { workspace = true } extenddb-storage-postgres = { workspace = true, optional = true } +extenddb-storage-tidb = { workspace = true, optional = true } extenddb-auth = { workspace = true } extenddb-server = { workspace = true } tokio = { workspace = true } diff --git a/crates/bin/src/cmd_destroy.rs b/crates/bin/src/cmd_destroy.rs index 3c2c24d..fa4f7af 100755 --- a/crates/bin/src/cmd_destroy.rs +++ b/crates/bin/src/cmd_destroy.rs @@ -6,6 +6,7 @@ //! Reads config, enumerates tables, requires `--yes` to confirm, drops both databases. use clap::Args; +use extenddb_storage::bootstrapper::BootstrapOptions; use crate::config; @@ -16,13 +17,13 @@ pub struct DestroyArgs { #[arg(short, long, default_value = "extenddb.toml")] config: String, - /// PostgreSQL admin user (for DROP DATABASE) - #[arg(long, default_value_t = config::whoami("postgres"))] - pg_user: String, + /// Storage admin user (for DROP DATABASE) + #[arg(long = "storage-admin-user")] + storage_admin_user: Option, - /// PostgreSQL admin password - #[arg(long)] - pg_pass: Option, + /// Storage admin password + #[arg(long = "storage-admin-password")] + storage_admin_password: Option, /// Confirm destruction (required, no interactive prompt) #[arg(long)] @@ -40,16 +41,23 @@ pub async fn run(args: DestroyArgs) -> anyhow::Result<()> { let app_config = config::load(&args.config)?; let backend = &app_config.storage._backend; - // Collect CLI args for backend-specific parsing - let cli_args: Vec = std::env::args().collect(); + let bootstrap_options = BootstrapOptions { + admin_user: args.storage_admin_user.clone(), + admin_password: 
args.storage_admin_password.clone(), + ..BootstrapOptions::default() + }; println!("=== extenddb destroy ==="); println!("Config: {}", args.config); println!(); // Create bootstrap store for catalog queries and database teardown. - let bootstrap = - extenddb_storage::bootstrapper::create_bootstrapper(backend, &args.config, &cli_args).await; + let bootstrap = extenddb_storage::bootstrapper::create_bootstrapper( + backend, + &args.config, + bootstrap_options.clone(), + ) + .await; let mut data_db = String::new(); @@ -92,8 +100,7 @@ pub async fn run(args: DestroyArgs) -> anyhow::Result<()> { } // For drop, we need a fresh bootstrap store connected as admin (not to the - // catalog DB we're about to drop). The existing bootstrap store's admin pool - // connects to the `postgres` database, so we can reuse it. + // catalog DB we're about to drop). if !data_db.is_empty() { // Defense-in-depth: validate even though this came from the catalog. config::validate_identifier(backend, &data_db, "data database name")?; @@ -102,10 +109,13 @@ pub async fn run(args: DestroyArgs) -> anyhow::Result<()> { // Reconnect as admin for DDL operations (the catalog pool must be dropped // before we can DROP DATABASE). 
drop(bootstrap); - let bootstrap = - extenddb_storage::bootstrapper::create_bootstrapper(backend, &args.config, &cli_args) - .await - .map_err(|e| anyhow::anyhow!("Cannot connect as admin: {e:?}"))?; + let bootstrap = extenddb_storage::bootstrapper::create_bootstrapper( + backend, + &args.config, + bootstrap_options, + ) + .await + .map_err(|e| anyhow::anyhow!("Cannot connect as admin: {e:?}"))?; bootstrap .drop_databases(&data_db) diff --git a/crates/bin/src/cmd_init.rs b/crates/bin/src/cmd_init.rs index 444f1a3..24ad6f9 100755 --- a/crates/bin/src/cmd_init.rs +++ b/crates/bin/src/cmd_init.rs @@ -9,6 +9,7 @@ use std::path::Path; use clap::Args; +use extenddb_storage::bootstrapper::BootstrapOptions; use crate::config; use crate::init_helpers::{generate_config, generate_tls_cert_if_needed}; @@ -16,8 +17,8 @@ use crate::init_helpers::{generate_config, generate_tls_cert_if_needed}; #[derive(Args)] #[allow(clippy::doc_markdown)] // Clap help text, not rustdoc pub struct InitArgs { - /// Storage backend (postgres) (default: postgres) - #[arg(long, default_value = "postgres")] + /// Storage backend name + #[arg(long)] backend: Option, /// Data database name (default: extenddb) @@ -28,21 +29,21 @@ pub struct InitArgs { #[arg(long)] catalog_db: Option, - /// PostgreSQL host - #[arg(long)] - pg_host: Option, + /// Storage host + #[arg(long = "storage-host")] + storage_host: Option, - /// PostgreSQL port - #[arg(long)] - pg_port: Option, + /// Storage port + #[arg(long = "storage-port")] + storage_port: Option, - /// PostgreSQL admin user (for CREATE DATABASE) - #[arg(long)] - pg_user: Option, + /// Storage admin user (for CREATE DATABASE) + #[arg(long = "storage-admin-user")] + storage_admin_user: Option, - /// PostgreSQL admin password (required for remote/Aurora connections). - #[arg(long)] - pg_pass: Option, + /// Storage admin password (required for remote connections). 
+ #[arg(long = "storage-admin-password")] + storage_admin_password: Option, /// extenddb application user #[arg(long)] @@ -114,14 +115,14 @@ fn discover_docs_dir() -> Option { /// Returns exit code: 0 = success, 255 = existing config preserved. pub async fn run(args: InitArgs) -> anyhow::Result { - // Determine backend: CLI flag > config file > default + // Determine backend: CLI flag > config file > compiled default. let backend = if let Some(ref b) = args.backend { b.clone() } else if Path::new(&args.config).exists() { let app_config = config::load(&args.config)?; app_config.storage._backend } else { - "postgres".to_owned() + config::default_backend() }; println!("=== extenddb init (backend: {backend}) ==="); @@ -136,14 +137,23 @@ pub async fn run(args: InitArgs) -> anyhow::Result { return Ok(255); } - // Collect CLI args for backend-specific parsing - let cli_args: Vec = std::env::args().collect(); - // Create bootstrapper via registry (no hardcoded match!) - let bootstrapper = - extenddb_storage::bootstrapper::create_bootstrapper(&backend, &args.config, &cli_args) - .await - .map_err(|e| anyhow::anyhow!("{e:?}"))?; + let bootstrapper = extenddb_storage::bootstrapper::create_bootstrapper( + &backend, + &args.config, + BootstrapOptions { + storage_host: args.storage_host.clone(), + storage_port: args.storage_port, + admin_user: args.storage_admin_user.clone(), + admin_password: args.storage_admin_password.clone(), + data_db: args.data_db.clone(), + catalog_db: args.catalog_db.clone(), + app_user: args.extenddb_user.clone(), + app_password: args.extenddb_pass.clone(), + }, + ) + .await + .map_err(|e| anyhow::anyhow!("{e:?}"))?; // Ensure application user exists. 
bootstrapper @@ -234,9 +244,10 @@ pub async fn run(args: InitArgs) -> anyhow::Result { ); } - // Extract bind_addr from CLI args - let bind_addr = - extract_arg(&cli_args, "--bind-addr").unwrap_or_else(|| "127.0.0.1".to_string()); + let bind_addr = args + .bind_addr + .clone() + .unwrap_or_else(|| "127.0.0.1".to_string()); // Generate self-signed TLS certificate if not already present. // Include the server bind address as a SAN so the cert matches the URL. @@ -261,7 +272,13 @@ pub async fn run(args: InitArgs) -> anyhow::Result { if Path::new(config_path).exists() { std::fs::remove_file(config_path)?; } - generate_config(config_path, &catalog_url, &bind_addr, docs_dir.as_deref())?; + generate_config( + config_path, + &backend, + &catalog_url, + &bind_addr, + docs_dir.as_deref(), + )?; println!( "\n=== extenddb init complete ===\nStart the server with: extenddb serve --config {config_path}" @@ -269,8 +286,3 @@ pub async fn run(args: InitArgs) -> anyhow::Result { Ok(0) } - -/// Extract a CLI argument value by flag name. -fn extract_arg(args: &[String], flag: &str) -> Option { - args.windows(2).find(|w| w[0] == flag).map(|w| w[1].clone()) -} diff --git a/crates/bin/src/cmd_migrate.rs b/crates/bin/src/cmd_migrate.rs index 300d790..9dc4a21 100755 --- a/crates/bin/src/cmd_migrate.rs +++ b/crates/bin/src/cmd_migrate.rs @@ -6,6 +6,7 @@ //! Reads current catalog version, runs pending migrations, and reports the result. 
use clap::Args; +use extenddb_storage::bootstrapper::BootstrapOptions; use crate::config; @@ -15,13 +16,13 @@ pub struct MigrateArgs { #[arg(short, long, default_value = "extenddb.toml")] config: String, - /// `PostgreSQL` admin user (for catalog migrations) - #[arg(long)] - pg_user: Option, + /// Storage admin user (for catalog migrations) + #[arg(long = "storage-admin-user")] + storage_admin_user: Option, - /// `PostgreSQL` admin password - #[arg(long)] - pg_pass: Option, + /// Storage admin password + #[arg(long = "storage-admin-password")] + storage_admin_password: Option, /// Confirm migration (required, no interactive prompt) #[arg(long)] @@ -43,14 +44,18 @@ pub async fn run(args: MigrateArgs) -> anyhow::Result<()> { println!("Config: {}", args.config); println!(); - // Collect CLI args for backend-specific parsing - let cli_args: Vec = std::env::args().collect(); - // Create bootstrapper via registry - let bootstrap = - extenddb_storage::bootstrapper::create_bootstrapper(backend, &args.config, &cli_args) - .await - .map_err(|e| anyhow::anyhow!("{e:?}"))?; + let bootstrap = extenddb_storage::bootstrapper::create_bootstrapper( + backend, + &args.config, + BootstrapOptions { + admin_user: args.storage_admin_user.clone(), + admin_password: args.storage_admin_password.clone(), + ..BootstrapOptions::default() + }, + ) + .await + .map_err(|e| anyhow::anyhow!("{e:?}"))?; // Show current version. println!("--- Checking current catalog version..."); @@ -62,9 +67,12 @@ pub async fn run(args: MigrateArgs) -> anyhow::Result<()> { println!(" Current version: {current_display}"); let expected = bootstrap.expected_catalog_version(); - if current.as_deref() == Some(expected.as_str()) { + let catalog_version_matches = current.as_deref() == Some(expected.as_str()); + if catalog_version_matches && !args.yes { println!(); - println!("Catalog is up to date (version {expected}). No migrations needed."); + println!( + "Catalog version is current ({expected}). 
Use --yes to check and apply backend schema migrations." + ); return Ok(()); } @@ -79,6 +87,10 @@ pub async fn run(args: MigrateArgs) -> anyhow::Result<()> { .run_catalog_migrations() .await .map_err(|e| anyhow::anyhow!("{e:?}"))?; + bootstrap + .run_data_migrations() + .await + .map_err(|e| anyhow::anyhow!("{e:?}"))?; // Read new version. let new = bootstrap diff --git a/crates/bin/src/cmd_serve.rs b/crates/bin/src/cmd_serve.rs index 49e51cc..ae13c15 100755 --- a/crates/bin/src/cmd_serve.rs +++ b/crates/bin/src/cmd_serve.rs @@ -67,17 +67,14 @@ pub fn run(args: &ServeArgs) -> anyhow::Result<()> { ); } - // Check backend is supported by this build + // Check backend is supported by this build. let backend = &app_config.storage._backend; - #[cfg(not(feature = "postgres"))] - if backend == "postgres" { - anyhow::bail!("PostgreSQL backend not enabled. Rebuild with --features postgres"); - } - #[cfg(feature = "postgres")] - if backend != "postgres" { + let available_backends = extenddb_storage::operations::list_operations_backends(); + if !available_backends.iter().any(|b| b == backend) { anyhow::bail!( - "Unknown backend '{}'. This build only supports 'postgres'.", - backend + "Unknown backend '{}'. This build supports: {}.", + backend, + available_backends.join(", ") ); } @@ -175,8 +172,8 @@ async fn serve( run_dir: String, ) -> anyhow::Result<()> { // CB-27: Clean up PID file if serve() fails before reaching the HTTP - // server (e.g., Postgres connection failure). The PID file was already - // written by Daemonize in run(). + // server (for example, storage connection failure). The PID file was + // already written by Daemonize in run(). 
let pid_path = pid_file_path(&run_dir, port); let backend = app_config.storage._backend.clone(); let result = serve_inner(app_config, std_listener, port, run_dir, backend).await; @@ -243,9 +240,11 @@ async fn serve_inner( } // Create server components via factory pattern + let runtime_storage_config = + config::RuntimeStorageConfig::new(app_config.storage.as_trait(), &app_config.limits); let components = extenddb_storage::create_server_components( &backend, - app_config.storage.as_trait(), + &runtime_storage_config, &app_config.server.region, ) .await?; diff --git a/crates/bin/src/config.rs b/crates/bin/src/config.rs index 01e0d81..5e23f37 100755 --- a/crates/bin/src/config.rs +++ b/crates/bin/src/config.rs @@ -113,7 +113,7 @@ impl Default for TlsConfig { #[derive(Debug, Clone)] pub struct StorageConfig { - /// Storage backend selector (e.g. "postgres"). + /// Storage backend selector. pub _backend: String, /// Backend-specific configuration (trait object). config: Box, @@ -144,6 +144,89 @@ impl StorageConfig { } } +/// Storage config view enriched with runtime limits from the top-level config. 
+#[derive(Debug)] +pub struct RuntimeStorageConfig<'a> { + base: &'a dyn extenddb_storage::config::StorageConfig, + limits: LimitsConfig, +} + +impl<'a> RuntimeStorageConfig<'a> { + pub fn new( + base: &'a dyn extenddb_storage::config::StorageConfig, + limits: &LimitsConfig, + ) -> Self { + Self { + base, + limits: limits.clone(), + } + } +} + +#[derive(Debug)] +struct OwnedRuntimeStorageConfig { + base: Box, + limits: LimitsConfig, +} + +impl extenddb_storage::config::StorageConfig for RuntimeStorageConfig<'_> { + fn connection_config(&self) -> &str { + self.base.connection_config() + } + + fn max_connections(&self) -> u32 { + self.base.max_connections() + } + + fn max_catalog_connections(&self) -> u32 { + self.base.max_catalog_connections() + } + + fn runtime_limits(&self) -> Option<&LimitsConfig> { + Some(&self.limits) + } + + fn native_backup_config(&self) -> Option { + self.base.native_backup_config() + } + + fn clone_box(&self) -> Box { + Box::new(OwnedRuntimeStorageConfig { + base: self.base.clone_box(), + limits: self.limits.clone(), + }) + } +} + +impl extenddb_storage::config::StorageConfig for OwnedRuntimeStorageConfig { + fn connection_config(&self) -> &str { + self.base.connection_config() + } + + fn max_connections(&self) -> u32 { + self.base.max_connections() + } + + fn max_catalog_connections(&self) -> u32 { + self.base.max_catalog_connections() + } + + fn runtime_limits(&self) -> Option<&LimitsConfig> { + Some(&self.limits) + } + + fn native_backup_config(&self) -> Option { + self.base.native_backup_config() + } + + fn clone_box(&self) -> Box { + Box::new(Self { + base: self.base.clone_box(), + limits: self.limits.clone(), + }) + } +} + impl<'de> serde::Deserialize<'de> for StorageConfig { fn deserialize(deserializer: D) -> Result where @@ -158,10 +241,9 @@ impl<'de> serde::Deserialize<'de> for StorageConfig { let backend = value .get("backend") .and_then(|v| v.as_str()) - .unwrap_or("postgres") - .to_string(); + .map_or_else(default_backend, 
str::to_owned); - // Get the backend-specific table (e.g., [storage.postgres]) + // Get the backend-specific table matching `backend`. let backend_table: &toml::Table = value .get(&backend) .and_then(|v| v.as_table()) @@ -182,9 +264,14 @@ impl<'de> serde::Deserialize<'de> for StorageConfig { impl Default for StorageConfig { fn default() -> Self { + let backend = default_backend(); + let config = + extenddb_storage::config::default_storage_config(&backend).unwrap_or_else(|e| { + panic!("Default storage backend '{backend}' is not registered: {e}") + }); Self { - _backend: default_backend(), - config: Box::new(extenddb_storage_postgres::PostgresStorageConfig::default()), + _backend: backend, + config, } } } @@ -253,8 +340,10 @@ pub fn expand_tilde(path: &str) -> String { } path.to_owned() } -fn default_backend() -> String { - "postgres".to_owned() +pub(crate) fn default_backend() -> String { + extenddb_storage::config::default_backend_name() + .unwrap_or_else(|e| panic!("No default storage backend registered: {e}")) + .to_owned() } fn default_tls_enabled() -> bool { true @@ -298,17 +387,12 @@ pub fn load(config_path: &str) -> anyhow::Result { /// Redact password from a connection string for safe logging (REQ-LOG-002). /// /// Uses the backend-specific operations engine to handle different connection -/// string formats (`PostgreSQL`). +/// string formats. pub fn redact_password(backend: &str, conn: &str) -> String { extenddb_storage::operations::redact_connection_string(backend, conn) .unwrap_or_else(|_| conn.to_owned()) } -/// Return the current OS username, falling back to given default username: e.g. `"postgres"`. -pub fn whoami(default: &str) -> String { - std::env::var("USER").unwrap_or_else(|_| default.to_owned()) -} - /// Validate that a string is safe to use as a database identifier for DDL. /// /// Delegates to the backend-specific operations engine. 
Rejects strings diff --git a/crates/bin/src/init_helpers.rs b/crates/bin/src/init_helpers.rs index 9b9d619..a635de6 100755 --- a/crates/bin/src/init_helpers.rs +++ b/crates/bin/src/init_helpers.rs @@ -67,6 +67,7 @@ pub fn generate_tls_cert_if_needed(bind_addr: &str) -> anyhow::Result<()> { /// All other settings are commented out with their defaults. pub(crate) fn generate_config( config_path: &str, + backend: &str, catalog_url: &str, bind_addr: &str, docs_dir: Option<&str>, @@ -89,6 +90,9 @@ pub(crate) fn generate_config( "# docs_dir = \"/path/to/docs/rendered\" # Path to rendered documentation\n".to_owned() } }; + let connection_scheme = catalog_url + .split_once("://") + .map_or("storage", |(scheme, _)| scheme); let toml = format!( r#"# Generated by extenddb init on {timestamp} @@ -98,7 +102,7 @@ pub(crate) fn generate_config( # # Environment variable overrides use the EXTENDDB__ prefix with __ as separator: # EXTENDDB__SERVER__PORT=9000 -# EXTENDDB__STORAGE__POSTGRES__CONNECTION_STRING="postgresql://..." +# EXTENDDB__STORAGE__{backend_env}__CONNECTION_STRING="{connection_scheme}://..." 
{docs_line} [server] @@ -115,9 +119,9 @@ cert_path = "{tls_cert}" key_path = "{tls_key}" [storage] -# backend = "postgres" # Only "postgres" is supported +backend = "{backend}" -[storage.postgres] +[storage.{backend}] connection_string = "{catalog_url}" # pool_size = 20 # Max connections for data operations (default 20, min 10) # catalog_pool_size = # Max connections for management/catalog ops (defaults to pool_size, min 10) @@ -161,6 +165,8 @@ connection_string = "{catalog_url}" # max_import_bytes = 10737418240 # Maximum import file size (10 GB) "#, + backend_env = backend.to_uppercase(), + connection_scheme = connection_scheme, ); std::fs::write(config_path, &toml)?; diff --git a/crates/bin/src/main.rs b/crates/bin/src/main.rs index b4f61bf..92e5e2c 100755 --- a/crates/bin/src/main.rs +++ b/crates/bin/src/main.rs @@ -25,6 +25,11 @@ mod serve_helpers; mod util; mod workers; +#[cfg(feature = "postgres")] +use extenddb_storage_postgres as _; +#[cfg(feature = "tidb")] +use extenddb_storage_tidb as _; + use clap::{Parser, Subcommand}; #[derive(Parser)] diff --git a/crates/bin/src/manage_http.rs b/crates/bin/src/manage_http.rs index 48d1062..5cf90ae 100755 --- a/crates/bin/src/manage_http.rs +++ b/crates/bin/src/manage_http.rs @@ -327,7 +327,7 @@ pub fn dispatch( if !yes { anyhow::bail!( "--yes is required for import-access-key. This command stores a real AWS \ - secret access key in the local PostgreSQL database." + secret access key in the configured storage backend." 
); } c.req("POST", &format!("/management/accounts/{account_id}/users/{user_name}/access-keys/import"), diff --git a/crates/bin/src/serve_helpers.rs b/crates/bin/src/serve_helpers.rs index c5b1c87..9eaab94 100755 --- a/crates/bin/src/serve_helpers.rs +++ b/crates/bin/src/serve_helpers.rs @@ -60,7 +60,7 @@ pub fn verify_daemon_started(pid_file: &PathBuf, bind_addr: &str) -> anyhow::Res std::thread::sleep(std::time::Duration::from_millis(100)); }; - // Give the daemon a moment to initialize (connect to Postgres, load TLS + // Give the daemon a moment to initialize (connect to storage, load TLS // certs, etc.). Check every 200ms for up to 3 seconds. let check_deadline = std::time::Instant::now() + std::time::Duration::from_secs(3); loop { diff --git a/crates/bin/src/workers.rs b/crates/bin/src/workers.rs index c06188a..259e153 100755 --- a/crates/bin/src/workers.rs +++ b/crates/bin/src/workers.rs @@ -8,8 +8,8 @@ //! TTL cleanup, table size refresh, stream record expiry, idempotency token //! cleanup, capacity warning, and metrics pruning. //! -//! Workers are generic over storage traits so they are decoupled from the -//! concrete `PostgresEngine` / `PostgresCatalogStore` types. +//! Workers are generic over storage traits so they are decoupled from concrete +//! backend engine and catalog-store types. 
use std::sync::Arc; diff --git a/crates/engine/src/backup.rs b/crates/engine/src/backup.rs index 47dff47..5393b7f 100755 --- a/crates/engine/src/backup.rs +++ b/crates/engine/src/backup.rs @@ -59,6 +59,8 @@ pub(crate) async fn handle_describe_backup( ) })?; + ensure_backup_arn_account(backup_arn, &ctx.account_id)?; + let desc = ctx .storage .describe_backup(backup_arn) @@ -100,6 +102,8 @@ pub(crate) async fn handle_delete_backup( ) })?; + ensure_backup_arn_account(backup_arn, &ctx.account_id)?; + let desc = ctx .storage .delete_backup(backup_arn) @@ -135,6 +139,8 @@ pub(crate) async fn handle_restore_table_from_backup( ) })?; + ensure_backup_arn_account(backup_arn, &ctx.account_id)?; + let desc = ctx .storage .restore_table_from_backup(&ctx.account_id, target_table_name, backup_arn) @@ -210,15 +216,26 @@ pub(crate) async fn handle_restore_table_to_point_in_time( _body: Value, _ctx: &OperationContext, ) -> Result { - // TODO(fidelity): Implement real PITR using PostgreSQL temporal/history - // table approach — item_history table capturing every mutation, DISTINCT ON - // query to reconstruct state at time T, 35-day retention via background - // pruning. + // TODO(fidelity): Implement real PITR through a storage-level history + // contract so each backend can reconstruct state at time T with the same + // DynamoDB-visible semantics. Err(DynamoDbError::ValidationException( "Point-in-time recovery restore is not yet supported".to_owned(), )) } +fn ensure_backup_arn_account(backup_arn: &str, account_id: &str) -> Result<(), DynamoDbError> { + let arn_account = backup_arn.split(':').nth(4).ok_or_else(|| { + DynamoDbError::ValidationException(format!("Invalid backup ARN: {backup_arn}")) + })?; + if arn_account != account_id { + return Err(DynamoDbError::AccessDeniedException( + "Access denied for backup ARN".to_owned(), + )); + } + Ok(()) +} + /// Convert storage errors to DynamoDB errors. 
fn storage_err_to_dynamo(e: extenddb_storage::error::StorageError) -> DynamoDbError { match e { diff --git a/crates/engine/src/ttl.rs b/crates/engine/src/ttl.rs index dffa6f6..ea54cfd 100755 --- a/crates/engine/src/ttl.rs +++ b/crates/engine/src/ttl.rs @@ -44,9 +44,8 @@ pub async fn handle_describe_time_to_live( /// Handle `UpdateTimeToLive` — enable or disable TTL on a table attribute. /// -/// When enabling, kicks off creation of a `PostgreSQL` expression index on the -/// TTL attribute. When disabling, marks TTL disabled (sweeper stops) then -/// drops the index. +/// When enabling, kicks off backend-specific TTL lookup setup. When disabling, +/// removes backend lookup/TTL artifacts before marking TTL disabled. /// /// # Errors /// @@ -84,6 +83,13 @@ pub async fn handle_update_time_to_live( )); } + if !input.time_to_live_specification.enabled { + ctx.storage + .drop_ttl_index(&ctx.account_id, &input.table_name) + .await + .map_err(storage_to_dynamo)?; + } + ctx.storage .update_ttl( &ctx.account_id, @@ -95,9 +101,8 @@ pub async fn handle_update_time_to_live( .map_err(storage_to_dynamo)?; if input.time_to_live_specification.enabled { - // Kick off index creation (CONCURRENTLY — non-blocking for other database - // operations on the table, but the handler awaits completion). - // If it fails, the TTL sweeper will retry on its next cycle. + // Kick off backend-specific TTL lookup creation. If it fails, the TTL + // sweeper will retry on its next cycle. let account_id = ctx.account_id.clone(); let table_name = input.table_name.clone(); let attr = input.time_to_live_specification.attribute_name.clone(); @@ -108,16 +113,6 @@ pub async fn handle_update_time_to_live( { tracing::warn!("TTL index creation deferred for {table_name}: {e}"); } - } else { - // Disable path: metadata already updated (sweeper won't pick up this table). - // Drop the index. Safe because sweeper checks ttl_index_ready which is now FALSE. 
- if let Err(e) = ctx - .storage - .drop_ttl_index(&ctx.account_id, &input.table_name) - .await - { - tracing::warn!("TTL index drop failed for {}: {e}", input.table_name); - } } let output = UpdateTimeToLiveOutput { @@ -132,7 +127,7 @@ pub async fn handle_update_time_to_live( /// Validate a TTL attribute name. /// /// Real `DynamoDB` allows any UTF-8 (1–255 bytes). However, the TTL attribute -/// name is interpolated into `PostgreSQL` DDL (expression index creation) where +/// name is interpolated into backend DDL (TTL lookup/native TTL setup) where /// parameterized queries are not possible. We use a strict allowlist: /// `^[a-zA-Z0-9._-]+$` (1–255 bytes). This eliminates the entire class of /// SQL injection risk. See `docs/differences-from-dynamodb.md`. diff --git a/crates/engine/src/update_table.rs b/crates/engine/src/update_table.rs index b947812..c658391 100755 --- a/crates/engine/src/update_table.rs +++ b/crates/engine/src/update_table.rs @@ -90,6 +90,12 @@ pub async fn handle_update_table( // Validate GSI updates: each entry must have exactly one of Create, Update, or Delete. if let Some(updates) = &input.global_secondary_index_updates { + if updates.len() > 1 { + return Err(DynamoDbError::ValidationException( + "One or more parameter values were invalid: Only one GlobalSecondaryIndexUpdate can be specified per UpdateTable operation".to_owned(), + )); + } + for update in updates { if update.create.is_some() && update.delete.is_some() { return Err(DynamoDbError::ValidationException( diff --git a/crates/storage-postgres/src/bootstrapper.rs b/crates/storage-postgres/src/bootstrapper.rs index 524e9ac..9ffbeee 100755 --- a/crates/storage-postgres/src/bootstrapper.rs +++ b/crates/storage-postgres/src/bootstrapper.rs @@ -8,7 +8,9 @@ //! lazily as needed during the bootstrap sequence. 
use async_trait::async_trait; -use extenddb_storage::bootstrapper::{AdminBootstrapResult, BootstrapConfig, Bootstrapper}; +use extenddb_storage::bootstrapper::{ + AdminBootstrapResult, BootstrapConfig, BootstrapOptions, Bootstrapper, +}; use extenddb_storage::management_store::{OpError, OpResult}; use sqlx::PgPool; use sqlx::postgres::{PgConnectOptions, PgPoolOptions}; @@ -476,24 +478,13 @@ fn generate_random_password() -> String { } impl PostgresBootstrapper { - /// Create a bootstrapper from config file and CLI args. Parses - /// Postgres-specific arguments and merges with config. + /// Create a bootstrapper from config file and typed CLI overrides. pub async fn from_config( config_path: &str, - cli_args: &[String], + options: BootstrapOptions, ) -> Result { use extenddb_storage::error::StorageError; - // Extract Postgres-specific CLI args - let pg_host = extract_arg(cli_args, "--pg-host"); - let pg_port = extract_arg(cli_args, "--pg-port").and_then(|s| s.parse().ok()); - let pg_user = extract_arg(cli_args, "--pg-user"); - let pg_pass = extract_arg(cli_args, "--pg-pass"); - let data_db = extract_arg(cli_args, "--data-db"); - let catalog_db = extract_arg(cli_args, "--catalog-db"); - let extenddb_user = extract_arg(cli_args, "--extenddb-user"); - let extenddb_pass = extract_arg(cli_args, "--extenddb-pass"); - // Load config file if it exists let (host, port, user, password, catalog_db_name) = if std::path::Path::new(config_path) .exists() @@ -519,12 +510,16 @@ impl PostgresBootstrapper { .map_err(|e| StorageError::Internal(format!("Invalid connection string: {e}")))?; // Check for conflicts between CLI args and config values - check_conflict(pg_host.as_ref(), &parts.host, "--pg-host")?; - check_conflict(pg_port.as_ref(), &parts.port, "--pg-port")?; - check_conflict(extenddb_user.as_ref(), &parts.user, "--extenddb-user")?; - check_conflict(extenddb_pass.as_ref(), &parts.password, "--extenddb-pass")?; - - if let Some(ref cli_catalog) = catalog_db { + 
check_conflict(options.storage_host.as_ref(), &parts.host, "--storage-host")?; + check_conflict(options.storage_port.as_ref(), &parts.port, "--storage-port")?; + check_conflict(options.app_user.as_ref(), &parts.user, "--extenddb-user")?; + check_conflict( + options.app_password.as_ref(), + &parts.password, + "--extenddb-pass", + )?; + + if let Some(ref cli_catalog) = options.catalog_db { if cli_catalog != &parts.database { return Err(StorageError::Internal(format!( "--catalog-db '{}' conflicts with config file catalog database '{}'", @@ -552,25 +547,26 @@ impl PostgresBootstrapper { }; // CLI args override config (or use config values if no CLI arg provided) - let resolved_host = pg_host.unwrap_or(host); - let resolved_port = pg_port.unwrap_or(port); - let resolved_admin_user = pg_user + let resolved_host = options.storage_host.unwrap_or(host); + let resolved_port = options.storage_port.unwrap_or(port); + let resolved_admin_user = options + .admin_user .unwrap_or_else(|| std::env::var("USER").unwrap_or_else(|_| "postgres".to_owned())); - let resolved_catalog_db = catalog_db.unwrap_or(catalog_db_name); - let final_data_db = data_db.unwrap_or_else(|| { + let resolved_catalog_db = options.catalog_db.unwrap_or(catalog_db_name); + let final_data_db = options.data_db.unwrap_or_else(|| { resolved_catalog_db .strip_suffix("_catalog") .unwrap_or(&resolved_catalog_db) .to_owned() }); - let resolved_app_user = extenddb_user.unwrap_or(user); - let resolved_app_password = extenddb_pass.unwrap_or(password); + let resolved_app_user = options.app_user.unwrap_or(user); + let resolved_app_password = options.app_password.unwrap_or(password); let config = BootstrapConfig { host: resolved_host, port: resolved_port, admin_user: resolved_admin_user, - admin_password: pg_pass, + admin_password: options.admin_password, app_user: resolved_app_user, app_password: resolved_app_password, catalog_db: resolved_catalog_db, @@ -599,8 +595,3 @@ fn check_conflict( } Ok(()) } - -/// Extract a CLI 
argument value by flag name. -fn extract_arg(args: &[String], flag: &str) -> Option { - args.windows(2).find(|w| w[0] == flag).map(|w| w[1].clone()) -} diff --git a/crates/storage-postgres/src/lib.rs b/crates/storage-postgres/src/lib.rs index b70423a..b5b5130 100755 --- a/crates/storage-postgres/src/lib.rs +++ b/crates/storage-postgres/src/lib.rs @@ -42,9 +42,9 @@ pub use credential_store::DbCredentialStore; inventory::submit! { extenddb_storage::bootstrapper::BackendRegistration { name: "postgres", - factory: |config_path, cli_args| { + factory: |config_path, options| { Box::pin(async move { - let store = PostgresBootstrapper::from_config(&config_path, &cli_args).await?; + let store = PostgresBootstrapper::from_config(&config_path, options).await?; Ok(Box::new(store) as Box) }) } @@ -68,6 +68,10 @@ inventory::submit! { .map_err(|e: toml::de::Error| format!("Failed to parse postgres config: {}", e))?; Ok(Box::new(config) as Box) }, + default_config: || { + Box::new(PostgresStorageConfig::default()) as Box + }, + default_priority: Some(100), } } diff --git a/crates/storage-tidb/Cargo.toml b/crates/storage-tidb/Cargo.toml new file mode 100755 index 0000000..694033c --- /dev/null +++ b/crates/storage-tidb/Cargo.toml @@ -0,0 +1,33 @@ +# Copyright 2026 ExtendDB contributors +# SPDX-License-Identifier: Apache-2.0 +[package] +name = "extenddb-storage-tidb" +version.workspace = true +edition.workspace = true +rust-version.workspace = true +license.workspace = true + +[dependencies] +anyhow = { workspace = true } +extenddb-core = { workspace = true } +extenddb-storage = { workspace = true } +extenddb-auth = { workspace = true } +futures = { workspace = true } +inventory = { workspace = true } +sqlx = { workspace = true } +tokio = { workspace = true, features = ["sync"] } +serde = { workspace = true } +serde_json = { workspace = true } +toml = { workspace = true } +tracing = { workspace = true } +time = { workspace = true } +percent-encoding = { workspace = true } +url = { 
workspace = true } +uuid = { workspace = true } +base64 = { workspace = true } +crc32fast = { workspace = true } +rand = { workspace = true } +bcrypt = { workspace = true } +aes-gcm = { workspace = true } +async-trait = { workspace = true } +zeroize = { workspace = true } diff --git a/crates/storage-tidb/data_migrations/001_data_schema.sql b/crates/storage-tidb/data_migrations/001_data_schema.sql new file mode 100755 index 0000000..4614392 --- /dev/null +++ b/crates/storage-tidb/data_migrations/001_data_schema.sql @@ -0,0 +1,46 @@ +-- Copyright 2026 ExtendDB contributors +-- SPDX-License-Identifier: Apache-2.0 +-- Data database schema for extenddb. +-- These tables live in the data database (separate from the catalog) so that +-- stream records and idempotency tokens can be written atomically with item +-- data within a single TiDB transaction (P54 Bug 1). + +-- Stream shards — fixed shards per table, assigned by partition key hash. +-- No FK to catalog tables (cross-database FKs are not possible). +-- Application-level integrity ensures table_id validity. +CREATE TABLE IF NOT EXISTS stream_shards ( + shard_id VARCHAR(128) PRIMARY KEY CLUSTERED, + table_id VARCHAR(64) NOT NULL, + parent_shard_id VARCHAR(128), + starting_sequence_number VARCHAR(64) NOT NULL, + ending_sequence_number VARCHAR(64), + next_sequence_number BIGINT NOT NULL DEFAULT 0, + created_at TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6) +); + +CREATE INDEX IF NOT EXISTS idx_stream_shards_table + ON stream_shards (table_id); + +-- Stream records — change data capture records. 
+-- The shard foreign key is declared at table level: MySQL-compatible DDL parses
+-- but ignores a column-level REFERENCES clause, so the cascade from
+-- stream_shards would otherwise never be created.
+CREATE TABLE IF NOT EXISTS stream_records (
+    shard_id VARCHAR(128) NOT NULL,
+    sequence_number VARCHAR(64) NOT NULL,
+    table_id VARCHAR(64) NOT NULL,
+    event_name VARCHAR(32) NOT NULL,
+    record_data JSON NOT NULL,
+    created_at TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6),
+    PRIMARY KEY (shard_id, sequence_number) CLUSTERED,
+    FOREIGN KEY (shard_id) REFERENCES stream_shards(shard_id) ON DELETE CASCADE
+) TTL = `created_at` + INTERVAL 24 HOUR TTL_JOB_INTERVAL = '1h';
+
+CREATE INDEX IF NOT EXISTS idx_stream_records_created
+    ON stream_records (created_at);
+
+-- Idempotency token storage for TransactWriteItems.
+CREATE TABLE IF NOT EXISTS idempotency_tokens (
+    token VARCHAR(255) PRIMARY KEY CLUSTERED,
+    fingerprint TEXT NOT NULL,
+    created_at TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6)
+) TTL = `created_at` + INTERVAL 600 SECOND TTL_JOB_INTERVAL = '10m';
+
+CREATE INDEX IF NOT EXISTS idx_idempotency_tokens_created
+    ON idempotency_tokens (created_at);
diff --git a/crates/storage-tidb/migrations/001_schema.sql b/crates/storage-tidb/migrations/001_schema.sql
new file mode 100644
index 0000000..0cf3a51
--- /dev/null
+++ b/crates/storage-tidb/migrations/001_schema.sql
@@ -0,0 +1,287 @@
+-- Copyright 2026 ExtendDB contributors
+-- SPDX-License-Identifier: Apache-2.0
+-- Consolidated catalog schema for extenddb (catalog version 0.0.2).
+-- This is the complete schema applied on fresh installs.
+
+-- Accounts — multi-account support (REQ-AUTH-005).
+CREATE TABLE IF NOT EXISTS accounts (
+    account_id VARCHAR(32) PRIMARY KEY CLUSTERED,
+    account_name VARCHAR(255) NOT NULL UNIQUE,
+    created_at TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6)
+);
+
+-- Table metadata.
+-- The account foreign key is a table-level constraint: MySQL-compatible DDL
+-- parses but ignores a column-level REFERENCES clause, so an inline form would
+-- not create the cascade.
+CREATE TABLE IF NOT EXISTS tables (
+    account_id VARCHAR(32) NOT NULL,
+    table_name VARCHAR(255) NOT NULL,
+    key_schema JSON NOT NULL,
+    attribute_definitions JSON NOT NULL,
+    billing_mode VARCHAR(32) NOT NULL DEFAULT 'PAY_PER_REQUEST',
+    provisioned_throughput JSON,
+    stream_specification JSON,
+    table_status VARCHAR(32) NOT NULL DEFAULT 'CREATING',
+    creation_date_time TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6),
+    table_size_bytes BIGINT NOT NULL DEFAULT 0,
+    item_count BIGINT NOT NULL DEFAULT 0,
+    table_arn VARCHAR(512) NOT NULL,
+    table_id VARCHAR(64) NOT NULL,
+    ttl_attribute VARCHAR(255),
+    deletion_protection_enabled BOOLEAN NOT NULL DEFAULT FALSE,
+    status_transition_at TIMESTAMP(6),
+    stream_label VARCHAR(64),
+    ttl_index_ready BOOLEAN NOT NULL DEFAULT FALSE,
+    ttl_native_enabled BOOLEAN NOT NULL DEFAULT FALSE,
+    control_plane_token VARCHAR(64),
+    control_plane_lease_until TIMESTAMP(6),
+    PRIMARY KEY (account_id, table_name) CLUSTERED,
+    CONSTRAINT tables_table_id_unique UNIQUE (table_id),
+    CONSTRAINT tables_account_id_fkey
+        FOREIGN KEY (account_id) REFERENCES accounts(account_id) ON DELETE CASCADE
+);
+
+-- IF NOT EXISTS keeps index creation re-runnable, matching the CREATE TABLE
+-- statements above it.
+CREATE INDEX IF NOT EXISTS idx_tables_pending_transition
+    ON tables (status_transition_at);
+
+CREATE INDEX IF NOT EXISTS idx_tables_control_plane_work
+    ON tables (table_status, status_transition_at, control_plane_lease_until);
+
+-- Index metadata.
+CREATE TABLE IF NOT EXISTS indexes (
+    table_id VARCHAR(64) NOT NULL,
+    index_id VARCHAR(64) NOT NULL,
+    index_name VARCHAR(255) NOT NULL,
+    index_type VARCHAR(16) NOT NULL,
+    key_schema JSON NOT NULL,
+    projection JSON NOT NULL,
+    index_status VARCHAR(32) NOT NULL DEFAULT 'ACTIVE',
+    provisioned_throughput JSON,
+    PRIMARY KEY (table_id, index_name) CLUSTERED,
+    CONSTRAINT indexes_table_id_fkey
+        FOREIGN KEY (table_id) REFERENCES tables(table_id) ON DELETE CASCADE
+);
+
+-- Resource tags.
+CREATE TABLE IF NOT EXISTS tags (
+    resource_arn VARCHAR(512) NOT NULL,
+    tag_key VARCHAR(255) NOT NULL,
+    tag_value TEXT NOT NULL,
+    PRIMARY KEY (resource_arn, tag_key) CLUSTERED
+);
+
+-- Migration tracking.
+CREATE TABLE IF NOT EXISTS schema_history (
+    filename VARCHAR(255) PRIMARY KEY CLUSTERED,
+    applied_at TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6)
+);
+
+-- Settings (catalog version, data database connection, runtime config).
+CREATE TABLE IF NOT EXISTS settings (
+    `key` VARCHAR(255) PRIMARY KEY CLUSTERED,
+    value TEXT NOT NULL
+);
+
+-- Admin users.
+CREATE TABLE IF NOT EXISTS admin_users (
+    admin_name VARCHAR(255) PRIMARY KEY CLUSTERED,
+    password_hash TEXT NOT NULL,
+    created_at TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6)
+);
+
+-- IAM users.
+-- The account foreign key is a table-level constraint: MySQL-compatible DDL
+-- parses but ignores a column-level REFERENCES clause.
+CREATE TABLE IF NOT EXISTS iam_users (
+    account_id VARCHAR(32) NOT NULL,
+    user_name VARCHAR(255) NOT NULL,
+    user_arn VARCHAR(512) NOT NULL UNIQUE,
+    password_hash TEXT,
+    created_at TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6),
+    PRIMARY KEY (account_id, user_name) CLUSTERED,
+    FOREIGN KEY (account_id) REFERENCES accounts(account_id) ON DELETE CASCADE
+);
+
+-- IAM user tags.
+CREATE TABLE IF NOT EXISTS iam_user_tags (
+    account_id VARCHAR(32) NOT NULL,
+    user_name VARCHAR(255) NOT NULL,
+    tag_key VARCHAR(255) NOT NULL,
+    tag_value TEXT NOT NULL,
+    PRIMARY KEY (account_id, user_name, tag_key) CLUSTERED,
+    FOREIGN KEY (account_id, user_name) REFERENCES iam_users(account_id, user_name) ON DELETE CASCADE
+);
+
+-- Access keys.
+CREATE TABLE IF NOT EXISTS access_keys (
+    access_key_id VARCHAR(128) PRIMARY KEY CLUSTERED,
+    secret_key_encrypted BLOB NOT NULL,
+    account_id VARCHAR(32) NOT NULL,
+    user_name VARCHAR(255) NOT NULL,
+    is_active BOOLEAN NOT NULL DEFAULT TRUE,
+    created_at TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6),
+    FOREIGN KEY (account_id, user_name) REFERENCES iam_users(account_id, user_name) ON DELETE CASCADE
+);
+
+-- IAM groups.
+-- Account foreign keys in this section are table-level constraints:
+-- MySQL-compatible DDL parses but ignores a column-level REFERENCES clause.
+CREATE TABLE IF NOT EXISTS iam_groups (
+    account_id VARCHAR(32) NOT NULL,
+    group_name VARCHAR(255) NOT NULL,
+    group_arn VARCHAR(512) NOT NULL UNIQUE,
+    created_at TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6),
+    PRIMARY KEY (account_id, group_name) CLUSTERED,
+    FOREIGN KEY (account_id) REFERENCES accounts(account_id) ON DELETE CASCADE
+);
+
+-- IAM group membership.
+CREATE TABLE IF NOT EXISTS iam_group_members (
+    account_id VARCHAR(32) NOT NULL,
+    group_name VARCHAR(255) NOT NULL,
+    user_name VARCHAR(255) NOT NULL,
+    PRIMARY KEY (account_id, group_name, user_name) CLUSTERED,
+    FOREIGN KEY (account_id, group_name) REFERENCES iam_groups(account_id, group_name) ON DELETE CASCADE,
+    FOREIGN KEY (account_id, user_name) REFERENCES iam_users(account_id, user_name) ON DELETE CASCADE
+);
+
+-- IAM roles.
+CREATE TABLE IF NOT EXISTS iam_roles (
+    account_id VARCHAR(32) NOT NULL,
+    role_name VARCHAR(255) NOT NULL,
+    role_arn VARCHAR(512) NOT NULL UNIQUE,
+    trust_policy JSON NOT NULL,
+    permissions_boundary_arn VARCHAR(512),
+    created_at TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6),
+    PRIMARY KEY (account_id, role_name) CLUSTERED,
+    FOREIGN KEY (account_id) REFERENCES accounts(account_id) ON DELETE CASCADE
+);
+
+-- IAM role tags.
+CREATE TABLE IF NOT EXISTS iam_role_tags (
+    account_id VARCHAR(32) NOT NULL,
+    role_name VARCHAR(255) NOT NULL,
+    tag_key VARCHAR(255) NOT NULL,
+    tag_value TEXT NOT NULL,
+    PRIMARY KEY (account_id, role_name, tag_key) CLUSTERED,
+    FOREIGN KEY (account_id, role_name) REFERENCES iam_roles(account_id, role_name) ON DELETE CASCADE
+);
+
+-- IAM sessions.
+CREATE TABLE IF NOT EXISTS iam_sessions (
+    session_token VARCHAR(512) PRIMARY KEY CLUSTERED,
+    access_key_id VARCHAR(128) NOT NULL UNIQUE,
+    secret_key_encrypted BLOB NOT NULL,
+    account_id VARCHAR(32) NOT NULL,
+    role_name VARCHAR(255) NOT NULL,
+    session_name VARCHAR(255) NOT NULL,
+    session_tags JSON,
+    session_policy JSON,
+    expires_at TIMESTAMP(6) NOT NULL,
+    created_at TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6),
+    FOREIGN KEY (account_id, role_name) REFERENCES iam_roles(account_id, role_name) ON DELETE CASCADE
+) TTL = `expires_at` + INTERVAL 24 HOUR TTL_JOB_INTERVAL = '1h';
+
+-- IAM policies.
+-- Account foreign keys below are table-level constraints: MySQL-compatible DDL
+-- parses but ignores a column-level REFERENCES clause.
+CREATE TABLE IF NOT EXISTS iam_policies (
+    account_id VARCHAR(32) NOT NULL,
+    principal_type VARCHAR(16) NOT NULL CHECK (principal_type IN ('user', 'group', 'role')),
+    principal_name VARCHAR(255) NOT NULL,
+    policy_name VARCHAR(255) NOT NULL,
+    policy_document JSON NOT NULL,
+    created_at TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6),
+    PRIMARY KEY (account_id, principal_type, principal_name, policy_name) CLUSTERED,
+    FOREIGN KEY (account_id) REFERENCES accounts(account_id) ON DELETE CASCADE
+);
+
+-- IAM permissions boundaries.
+CREATE TABLE IF NOT EXISTS iam_permissions_boundaries (
+    account_id VARCHAR(32) NOT NULL,
+    principal_type VARCHAR(16) NOT NULL CHECK (principal_type IN ('user', 'role')),
+    principal_name VARCHAR(255) NOT NULL,
+    policy_document JSON NOT NULL,
+    PRIMARY KEY (account_id, principal_type, principal_name) CLUSTERED,
+    FOREIGN KEY (account_id) REFERENCES accounts(account_id) ON DELETE CASCADE
+);
+
+-- Idempotency tokens for TransactWriteItems.
+CREATE TABLE IF NOT EXISTS idempotency_tokens (
+    token VARCHAR(255) PRIMARY KEY CLUSTERED,
+    fingerprint TEXT NOT NULL,
+    created_at TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6)
+);
+
+-- IF NOT EXISTS keeps index creation re-runnable, like the CREATE TABLE above.
+CREATE INDEX IF NOT EXISTS idx_idempotency_tokens_created ON idempotency_tokens (created_at);
+
+-- Metrics (1-minute aggregation).
+CREATE TABLE IF NOT EXISTS metrics (
+    bucket TIMESTAMP(6) NOT NULL,
+    metric VARCHAR(255) CHARACTER SET ascii COLLATE ascii_bin NOT NULL,
+    table_name VARCHAR(255) CHARACTER SET ascii COLLATE ascii_bin NOT NULL DEFAULT '',
+    index_name VARCHAR(255) CHARACTER SET ascii COLLATE ascii_bin NOT NULL DEFAULT '',
+    operation VARCHAR(255) CHARACTER SET ascii COLLATE ascii_bin NOT NULL DEFAULT '',
+    sum DOUBLE NOT NULL DEFAULT 0,
+    count BIGINT NOT NULL DEFAULT 0,
+    min DOUBLE NOT NULL DEFAULT 1.79e308,
+    max DOUBLE NOT NULL DEFAULT -1.79e308,
+    PRIMARY KEY (bucket, metric, table_name, index_name, operation) CLUSTERED
+) TTL = `bucket` + INTERVAL 24 HOUR TTL_JOB_INTERVAL = '1h';
+
+-- Index creation uses IF NOT EXISTS so this file stays re-runnable, matching
+-- its CREATE TABLE IF NOT EXISTS statements.
+CREATE INDEX IF NOT EXISTS idx_metrics_bucket ON metrics (bucket);
+
+-- Login attempt tracking.
+CREATE TABLE IF NOT EXISTS login_attempts (
+    principal VARCHAR(512) NOT NULL,
+    attempted_at TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6),
+    success BOOLEAN NOT NULL,
+    source_ip VARCHAR(255)
+) TTL = `attempted_at` + INTERVAL 24 HOUR TTL_JOB_INTERVAL = '1h';
+
+CREATE INDEX IF NOT EXISTS idx_login_attempts_principal_time
+    ON login_attempts (principal, attempted_at DESC);
+
+CREATE INDEX IF NOT EXISTS idx_login_attempts_source_ip_time
+    ON login_attempts (source_ip, attempted_at DESC);
+
+-- Backup metadata.
+CREATE TABLE IF NOT EXISTS backups (
+    backup_arn VARCHAR(512) PRIMARY KEY CLUSTERED,
+    backup_name VARCHAR(255) NOT NULL,
+    table_id VARCHAR(64) NOT NULL,
+    table_name VARCHAR(255) NOT NULL,
+    account_id VARCHAR(32) NOT NULL,
+    backup_status VARCHAR(32) NOT NULL DEFAULT 'AVAILABLE',
+    backup_type VARCHAR(32) NOT NULL DEFAULT 'USER',
+    backup_size_bytes BIGINT NOT NULL DEFAULT 0,
+    item_count BIGINT NOT NULL DEFAULT 0,
+    key_schema JSON NOT NULL,
+    attribute_definitions JSON NOT NULL,
+    billing_mode VARCHAR(32) NOT NULL DEFAULT 'PAY_PER_REQUEST',
+    provisioned_throughput JSON,
+    stream_specification JSON,
+    deletion_protection_enabled BOOLEAN NOT NULL DEFAULT FALSE,
+    created_at TIMESTAMP(6) NOT NULL DEFAULT CURRENT_TIMESTAMP(6)
+);
+
+-- IF NOT EXISTS keeps index creation re-runnable, like the CREATE TABLE above.
+CREATE INDEX IF NOT EXISTS idx_backups_table ON backups (account_id, table_name);
+
+-- Backup index metadata snapshot.
+-- Backup foreign keys below are table-level constraints: MySQL-compatible DDL
+-- parses but ignores a column-level REFERENCES clause.
+CREATE TABLE IF NOT EXISTS backup_indexes (
+    backup_arn VARCHAR(512) NOT NULL,
+    index_id VARCHAR(64) NOT NULL,
+    index_name VARCHAR(255) NOT NULL,
+    index_type VARCHAR(16) NOT NULL,
+    key_schema JSON NOT NULL,
+    projection JSON NOT NULL,
+    provisioned_throughput JSON,
+    PRIMARY KEY (backup_arn, index_name) CLUSTERED,
+    FOREIGN KEY (backup_arn) REFERENCES backups(backup_arn) ON DELETE CASCADE
+);
+
+-- Backup tag snapshot.
+CREATE TABLE IF NOT EXISTS backup_tags (
+    backup_arn VARCHAR(512) NOT NULL,
+    tag_key VARCHAR(255) NOT NULL,
+    tag_value TEXT NOT NULL,
+    PRIMARY KEY (backup_arn, tag_key) CLUSTERED,
+    FOREIGN KEY (backup_arn) REFERENCES backups(backup_arn) ON DELETE CASCADE
+);
+
+-- Continuous backups / PITR status.
+CREATE TABLE IF NOT EXISTS continuous_backups (
+    account_id VARCHAR(32) NOT NULL,
+    table_name VARCHAR(255) NOT NULL,
+    pitr_enabled BOOLEAN NOT NULL DEFAULT FALSE,
+    earliest_restorable TIMESTAMP(6),
+    latest_restorable TIMESTAMP(6),
+    PRIMARY KEY (account_id, table_name) CLUSTERED
+);
+
+-- Seed settings.
+INSERT IGNORE INTO settings (`key`, value) VALUES ('catalog_version', '0.0.2');
+INSERT IGNORE INTO settings (`key`, value) VALUES ('control_plane_delay_seconds', '0.25');
diff --git a/crates/storage-tidb/migrations/002_backup_metadata_fidelity.sql b/crates/storage-tidb/migrations/002_backup_metadata_fidelity.sql
new file mode 100644
index 0000000..ad70dc4
--- /dev/null
+++ b/crates/storage-tidb/migrations/002_backup_metadata_fidelity.sql
@@ -0,0 +1,26 @@
+-- Copyright 2026 ExtendDB contributors
+-- SPDX-License-Identifier: Apache-2.0
+-- Backup metadata fidelity for catalog version 0.0.3.
+
+ALTER TABLE backups
+    ADD COLUMN IF NOT EXISTS deletion_protection_enabled BOOLEAN NOT NULL DEFAULT FALSE;
+
+-- Backup foreign keys below are table-level constraints: MySQL-compatible DDL
+-- parses but ignores a column-level REFERENCES clause.
+CREATE TABLE IF NOT EXISTS backup_indexes (
+    backup_arn VARCHAR(512) NOT NULL,
+    index_id VARCHAR(64) NOT NULL,
+    index_name VARCHAR(255) NOT NULL,
+    index_type VARCHAR(16) NOT NULL,
+    key_schema JSON NOT NULL,
+    projection JSON NOT NULL,
+    provisioned_throughput JSON,
+    PRIMARY KEY (backup_arn, index_name) CLUSTERED,
+    FOREIGN KEY (backup_arn) REFERENCES backups(backup_arn) ON DELETE CASCADE
+);
+
+CREATE TABLE IF NOT EXISTS backup_tags (
+    backup_arn VARCHAR(512) NOT NULL,
+    tag_key VARCHAR(255) NOT NULL,
+    tag_value TEXT NOT NULL,
+    PRIMARY KEY (backup_arn, tag_key) CLUSTERED,
+    FOREIGN KEY (backup_arn) REFERENCES backups(backup_arn) ON DELETE CASCADE
+);
+
+UPDATE settings SET value = '0.0.3' WHERE `key` = 'catalog_version';
diff --git a/crates/storage-tidb/migrations/003_drop_catalog_stream_data.sql b/crates/storage-tidb/migrations/003_drop_catalog_stream_data.sql
new file mode 100644
index 0000000..ef48502
--- /dev/null
+++ b/crates/storage-tidb/migrations/003_drop_catalog_stream_data.sql
@@ -0,0 +1,9 @@
+-- Copyright 2026 ExtendDB contributors
+-- SPDX-License-Identifier: Apache-2.0
+-- Stream shards, stream records, and idempotency tokens live in the TiDB data
+-- database so item writes, stream capture, and idempotency checks can commit in
+-- one transaction.
Keep the catalog database limited to control-plane metadata. + +DROP TABLE IF EXISTS stream_records; +DROP TABLE IF EXISTS stream_shards; +DROP TABLE IF EXISTS stream_sequence; diff --git a/crates/storage-tidb/migrations/004_control_plane_leases.sql b/crates/storage-tidb/migrations/004_control_plane_leases.sql new file mode 100644 index 0000000..fdcc6b8 --- /dev/null +++ b/crates/storage-tidb/migrations/004_control_plane_leases.sql @@ -0,0 +1,12 @@ +-- Copyright 2026 ExtendDB contributors +-- SPDX-License-Identifier: Apache-2.0 +-- Durable, short-lived control-plane claims for TiDB background reconciliation. + +ALTER TABLE tables + ADD COLUMN IF NOT EXISTS control_plane_token VARCHAR(64), + ADD COLUMN IF NOT EXISTS control_plane_lease_until TIMESTAMP(6); + +CREATE INDEX IF NOT EXISTS idx_tables_control_plane_work + ON tables (table_status, status_transition_at, control_plane_lease_until); + +UPDATE settings SET value = '0.0.4' WHERE `key` = 'catalog_version'; diff --git a/crates/storage-tidb/migrations/006_native_ttl_mode.sql b/crates/storage-tidb/migrations/006_native_ttl_mode.sql new file mode 100644 index 0000000..530f590 --- /dev/null +++ b/crates/storage-tidb/migrations/006_native_ttl_mode.sql @@ -0,0 +1,8 @@ +-- Copyright 2026 ExtendDB contributors +-- SPDX-License-Identifier: Apache-2.0 +-- Track whether a table's DynamoDB TTL is delegated to TiDB native TTL. + +ALTER TABLE tables + ADD COLUMN IF NOT EXISTS ttl_native_enabled BOOLEAN NOT NULL DEFAULT FALSE; + +UPDATE settings SET value = '0.0.6' WHERE `key` = 'catalog_version'; diff --git a/crates/storage-tidb/migrations/007_native_br_backups.sql b/crates/storage-tidb/migrations/007_native_br_backups.sql new file mode 100644 index 0000000..a09da16 --- /dev/null +++ b/crates/storage-tidb/migrations/007_native_br_backups.sql @@ -0,0 +1,28 @@ +-- Copyright 2026 ExtendDB contributors +-- SPDX-License-Identifier: Apache-2.0 +-- Use TiDB BR as the TiDB backend's backup data plane. 
+ +ALTER TABLE backups + ADD COLUMN IF NOT EXISTS backup_backend VARCHAR(32) NOT NULL DEFAULT 'legacy-logical'; + +ALTER TABLE backups + ADD COLUMN IF NOT EXISTS storage_uri TEXT; + +ALTER TABLE backups + ADD COLUMN IF NOT EXISTS physical_table_name VARCHAR(255); + +ALTER TABLE backups + ADD COLUMN IF NOT EXISTS native_snapshot_tso VARCHAR(64); + +-- Old TiDB logical backups depended on catalog-row copies. They cannot be +-- restored after moving TiDB to native BR semantics, so hide them from list +-- operations instead of pretending they are usable native backups. +UPDATE backups +SET backup_status = 'DELETED' +WHERE backup_backend = 'legacy-logical' + AND storage_uri IS NULL + AND backup_status != 'DELETED'; + +DROP TABLE IF EXISTS backup_items; + +UPDATE settings SET value = '0.0.7' WHERE `key` = 'catalog_version'; diff --git a/crates/storage-tidb/migrations/008_native_index_backup_ids.sql b/crates/storage-tidb/migrations/008_native_index_backup_ids.sql new file mode 100644 index 0000000..55c1253 --- /dev/null +++ b/crates/storage-tidb/migrations/008_native_index_backup_ids.sql @@ -0,0 +1,14 @@ +-- Copyright 2026 ExtendDB contributors +-- SPDX-License-Identifier: Apache-2.0 +-- Preserve native TiDB generated-column/index identifiers in backup metadata. 
+ +ALTER TABLE backup_indexes + ADD COLUMN IF NOT EXISTS index_id VARCHAR(64) NOT NULL DEFAULT ''; + +UPDATE backup_indexes bi +JOIN backups b ON b.backup_arn = bi.backup_arn +JOIN indexes i ON i.table_id = b.table_id AND i.index_name = bi.index_name +SET bi.index_id = i.index_id +WHERE bi.index_id = ''; + +UPDATE settings SET value = '0.0.8' WHERE `key` = 'catalog_version'; diff --git a/crates/storage-tidb/migrations/009_catalog_native_ttl.sql b/crates/storage-tidb/migrations/009_catalog_native_ttl.sql new file mode 100644 index 0000000..1810852 --- /dev/null +++ b/crates/storage-tidb/migrations/009_catalog_native_ttl.sql @@ -0,0 +1,13 @@ +-- Copyright 2026 ExtendDB contributors +-- SPDX-License-Identifier: Apache-2.0 +-- Delegate fixed-retention catalog tables to TiDB native TTL. + +ALTER TABLE metrics + TTL = `bucket` + INTERVAL 24 HOUR + TTL_JOB_INTERVAL = '1h'; + +ALTER TABLE login_attempts + TTL = `attempted_at` + INTERVAL 24 HOUR + TTL_JOB_INTERVAL = '1h'; + +UPDATE settings SET value = '0.0.9' WHERE `key` = 'catalog_version'; diff --git a/crates/storage-tidb/migrations/010_session_native_ttl.sql b/crates/storage-tidb/migrations/010_session_native_ttl.sql new file mode 100644 index 0000000..114e149 --- /dev/null +++ b/crates/storage-tidb/migrations/010_session_native_ttl.sql @@ -0,0 +1,9 @@ +-- Copyright 2026 ExtendDB contributors +-- SPDX-License-Identifier: Apache-2.0 +-- Delegate expired assume-role session retention to TiDB native TTL. + +ALTER TABLE iam_sessions + TTL = `expires_at` + INTERVAL 24 HOUR + TTL_JOB_INTERVAL = '1h'; + +UPDATE settings SET value = '0.0.10' WHERE `key` = 'catalog_version'; diff --git a/crates/storage-tidb/src/admin_store.rs b/crates/storage-tidb/src/admin_store.rs new file mode 100755 index 0000000..84df7e2 --- /dev/null +++ b/crates/storage-tidb/src/admin_store.rs @@ -0,0 +1,134 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! `AdminStore` implementation for `TidbCatalogStore`. 
+
+use extenddb_storage::management_store::{AdminEntry, OpError, OpResult};
+use futures::future::BoxFuture;
+
+use super::catalog_store::TidbCatalogStore;
+use super::tidb_util::is_unique_violation;
+
+impl extenddb_storage::management_store::AdminStore for TidbCatalogStore {
+    /// Insert a new row into `admin_users`.
+    ///
+    /// A unique-key violation is reported as `OpError::AlreadyExists`; any other
+    /// database error is logged and returned as a generic `OpError::Internal`.
+    fn create_admin(&self, admin_name: &str, password_hash: &str) -> BoxFuture<'_, OpResult<()>> {
+        // Own the arguments so the returned future does not borrow from the caller.
+        let admin_name = admin_name.to_owned();
+        let password_hash = password_hash.to_owned();
+        Box::pin(async move {
+            let result =
+                sqlx::query("INSERT INTO admin_users (admin_name, password_hash) VALUES (?, ?)")
+                    .bind(&admin_name)
+                    .bind(&password_hash)
+                    .execute(self.pool())
+                    .await;
+            match result {
+                Ok(_) => Ok(()),
+                Err(e) if is_unique_violation(&e) => Err(OpError::AlreadyExists(
+                    "Admin user already exists".to_owned(),
+                )),
+                Err(e) => {
+                    tracing::error!("create_admin failed: {e}");
+                    Err(OpError::Internal("Database error".to_owned()))
+                }
+            }
+        })
+    }
+
+    /// Return every admin (name and creation time), ordered by `admin_name`.
+    fn list_admins(&self) -> BoxFuture<'_, OpResult<Vec<AdminEntry>>> {
+        Box::pin(async move {
+            let rows: Vec<(String, time::OffsetDateTime)> = sqlx::query_as(
+                "SELECT admin_name, created_at FROM admin_users ORDER BY admin_name",
+            )
+            .fetch_all(self.pool())
+            .await
+            .map_err(|e| {
+                tracing::error!("list_admins: {e}");
+                OpError::Internal("Database error".to_owned())
+            })?;
+            Ok(rows
+                .into_iter()
+                .map(|(admin_name, created_at)| AdminEntry {
+                    admin_name,
+                    created_at,
+                })
+                .collect())
+        })
+    }
+
+    /// Delete the admin named `admin_name`.
+    ///
+    /// Returns `OpError::NotFound` when no row was deleted.
+    fn delete_admin(&self, admin_name: &str) -> BoxFuture<'_, OpResult<()>> {
+        let admin_name = admin_name.to_owned();
+        Box::pin(async move {
+            let result = sqlx::query("DELETE FROM admin_users WHERE admin_name = ?")
+                .bind(&admin_name)
+                .execute(self.pool())
+                .await;
+            match result {
+                Ok(r) if r.rows_affected() == 0 => {
+                    Err(OpError::NotFound("Admin user not found".to_owned()))
+                }
+                Ok(_) => Ok(()),
+                Err(e) => {
+                    tracing::error!("delete_admin failed: {e}");
+                    Err(OpError::Internal("Database error".to_owned()))
+                }
+            }
+        })
+    }
+
+    /// Replace the stored password hash for `admin_name`.
+    ///
+    /// Returns `OpError::NotFound` when the UPDATE reports zero affected rows.
+    fn change_admin_password(
+        &self,
+        admin_name: &str,
+        password_hash: &str,
+    ) -> BoxFuture<'_, OpResult<()>> {
+        let admin_name = admin_name.to_owned();
+        let password_hash = password_hash.to_owned();
+        Box::pin(async move {
+            let result =
+                sqlx::query("UPDATE admin_users SET password_hash = ? WHERE admin_name = ?")
+                    .bind(&password_hash)
+                    .bind(&admin_name)
+                    .execute(self.pool())
+                    .await;
+            match result {
+                Ok(r) if r.rows_affected() == 0 => {
+                    Err(OpError::NotFound("Admin user not found".to_owned()))
+                }
+                Ok(_) => Ok(()),
+                Err(e) => {
+                    tracing::error!("change_admin_password failed: {e}");
+                    Err(OpError::Internal("Database error".to_owned()))
+                }
+            }
+        })
+    }
+
+    /// Check `password` against the stored hash for `admin_name`.
+    ///
+    /// Returns `Ok(None)` when the admin does not exist, otherwise `Ok(Some(matched))`.
+    // NOTE(review): the unknown-admin path returns before any bcrypt work, so it
+    // is presumably faster than the known-admin path — confirm whether callers
+    // need to hide that difference.
+    fn verify_admin_password(
+        &self,
+        admin_name: &str,
+        password: &str,
+    ) -> BoxFuture<'_, OpResult<Option<bool>>> {
+        let admin_name = admin_name.to_owned();
+        let password = password.to_owned();
+        Box::pin(async move {
+            let row: Option<(String,)> =
+                sqlx::query_as("SELECT password_hash FROM admin_users WHERE admin_name = ?")
+                    .bind(&admin_name)
+                    .fetch_optional(self.pool())
+                    .await
+                    .map_err(|e| {
+                        tracing::error!("verify_admin_password: {e}");
+                        OpError::Internal("Database error".to_owned())
+                    })?;
+            let Some((hash,)) = row else {
+                return Ok(None);
+            };
+            Ok(Some(verify_bcrypt(password, hash).await))
+        })
+    }
+}
+
+/// Verify a bcrypt password on a blocking thread (same logic as server::password).
+///
+/// Returns `false` when the password does not match, when bcrypt rejects the
+/// stored hash, or when the blocking task itself fails. The last two cases are
+/// logged so that a corrupted hash is distinguishable from a wrong password.
+async fn verify_bcrypt(password: String, hash: String) -> bool {
+    let outcome = tokio::task::spawn_blocking(move || bcrypt::verify(password, &hash)).await;
+    match outcome {
+        Ok(Ok(matched)) => matched,
+        Ok(Err(_)) => {
+            // The error value is deliberately not logged.
+            // NOTE(review): bcrypt's error text may echo the offending hash — confirm.
+            tracing::error!("verify_bcrypt: stored password hash could not be verified");
+            false
+        }
+        Err(e) => {
+            tracing::error!("verify_bcrypt: blocking task failed: {e}");
+            false
+        }
+    }
+}
diff --git a/crates/storage-tidb/src/authorization_store.rs b/crates/storage-tidb/src/authorization_store.rs
new file mode 100755
index 0000000..711b869
--- /dev/null
+++ b/crates/storage-tidb/src/authorization_store.rs
@@ -0,0 +1,255 @@
+// Copyright 2026 ExtendDB contributors
+// SPDX-License-Identifier: Apache-2.0
+
+//! `AuthorizationStore` implementation for `TidbCatalogStore`.
+ +use extenddb_storage::authorization_store::{AuthorizationStore, SessionData}; +use extenddb_storage::management_store::{OpError, OpResult}; +use futures::future::BoxFuture; + +use super::catalog_store::TidbCatalogStore; + +impl AuthorizationStore for TidbCatalogStore { + fn fetch_user_policies( + &self, + account_id: &str, + user_name: &str, + ) -> BoxFuture<'_, OpResult>> { + let account_id = account_id.to_owned(); + let user_name = user_name.to_owned(); + Box::pin(async move { + let rows: Vec<(serde_json::Value,)> = sqlx::query_as( + "SELECT policy_document FROM iam_policies \ + WHERE account_id = ? AND principal_type = 'user' AND principal_name = ?", + ) + .bind(&account_id) + .bind(&user_name) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("fetch_user_policies: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + Ok(rows.into_iter().map(|(v,)| v.to_string()).collect()) + }) + } + + fn fetch_user_group_policies( + &self, + account_id: &str, + user_name: &str, + ) -> BoxFuture<'_, OpResult>> { + let account_id = account_id.to_owned(); + let user_name = user_name.to_owned(); + Box::pin(async move { + let rows: Vec<(serde_json::Value,)> = sqlx::query_as( + "SELECT p.policy_document \ + FROM iam_policies p \ + JOIN iam_group_members gm ON p.account_id = gm.account_id \ + AND p.principal_type = 'group' \ + AND p.principal_name = gm.group_name \ + WHERE gm.account_id = ? 
AND gm.user_name = ?", + ) + .bind(&account_id) + .bind(&user_name) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("fetch_user_group_policies: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + Ok(rows.into_iter().map(|(v,)| v.to_string()).collect()) + }) + } + + fn fetch_user_boundary( + &self, + account_id: &str, + user_name: &str, + ) -> BoxFuture<'_, OpResult>> { + let account_id = account_id.to_owned(); + let user_name = user_name.to_owned(); + Box::pin(async move { + let row: Option<(serde_json::Value,)> = sqlx::query_as( + "SELECT policy_document FROM iam_permissions_boundaries \ + WHERE account_id = ? AND principal_type = 'user' AND principal_name = ?", + ) + .bind(&account_id) + .bind(&user_name) + .fetch_optional(self.pool()) + .await + .map_err(|e| { + tracing::error!("fetch_user_boundary: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + Ok(row.map(|(v,)| v.to_string())) + }) + } + + fn fetch_role_policies( + &self, + account_id: &str, + role_name: &str, + ) -> BoxFuture<'_, OpResult>> { + let account_id = account_id.to_owned(); + let role_name = role_name.to_owned(); + Box::pin(async move { + let rows: Vec<(serde_json::Value,)> = sqlx::query_as( + "SELECT policy_document FROM iam_policies \ + WHERE account_id = ? AND principal_type = 'role' AND principal_name = ?", + ) + .bind(&account_id) + .bind(&role_name) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("fetch_role_policies: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + Ok(rows.into_iter().map(|(v,)| v.to_string()).collect()) + }) + } + + fn fetch_role_boundary( + &self, + account_id: &str, + role_name: &str, + ) -> BoxFuture<'_, OpResult>> { + let account_id = account_id.to_owned(); + let role_name = role_name.to_owned(); + Box::pin(async move { + let row: Option<(serde_json::Value,)> = sqlx::query_as( + "SELECT policy_document FROM iam_permissions_boundaries \ + WHERE account_id = ? 
AND principal_type = 'role' AND principal_name = ?", + ) + .bind(&account_id) + .bind(&role_name) + .fetch_optional(self.pool()) + .await + .map_err(|e| { + tracing::error!("fetch_role_boundary: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + Ok(row.map(|(v,)| v.to_string())) + }) + } + + fn fetch_session_data( + &self, + account_id: &str, + role_name: &str, + session_name: &str, + ) -> BoxFuture<'_, OpResult>> { + let account_id = account_id.to_owned(); + let role_name = role_name.to_owned(); + let session_name = session_name.to_owned(); + Box::pin(async move { + let row: Option<(Option, Option)> = + sqlx::query_as( + "SELECT session_policy, session_tags FROM iam_sessions \ + WHERE account_id = ? AND role_name = ? AND session_name = ? \ + AND expires_at > now()", + ) + .bind(&account_id) + .bind(&role_name) + .bind(&session_name) + .fetch_optional(self.pool()) + .await + .map_err(|e| { + tracing::error!("fetch_session_data: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + let Some((policy_value, tags_value)) = row else { + return Ok(None); + }; + + let session_policy = policy_value.map(|v| v.to_string()); + + let mut session_tags = Vec::new(); + if let Some(tags_val) = tags_value { + if let Some(arr) = tags_val.as_array() { + for tag in arr { + if let (Some(k), Some(v)) = ( + tag.get("Key").and_then(|k| k.as_str()), + tag.get("Value").and_then(|v| v.as_str()), + ) { + session_tags.push((k.to_owned(), v.to_owned())); + } + } + } else if let Some(obj) = tags_val.as_object() { + for (k, v) in obj { + if let Some(v_str) = v.as_str() { + session_tags.push((k.clone(), v_str.to_owned())); + } + } + } + } + + Ok(Some(SessionData { + session_policy, + session_tags, + })) + }) + } + + fn fetch_user_tags( + &self, + account_id: &str, + user_name: &str, + ) -> BoxFuture<'_, OpResult>> { + let account_id = account_id.to_owned(); + let user_name = user_name.to_owned(); + Box::pin(async move { + sqlx::query_as( + "SELECT tag_key, tag_value FROM 
iam_user_tags \ + WHERE account_id = ? AND user_name = ?", + ) + .bind(&account_id) + .bind(&user_name) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("fetch_user_tags: {e}"); + OpError::Internal("Database error".to_owned()) + }) + }) + } + + fn fetch_role_tags( + &self, + account_id: &str, + role_name: &str, + ) -> BoxFuture<'_, OpResult>> { + let account_id = account_id.to_owned(); + let role_name = role_name.to_owned(); + Box::pin(async move { + sqlx::query_as( + "SELECT tag_key, tag_value FROM iam_role_tags \ + WHERE account_id = ? AND role_name = ?", + ) + .bind(&account_id) + .bind(&role_name) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("fetch_role_tags: {e}"); + OpError::Internal("Database error".to_owned()) + }) + }) + } + + fn fetch_resource_tags(&self, arn: &str) -> BoxFuture<'_, OpResult>> { + let arn = arn.to_owned(); + Box::pin(async move { + sqlx::query_as("SELECT tag_key, tag_value FROM tags WHERE resource_arn = ?") + .bind(&arn) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("fetch_resource_tags: {e}"); + OpError::Internal("Database error".to_owned()) + }) + }) + } +} diff --git a/crates/storage-tidb/src/backup_engine.rs b/crates/storage-tidb/src/backup_engine.rs new file mode 100644 index 0000000..ff671b1 --- /dev/null +++ b/crates/storage-tidb/src/backup_engine.rs @@ -0,0 +1,1351 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! Backup and point-in-time recovery implementation for TiDB storage. +//! +//! TiDB has native physical backup/restore through BR. This module deliberately +//! does not keep a logical `backup_items` copy path: if a requested DynamoDB +//! shape cannot be represented by BR without changing semantics, the TiDB +//! backend returns an explicit validation error. 
+ +use std::ffi::OsString; +use std::path::PathBuf; + +use extenddb_core::types::{ + BackupDescription, BackupDetails, BackupSummary, BillingMode, ContinuousBackupsDescription, + CreateTableInput, GsiInput, KeySchemaElement, LsiInput, PointInTimeRecoveryDescription, + ProvisionedThroughput, SourceTableDetails, TableDescription, +}; +use extenddb_storage::BackupEngine; +use extenddb_storage::config::NativeBackupConfig; +use extenddb_storage::error::StorageError; +use futures::future::BoxFuture; +use tokio::process::Command; + +use crate::TidbEngine; +use crate::create_table::CreateTableActivation; +use crate::data::physical_data_table_name; +use crate::metadata_engine::drop_ttl_artifacts; +use crate::throughput::provisioned_throughput_from_description; + +const TIDB_BACKUP_BACKEND: &str = "tidb-br"; + +/// Current epoch milliseconds for unique ARN generation. +fn epoch_millis() -> u128 { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_millis() +} + +/// Convert a TiDB timestamp to epoch seconds as `f64`. 
+#[allow(clippy::cast_precision_loss)] +fn timestamp_to_epoch(ts: time::OffsetDateTime) -> f64 { + ts.unix_timestamp() as f64 +} + +#[derive(sqlx::FromRow)] +struct BackupSourceRow { + table_id: String, + table_arn: String, + key_schema: serde_json::Value, + attribute_definitions: serde_json::Value, + billing_mode: String, + table_size_bytes: i64, + item_count: i64, + provisioned_throughput: Option, + stream_specification: Option, + deletion_protection_enabled: bool, +} + +#[derive(sqlx::FromRow)] +struct BackupRestoreRow { + key_schema: serde_json::Value, + attribute_definitions: serde_json::Value, + billing_mode: String, + provisioned_throughput: Option, + item_count: i64, + backup_backend: String, + storage_uri: Option, + physical_table_name: Option, +} + +#[derive(sqlx::FromRow)] +struct BackupIndexSnapshotRow { + index_id: String, + index_name: String, + index_type: String, + key_schema: serde_json::Value, + projection: serde_json::Value, + provisioned_throughput: Option, +} + +struct BackupMetadataSnapshot { + source: BackupSourceRow, + indexes: Vec, + tags: Vec<(String, String)>, + native_snapshot_tso: i64, +} + +struct BackupInsert<'a> { + backup_arn: &'a str, + backup_name: &'a str, + table_name: &'a str, + account_id: &'a str, + snapshot: &'a BackupMetadataSnapshot, + storage_uri: &'a str, + physical_table: &'a str, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct TidbNativeBackupConfig { + binary: String, + component: Option, + pd_endpoint: Option, + storage_uri: Option, + send_credentials_to_tikv: Option, +} + +impl TidbNativeBackupConfig { + pub(crate) fn from_storage_config(config: NativeBackupConfig) -> Self { + Self { + binary: non_empty_string(config.binary).unwrap_or_else(|| "tiup".to_owned()), + component: match config.component { + Some(component) if component.trim().is_empty() => None, + Some(component) => Some(component), + None => Some("br".to_owned()), + }, + pd_endpoint: non_empty_string(config.coordinator_endpoint), + 
storage_uri: non_empty_string(config.storage_uri), + send_credentials_to_tikv: config.send_credentials_to_storage_nodes, + } + } + + fn require_snapshot(&self) -> Result, StorageError> { + let pd_endpoint = self.require_pd_endpoint("TiDB native backup")?; + let storage_uri = self.storage_uri.as_deref().ok_or_else(|| { + StorageError::Validation( + "TiDB native backup requires storage.tidb.backup.storage_uri".to_owned(), + ) + })?; + Ok(NativeSnapshotConfig { + pd_endpoint, + storage_uri, + }) + } + + fn require_pd_endpoint(&self, operation: &str) -> Result<&str, StorageError> { + self.pd_endpoint.as_deref().ok_or_else(|| { + StorageError::Validation(format!( + "{operation} requires storage.tidb.backup.pd_endpoint" + )) + }) + } + + fn command_args(&self, action: BrAction<'_>) -> Result, StorageError> { + let mut args = Vec::new(); + if let Some(component) = &self.component { + args.push(OsString::from(component)); + } + match action { + BrAction::BackupTable { + database, + table, + storage_uri, + backup_tso, + } => { + let snapshot = self.require_snapshot()?; + validate_br_name(database, "database")?; + validate_br_name(table, "table")?; + args.extend([ + "backup".into(), + "table".into(), + "--pd".into(), + snapshot.pd_endpoint.into(), + "--db".into(), + database.into(), + "--table".into(), + table.into(), + "--storage".into(), + storage_uri.into(), + "--backupts".into(), + backup_tso.to_string().into(), + ]); + if let Some(send) = self.send_credentials_to_tikv { + args.push(format!("--send-credentials-to-tikv={send}").into()); + } + } + BrAction::RestoreTable { + database, + table, + storage_uri, + } => { + let pd_endpoint = self.require_pd_endpoint("TiDB native restore")?; + validate_br_name(database, "database")?; + validate_br_name(table, "table")?; + args.extend([ + "restore".into(), + "table".into(), + "--pd".into(), + pd_endpoint.into(), + "--db".into(), + database.into(), + "--table".into(), + table.into(), + "--storage".into(), + storage_uri.into(), + 
]); + if let Some(send) = self.send_credentials_to_tikv { + args.push(format!("--send-credentials-to-tikv={send}").into()); + } + } + } + Ok(args) + } + + async fn run(&self, action: BrAction<'_>) -> Result<(), StorageError> { + let args = self.command_args(action)?; + let mut command = Command::new(&self.binary); + command.args(&args); + let output = command + .output() + .await + .map_err(|e| StorageError::Internal(format!("Run TiDB BR: {e}")))?; + + if output.status.success() { + return Ok(()); + } + + let stderr = String::from_utf8_lossy(&output.stderr); + let stdout = String::from_utf8_lossy(&output.stdout); + Err(StorageError::Internal(format!( + "TiDB BR exited with {}: {}{}{}", + output.status, + truncate_output(stderr.trim()), + if stderr.trim().is_empty() || stdout.trim().is_empty() { + "" + } else { + " / " + }, + truncate_output(stdout.trim()) + ))) + } +} + +struct NativeSnapshotConfig<'a> { + pd_endpoint: &'a str, + storage_uri: &'a str, +} + +enum BrAction<'a> { + BackupTable { + database: &'a str, + table: &'a str, + storage_uri: &'a str, + backup_tso: i64, + }, + RestoreTable { + database: &'a str, + table: &'a str, + storage_uri: &'a str, + }, +} + +fn non_empty_string(value: Option) -> Option { + value.and_then(|s| { + let trimmed = s.trim(); + (!trimmed.is_empty()).then(|| trimmed.to_owned()) + }) +} + +fn truncate_output(value: &str) -> String { + const MAX: usize = 1_000; + if value.chars().count() <= MAX { + value.to_owned() + } else { + format!("{}...", value.chars().take(MAX).collect::()) + } +} + +fn backup_storage_uri(base: &str, account_id: &str, table_id: &str, millis: u128) -> String { + format!( + "{}/snapshots/{account_id}/{table_id}/{millis}", + base.trim_end_matches('/') + ) +} + +fn uri_is_under_base(base: &str, uri: &str) -> bool { + let base = base.trim_end_matches('/'); + uri == base + || uri + .strip_prefix(base) + .is_some_and(|rest| rest.starts_with('/')) +} + +fn local_backup_path(uri: &str) -> Option { + 
uri.strip_prefix("local://") + .or_else(|| uri.strip_prefix("file://")) + .filter(|path| path.starts_with('/')) + .map(PathBuf::from) +} + +fn validate_br_name(value: &str, label: &str) -> Result<(), StorageError> { + if value.is_empty() { + return Err(StorageError::Internal(format!("empty TiDB {label} name"))); + } + if value + .chars() + .any(|c| matches!(c, '*' | '?' | '[' | ']' | '!' | '`' | '\0') || c.is_whitespace()) + { + return Err(StorageError::Internal(format!( + "TiDB {label} name is not safe for BR table filters: {value}" + ))); + } + Ok(()) +} + +fn quote_identifier(value: &str, label: &str) -> Result { + if value.contains('`') || value.contains('\0') || !value.is_ascii() || value.is_empty() { + return Err(StorageError::Internal(format!( + "TiDB {label} name is not safe for SQL identifiers" + ))); + } + Ok(format!("`{value}`")) +} + +fn qualified_table(database: &str, table: &str) -> Result { + Ok(format!( + "{}.{}", + quote_identifier(database, "database")?, + quote_identifier(table, "table")? + )) +} + +fn parse_billing_mode(value: &str) -> Result { + match value { + "PAY_PER_REQUEST" => Ok(BillingMode::PayPerRequest), + "PROVISIONED" => Ok(BillingMode::Provisioned), + other => Err(StorageError::Internal(format!( + "Invalid backup billing mode: {other}" + ))), + } +} + +fn parse_json( + value: serde_json::Value, + label: &str, +) -> Result { + serde_json::from_value(value).map_err(|e| StorageError::Internal(format!("Parse {label}: {e}"))) +} + +fn parse_optional_json( + value: Option, + label: &str, +) -> Result, StorageError> { + match value { + None | Some(serde_json::Value::Null) => Ok(None), + Some(value) => parse_json(value, label).map(Some), + } +} + +fn parse_optional_index_provisioned_throughput( + value: Option, + label: &str, +) -> Result, StorageError> { + let Some(description) = parse_optional_json::< + extenddb_core::types::ProvisionedThroughputDescription, + >(value, label)? 
+ else { + return Ok(None); + }; + Ok(Some(provisioned_throughput_from_description(&description))) +} + +fn backup_arn_account_id(backup_arn: &str) -> Result { + backup_arn + .split(':') + .nth(4) + .map(str::to_owned) + .ok_or_else(|| StorageError::Validation(format!("Invalid backup ARN: {backup_arn}"))) +} + +impl TidbEngine { + async fn current_tso(&self) -> Result { + sqlx::query_scalar("SELECT TIDB_CURRENT_TSO()") + .fetch_one(&self.pool) + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}"))) + } + + async fn data_database_name(&self) -> Result { + let database: Option = sqlx::query_scalar("SELECT DATABASE()") + .fetch_one(&self.data_pool) + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}")))?; + database.ok_or_else(|| StorageError::Internal("TiDB data database not selected".to_owned())) + } + + async fn physical_table_exists( + &self, + database: &str, + table: &str, + ) -> Result { + sqlx::query_scalar( + "SELECT EXISTS(SELECT 1 FROM information_schema.tables \ + WHERE table_schema = ? 
AND table_name = ?)", + ) + .bind(database) + .bind(table) + .fetch_one(&self.data_pool) + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}"))) + } + + async fn drop_physical_table_if_exists(&self, table: &str) { + let Ok(table) = quote_identifier(table, "table") else { + return; + }; + let sql = format!("DROP TABLE IF EXISTS {table}"); + if let Err(err) = sqlx::query(&sql).execute(&self.data_pool).await { + tracing::error!("failed to drop failed BR restore table '{table}': {err}"); + } + } + + async fn rename_physical_table(&self, from: &str, to: &str) -> Result<(), StorageError> { + let database = self.data_database_name().await?; + let from = qualified_table(&database, from)?; + let to = qualified_table(&database, to)?; + let sql = format!("RENAME TABLE {from} TO {to}"); + sqlx::query(&sql) + .execute(&self.data_pool) + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}")))?; + Ok(()) + } + + async fn snapshot_backup_metadata( + &self, + account_id: &str, + table_name: &str, + native_snapshot_tso: i64, + ) -> Result { + let source: BackupSourceRow = sqlx::query_as( + "SELECT table_id, table_arn, key_schema, attribute_definitions, billing_mode, \ + table_size_bytes, item_count, provisioned_throughput, stream_specification, \ + deletion_protection_enabled \ + FROM tables AS OF TIMESTAMP TIDB_PARSE_TSO(?) \ + WHERE account_id = ? AND table_name = ? AND table_status = 'ACTIVE'", + ) + .bind(native_snapshot_tso) + .bind(account_id) + .bind(table_name) + .fetch_optional(&self.pool) + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}")))? + .ok_or_else(|| StorageError::TableNotFound(format!("Table not found: {table_name}")))?; + + let indexes: Vec = sqlx::query_as( + "SELECT index_id, index_name, index_type, key_schema, projection, provisioned_throughput \ + FROM indexes AS OF TIMESTAMP TIDB_PARSE_TSO(?) \ + WHERE table_id = ? 
ORDER BY index_type, index_name", + ) + .bind(native_snapshot_tso) + .bind(&source.table_id) + .fetch_all(&self.pool) + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}")))?; + + let tags = sqlx::query_as( + "SELECT tag_key, tag_value FROM tags \ + AS OF TIMESTAMP TIDB_PARSE_TSO(?) \ + WHERE resource_arn = ? ORDER BY tag_key", + ) + .bind(native_snapshot_tso) + .bind(&source.table_arn) + .fetch_all(&self.pool) + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}")))?; + + Ok(BackupMetadataSnapshot { + source, + indexes, + tags, + native_snapshot_tso, + }) + } + + async fn insert_backup_metadata( + &self, + insert: BackupInsert<'_>, + ) -> Result { + let key_schema_json = insert.snapshot.source.key_schema.clone(); + let attr_defs_json = insert.snapshot.source.attribute_definitions.clone(); + let mut tx = self + .pool + .begin() + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}")))?; + + sqlx::query( + "INSERT INTO backups \ + (backup_arn, backup_name, table_id, table_name, account_id, backup_status, \ + backup_size_bytes, item_count, key_schema, attribute_definitions, billing_mode, \ + provisioned_throughput, stream_specification, deletion_protection_enabled, \ + backup_backend, storage_uri, physical_table_name, native_snapshot_tso) \ + VALUES (?, ?, ?, ?, ?, 'CREATING', ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + ) + .bind(insert.backup_arn) + .bind(insert.backup_name) + .bind(&insert.snapshot.source.table_id) + .bind(insert.table_name) + .bind(insert.account_id) + .bind(insert.snapshot.source.table_size_bytes) + .bind(insert.snapshot.source.item_count) + .bind(&key_schema_json) + .bind(&attr_defs_json) + .bind(&insert.snapshot.source.billing_mode) + .bind(&insert.snapshot.source.provisioned_throughput) + .bind(&insert.snapshot.source.stream_specification) + .bind(insert.snapshot.source.deletion_protection_enabled) + .bind(TIDB_BACKUP_BACKEND) + .bind(insert.storage_uri) + 
.bind(insert.physical_table) + .bind(insert.snapshot.native_snapshot_tso.to_string()) + .execute(&mut *tx) + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}")))?; + + for index in &insert.snapshot.indexes { + sqlx::query( + "INSERT INTO backup_indexes \ + (backup_arn, index_id, index_name, index_type, key_schema, projection, provisioned_throughput) \ + VALUES (?, ?, ?, ?, ?, ?, ?)", + ) + .bind(insert.backup_arn) + .bind(&index.index_id) + .bind(&index.index_name) + .bind(&index.index_type) + .bind(&index.key_schema) + .bind(&index.projection) + .bind(&index.provisioned_throughput) + .execute(&mut *tx) + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}")))?; + } + + for (key, value) in &insert.snapshot.tags { + sqlx::query( + "INSERT INTO backup_tags (backup_arn, tag_key, tag_value) VALUES (?, ?, ?)", + ) + .bind(insert.backup_arn) + .bind(key) + .bind(value) + .execute(&mut *tx) + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}")))?; + } + + let created_at: time::OffsetDateTime = + sqlx::query_scalar("SELECT created_at FROM backups WHERE backup_arn = ?") + .bind(insert.backup_arn) + .fetch_one(&mut *tx) + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}")))?; + + tx.commit() + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}")))?; + + Ok(created_at) + } + + async fn cleanup_failed_backup(&self, backup_arn: &str) { + for sql in [ + "DELETE FROM backup_indexes WHERE backup_arn = ?", + "DELETE FROM backup_tags WHERE backup_arn = ?", + "DELETE FROM backups WHERE backup_arn = ? 
AND backup_status = 'CREATING'", + ] { + if let Err(err) = sqlx::query(sql).bind(backup_arn).execute(&self.pool).await { + tracing::error!("failed to clean up incomplete backup '{backup_arn}': {err}"); + } + } + } + + async fn delete_native_backup_storage(&self, storage_uri: &str) -> Result<(), StorageError> { + let base = self.native_backup.storage_uri.as_deref().ok_or_else(|| { + StorageError::Validation( + "TiDB DeleteBackup requires storage.tidb.backup.storage_uri".to_owned(), + ) + })?; + if !uri_is_under_base(base, storage_uri) { + return Err(StorageError::Validation(format!( + "TiDB backup storage URI is outside configured backup root: {storage_uri}" + ))); + } + + let Some(path) = local_backup_path(storage_uri) else { + return Err(StorageError::Validation( + "TiDB BR does not manage remote backup deletion; DeleteBackup is supported only \ + for local:// or file:// backup storage without an object-store deleter" + .to_owned(), + )); + }; + + match tokio::fs::remove_dir_all(&path).await { + Ok(()) => Ok(()), + Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(()), + Err(err) => Err(StorageError::Internal(format!( + "Delete TiDB backup storage '{}': {err}", + path.display() + ))), + } + } + + async fn publish_backup(&self, backup_arn: &str) -> Result<(), StorageError> { + let result = sqlx::query( + "UPDATE backups SET backup_status = 'AVAILABLE' \ + WHERE backup_arn = ? 
AND backup_status = 'CREATING'", + ) + .bind(backup_arn) + .execute(&self.pool) + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}")))?; + + if result.rows_affected() != 1 { + return Err(StorageError::Internal(format!( + "backup was modified before publish: {backup_arn}" + ))); + } + Ok(()) + } + + async fn cleanup_failed_restore_table(&self, desc: &TableDescription) { + if let Err(err) = sqlx::query("DELETE FROM tags WHERE resource_arn = ?") + .bind(&desc.table_arn) + .execute(&self.pool) + .await + { + tracing::error!( + "failed to delete tags while cleaning failed restore for '{}': {err}", + desc.table_name + ); + } + + if let Err(err) = sqlx::query("DELETE FROM indexes WHERE table_id = ?") + .bind(&desc.table_id) + .execute(&self.pool) + .await + { + tracing::error!( + "failed to delete indexes while cleaning failed restore for '{}': {err}", + desc.table_name + ); + } + + if let Err(err) = sqlx::query("DELETE FROM tables WHERE table_id = ?") + .bind(&desc.table_id) + .execute(&self.pool) + .await + { + tracing::error!( + "failed to delete catalog row while cleaning failed restore for '{}': {err}", + desc.table_name + ); + } + + if let Err(err) = sqlx::query("DELETE FROM stream_shards WHERE table_id = ?") + .bind(&desc.table_id) + .execute(&self.data_pool) + .await + { + tracing::error!( + "failed to delete stream shards while cleaning failed restore '{}': {err}", + desc.table_name + ); + } + + let physical = physical_data_table_name(&desc.table_id); + self.drop_physical_table_if_exists(&physical).await; + } +} + +impl BackupEngine for TidbEngine { + fn create_backup( + &self, + account_id: &str, + table_name: &str, + backup_name: &str, + ) -> BoxFuture<'_, Result> { + let account_id = account_id.to_string(); + let table_name = table_name.to_string(); + let backup_name = backup_name.to_string(); + Box::pin(async move { + let snapshot_config = self.native_backup.require_snapshot()?; + let native_snapshot_tso = self.current_tso().await?; + 
let metadata = self + .snapshot_backup_metadata(&account_id, &table_name, native_snapshot_tso) + .await?; + let database = self.data_database_name().await?; + let physical_table = physical_data_table_name(&metadata.source.table_id); + let ts = epoch_millis(); + let backup_arn = format!( + "arn:aws:dynamodb:{}:{account_id}:table/{table_name}/backup/{ts}", + self.region + ); + let storage_uri = backup_storage_uri( + snapshot_config.storage_uri, + &account_id, + &metadata.source.table_id, + ts, + ); + let created_at = self + .insert_backup_metadata(BackupInsert { + backup_arn: &backup_arn, + backup_name: &backup_name, + table_name: &table_name, + account_id: &account_id, + snapshot: &metadata, + storage_uri: &storage_uri, + physical_table: &physical_table, + }) + .await?; + + let backup_result = self + .native_backup + .run(BrAction::BackupTable { + database: &database, + table: &physical_table, + storage_uri: &storage_uri, + backup_tso: metadata.native_snapshot_tso, + }) + .await; + if let Err(err) = backup_result { + self.cleanup_failed_backup(&backup_arn).await; + return Err(err); + } + if let Err(err) = self.publish_backup(&backup_arn).await { + self.cleanup_failed_backup(&backup_arn).await; + return Err(err); + } + + Ok(BackupDetails { + backup_arn, + backup_name, + backup_status: "AVAILABLE".to_owned(), + backup_type: "USER".to_owned(), + backup_size_bytes: metadata.source.table_size_bytes, + backup_creation_date_time: timestamp_to_epoch(created_at), + }) + }) + } + + fn describe_backup( + &self, + backup_arn: &str, + ) -> BoxFuture<'_, Result> { + let account_id = backup_arn_account_id(backup_arn); + let backup_arn = backup_arn.to_string(); + Box::pin(async move { + let account_id = account_id?; + #[derive(sqlx::FromRow)] + struct Row { + backup_name: String, + backup_status: String, + table_id: String, + table_name: String, + backup_size_bytes: i64, + item_count: i64, + key_schema: serde_json::Value, + billing_mode: String, + created_at: time::OffsetDateTime, 
+ table_arn: Option, + backup_created_at: time::OffsetDateTime, + } + + let row: Row = sqlx::query_as( + "SELECT b.backup_name, b.backup_status, b.table_id, b.table_name, \ + b.backup_size_bytes, b.item_count, b.key_schema, b.billing_mode, \ + COALESCE(t.creation_date_time, b.created_at) as created_at, \ + t.table_arn, b.created_at as backup_created_at \ + FROM backups b \ + LEFT JOIN tables t ON t.table_id = b.table_id \ + WHERE b.backup_arn = ? AND b.account_id = ?", + ) + .bind(&backup_arn) + .bind(&account_id) + .fetch_optional(&self.pool) + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}")))? + .ok_or_else(|| StorageError::Validation(format!("Backup not found: {backup_arn}")))?; + + let key_schema: Vec = + parse_json(row.key_schema, "backup key schema")?; + let table_arn = row.table_arn.unwrap_or_else(|| { + format!( + "arn:aws:dynamodb:{}:{account_id}:table/{}", + self.region, row.table_name + ) + }); + + Ok(BackupDescription { + backup_details: BackupDetails { + backup_arn: backup_arn.to_owned(), + backup_name: row.backup_name, + backup_status: row.backup_status, + backup_type: "USER".to_owned(), + backup_size_bytes: row.backup_size_bytes, + backup_creation_date_time: timestamp_to_epoch(row.backup_created_at), + }, + source_table_details: SourceTableDetails { + table_name: row.table_name, + table_id: row.table_id, + table_arn, + key_schema, + item_count: row.item_count, + table_size_bytes: row.backup_size_bytes, + billing_mode: Some(row.billing_mode), + table_creation_date_time: timestamp_to_epoch(row.created_at), + }, + }) + }) + } + + fn list_backups( + &self, + account_id: &str, + table_name: Option<&str>, + ) -> BoxFuture<'_, Result, StorageError>> { + let account_id = account_id.to_string(); + let table_name = table_name.map(str::to_string); + Box::pin(async move { + let rows: Vec<( + String, + String, + String, + String, + String, + i64, + time::OffsetDateTime, + )> = if let Some(table) = &table_name { + sqlx::query_as( + 
"SELECT b.backup_arn, b.backup_name, b.table_name, b.backup_status, \ + COALESCE(t.table_arn, CONCAT('arn:aws:dynamodb:', ?, ':', b.account_id, \ + ':table/', b.table_name)) as table_arn, b.backup_size_bytes, b.created_at \ + FROM backups b \ + LEFT JOIN tables t ON t.table_id = b.table_id \ + WHERE b.account_id = ? AND b.table_name = ? AND b.backup_status != 'DELETED' \ + ORDER BY b.created_at DESC", + ) + .bind(&self.region) + .bind(&account_id) + .bind(table) + .fetch_all(&self.pool) + .await + } else { + sqlx::query_as( + "SELECT b.backup_arn, b.backup_name, b.table_name, b.backup_status, \ + COALESCE(t.table_arn, CONCAT('arn:aws:dynamodb:', ?, ':', b.account_id, \ + ':table/', b.table_name)) as table_arn, b.backup_size_bytes, b.created_at \ + FROM backups b \ + LEFT JOIN tables t ON t.table_id = b.table_id \ + WHERE b.account_id = ? AND b.backup_status != 'DELETED' \ + ORDER BY b.created_at DESC", + ) + .bind(&self.region) + .bind(&account_id) + .fetch_all(&self.pool) + .await + } + .map_err(|e| StorageError::Internal(format!("Database error: {e}")))?; + + Ok(rows + .into_iter() + .map( + |(arn, name, table_name, status, table_arn, size, created_at)| BackupSummary { + backup_arn: arn, + backup_name: name, + table_name, + table_arn, + backup_status: status, + backup_type: "USER".to_owned(), + backup_size_bytes: size, + backup_creation_date_time: timestamp_to_epoch(created_at), + }, + ) + .collect()) + }) + } + + fn delete_backup( + &self, + backup_arn: &str, + ) -> BoxFuture<'_, Result> { + let account_id = backup_arn_account_id(backup_arn); + let backup_arn = backup_arn.to_string(); + Box::pin(async move { + let account_id = account_id?; + let desc = self.describe_backup(&backup_arn).await?; + let native_storage_uri: Option<(Option,)> = sqlx::query_as( + "SELECT storage_uri FROM backups \ + WHERE backup_arn = ? AND account_id = ? 
AND backup_backend = ?", + ) + .bind(&backup_arn) + .bind(&account_id) + .bind(TIDB_BACKUP_BACKEND) + .fetch_optional(&self.pool) + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}")))?; + + match native_storage_uri { + Some((Some(storage_uri),)) => { + self.delete_native_backup_storage(&storage_uri).await?; + } + Some((None,)) => { + return Err(StorageError::Internal(format!( + "Backup missing TiDB BR storage URI: {backup_arn}" + ))); + } + None => {} + } + + sqlx::query("DELETE FROM backup_indexes WHERE backup_arn = ?") + .bind(&backup_arn) + .execute(&self.pool) + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}")))?; + + sqlx::query("DELETE FROM backup_tags WHERE backup_arn = ?") + .bind(&backup_arn) + .execute(&self.pool) + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}")))?; + + sqlx::query( + "UPDATE backups SET backup_status = 'DELETED' \ + WHERE backup_arn = ? AND account_id = ?", + ) + .bind(&backup_arn) + .bind(&account_id) + .execute(&self.pool) + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}")))?; + + Ok(BackupDescription { + backup_details: BackupDetails { + backup_status: "DELETED".to_owned(), + ..desc.backup_details + }, + ..desc + }) + }) + } + + fn restore_table_from_backup( + &self, + account_id: &str, + target_table_name: &str, + backup_arn: &str, + ) -> BoxFuture<'_, Result> { + let account_id = account_id.to_string(); + let target_table_name = target_table_name.to_string(); + let backup_arn = backup_arn.to_string(); + Box::pin(async move { + let backup_row: BackupRestoreRow = sqlx::query_as( + "SELECT key_schema, attribute_definitions, billing_mode, \ + provisioned_throughput, item_count, backup_backend, storage_uri, physical_table_name \ + FROM backups \ + WHERE backup_arn = ? AND account_id = ? 
AND backup_status = 'AVAILABLE'", + ) + .bind(&backup_arn) + .bind(&account_id) + .fetch_optional(&self.pool) + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}")))? + .ok_or_else(|| StorageError::Validation(format!("Backup not found: {backup_arn}")))?; + + if backup_row.backup_backend != TIDB_BACKUP_BACKEND { + return Err(StorageError::Validation( + "TiDB can restore only native BR backups".to_owned(), + )); + } + + let storage_uri = backup_row.storage_uri.ok_or_else(|| { + StorageError::Internal(format!("Backup missing TiDB BR storage URI: {backup_arn}")) + })?; + let source_physical_table = backup_row.physical_table_name.ok_or_else(|| { + StorageError::Internal(format!( + "Backup missing TiDB physical table name: {backup_arn}" + )) + })?; + + let backup_index_rows: Vec = sqlx::query_as( + "SELECT index_id, index_name, index_type, key_schema, projection, provisioned_throughput \ + FROM backup_indexes WHERE backup_arn = ? ORDER BY index_type, index_name", + ) + .bind(&backup_arn) + .fetch_all(&self.pool) + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}")))?; + + let database = self.data_database_name().await?; + if self + .physical_table_exists(&database, &source_physical_table) + .await? 
+ { + return Err(StorageError::Validation(format!( + "TiDB BR restores table backups to their original physical table name \ + ({source_physical_table}); restore requires an empty or conflict-free \ + TiDB target" + ))); + } + + let key_schema: Vec = + parse_json(backup_row.key_schema, "backup key schema")?; + let attr_defs = parse_json(backup_row.attribute_definitions, "backup attr defs")?; + let mut gsis = Vec::new(); + let mut lsis = Vec::new(); + for index in &backup_index_rows { + let index_key_schema = + parse_json(index.key_schema.clone(), "backup index key schema")?; + let projection = parse_json(index.projection.clone(), "backup index projection")?; + match index.index_type.as_str() { + "GSI" => gsis.push(GsiInput { + index_name: index.index_name.clone(), + key_schema: index_key_schema, + projection, + provisioned_throughput: parse_optional_index_provisioned_throughput( + index.provisioned_throughput.clone(), + "backup index provisioned throughput", + )?, + }), + "LSI" => lsis.push(LsiInput { + index_name: index.index_name.clone(), + key_schema: index_key_schema, + projection, + }), + other => { + return Err(StorageError::Internal(format!( + "Invalid backup index type: {other}" + ))); + } + } + } + + let create_input = CreateTableInput { + table_name: target_table_name.to_owned(), + key_schema, + attribute_definitions: attr_defs, + billing_mode: Some(parse_billing_mode(&backup_row.billing_mode)?), + provisioned_throughput: parse_optional_json::( + backup_row.provisioned_throughput, + "backup provisioned throughput", + )?, + global_secondary_indexes: (!gsis.is_empty()).then_some(gsis), + local_secondary_indexes: (!lsis.is_empty()).then_some(lsis), + stream_specification: None, + tags: None, + deletion_protection_enabled: Some(false), + sse_specification: None, + table_class: None, + }; + + let desc = self + .create_table_impl_with_activation( + &account_id, + create_input, + CreateTableActivation::Deferred, + ) + .await?; + + let restore_result = async { + 
for index in &backup_index_rows { + sqlx::query( + "UPDATE indexes SET index_id = ? \ + WHERE table_id = ? AND index_name = ?", + ) + .bind(&index.index_id) + .bind(&desc.table_id) + .bind(&index.index_name) + .execute(&self.pool) + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}")))?; + } + + self.native_backup + .run(BrAction::RestoreTable { + database: &database, + table: &source_physical_table, + storage_uri: &storage_uri, + }) + .await?; + + let target_physical_table = physical_data_table_name(&desc.table_id); + self.rename_physical_table(&source_physical_table, &target_physical_table) + .await?; + + // DynamoDB restores table data, not TTL settings. BR restores the + // source table's physical shape, so normalize restored TiDB TTL + // artifacts before the catalog row becomes ACTIVE. + drop_ttl_artifacts(&self.data_pool, &desc.table_id).await?; + + sqlx::query( + "UPDATE tables SET item_count = ?, table_status = 'ACTIVE', \ + status_transition_at = NULL WHERE table_id = ?", + ) + .bind(backup_row.item_count) + .bind(&desc.table_id) + .execute(&self.pool) + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}")))?; + + Ok::<(), StorageError>(()) + } + .await; + + if let Err(err) = restore_result { + self.drop_physical_table_if_exists(&source_physical_table) + .await; + self.cleanup_failed_restore_table(&desc).await; + return Err(err); + } + + Ok(desc) + }) + } + + fn describe_continuous_backups( + &self, + account_id: &str, + table_name: &str, + ) -> BoxFuture<'_, Result> { + let account_id = account_id.to_string(); + let table_name = table_name.to_string(); + Box::pin(async move { + let exists: bool = sqlx::query_scalar( + "SELECT EXISTS(SELECT 1 FROM tables WHERE account_id = ? 
AND table_name = ?)", + ) + .bind(&account_id) + .bind(&table_name) + .fetch_one(&self.pool) + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}")))?; + + if !exists { + return Err(StorageError::TableNotFound(format!( + "Table not found: {table_name}" + ))); + } + + let pitr_row: Option<( + bool, + Option, + Option, + )> = sqlx::query_as( + "SELECT pitr_enabled, earliest_restorable, latest_restorable \ + FROM continuous_backups WHERE account_id = ? AND table_name = ?", + ) + .bind(&account_id) + .bind(&table_name) + .fetch_optional(&self.pool) + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}")))?; + + let (pitr_enabled, earliest, latest) = pitr_row + .map_or((false, None, None), |(enabled, earliest, latest)| { + (enabled, earliest, latest) + }); + + Ok(ContinuousBackupsDescription { + continuous_backups_status: "ENABLED".to_owned(), + point_in_time_recovery_description: Some(PointInTimeRecoveryDescription { + point_in_time_recovery_status: if pitr_enabled { + "ENABLED".to_owned() + } else { + "DISABLED".to_owned() + }, + earliest_restorable_date_time: earliest.map(timestamp_to_epoch), + latest_restorable_date_time: latest.map(timestamp_to_epoch), + }), + }) + }) + } + + fn update_continuous_backups( + &self, + account_id: &str, + table_name: &str, + pitr_enabled: bool, + ) -> BoxFuture<'_, Result> { + let account_id = account_id.to_string(); + let table_name = table_name.to_string(); + Box::pin(async move { + let exists: bool = sqlx::query_scalar( + "SELECT EXISTS(SELECT 1 FROM tables WHERE account_id = ? 
AND table_name = ?)", + ) + .bind(&account_id) + .bind(&table_name) + .fetch_one(&self.pool) + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}")))?; + + if !exists { + return Err(StorageError::TableNotFound(format!( + "Table not found: {table_name}" + ))); + } + + if pitr_enabled { + return Err(StorageError::Validation( + "TiDB table-level point-in-time recovery is not supported; \ + TiDB BR PITR restores into an empty or conflict-free target cluster" + .to_owned(), + )); + } + + sqlx::query( + "INSERT INTO continuous_backups \ + (account_id, table_name, pitr_enabled, earliest_restorable, latest_restorable) \ + VALUES (?, ?, ?, NULL, NULL) \ + ON DUPLICATE KEY UPDATE pitr_enabled = VALUES(pitr_enabled)", + ) + .bind(&account_id) + .bind(&table_name) + .bind(pitr_enabled) + .execute(&self.pool) + .await + .map_err(|e| StorageError::Internal(format!("Database error: {e}")))?; + + self.describe_continuous_backups(&account_id, &table_name) + .await + }) + } + + fn restore_table_to_point_in_time( + &self, + _account_id: &str, + _source_table_name: &str, + _target_table_name: &str, + ) -> BoxFuture<'_, Result> { + Box::pin(async move { + Err(StorageError::Validation( + "TiDB BR PITR restores to an empty/conflict-free cluster; \ + table-level PITR into a live ExtendDB table is not supported" + .to_owned(), + )) + }) + } +} + +#[cfg(test)] +mod tests { + use super::{ + BrAction, TidbNativeBackupConfig, backup_storage_uri, local_backup_path, uri_is_under_base, + }; + use extenddb_storage::config::NativeBackupConfig; + + #[test] + fn builds_tiup_br_backup_command() { + let cfg = TidbNativeBackupConfig::from_storage_config(NativeBackupConfig { + coordinator_endpoint: Some("127.0.0.1:2379".to_owned()), + storage_uri: Some("s3://bucket/extenddb".to_owned()), + send_credentials_to_storage_nodes: Some(false), + ..NativeBackupConfig::default() + }); + + let args = cfg + .command_args(BrAction::BackupTable { + database: "extenddb_data", + table: 
"_ddb_123", + storage_uri: "s3://bucket/extenddb/snapshots/a/t/1", + backup_tso: 450456244814610433, + }) + .expect("command should build"); + let rendered: Vec<_> = args.iter().map(|arg| arg.to_string_lossy()).collect(); + assert_eq!( + rendered, + vec![ + "br", + "backup", + "table", + "--pd", + "127.0.0.1:2379", + "--db", + "extenddb_data", + "--table", + "_ddb_123", + "--storage", + "s3://bucket/extenddb/snapshots/a/t/1", + "--backupts", + "450456244814610433", + "--send-credentials-to-tikv=false", + ] + ); + } + + #[test] + fn builds_direct_br_restore_command() { + let cfg = TidbNativeBackupConfig::from_storage_config(NativeBackupConfig { + binary: Some("br".to_owned()), + component: Some(String::new()), + coordinator_endpoint: Some("pd:2379".to_owned()), + storage_uri: Some("local:///backup".to_owned()), + send_credentials_to_storage_nodes: Some(false), + ..NativeBackupConfig::default() + }); + + let args = cfg + .command_args(BrAction::RestoreTable { + database: "extenddb_data", + table: "_ddb_source", + storage_uri: "local:///backup/snapshots/a/t/1", + }) + .expect("command should build"); + let rendered: Vec<_> = args.iter().map(|arg| arg.to_string_lossy()).collect(); + assert_eq!( + rendered, + vec![ + "restore", + "table", + "--pd", + "pd:2379", + "--db", + "extenddb_data", + "--table", + "_ddb_source", + "--storage", + "local:///backup/snapshots/a/t/1", + "--send-credentials-to-tikv=false", + ] + ); + } + + #[test] + fn backup_uri_is_stable_under_base_slashes() { + assert_eq!( + backup_storage_uri("s3://bucket/root/", "acct", "table-id", 42), + "s3://bucket/root/snapshots/acct/table-id/42" + ); + } + + #[test] + fn delete_uri_must_stay_under_configured_backup_root() { + assert!(uri_is_under_base( + "local:///var/lib/extenddb/backups/", + "local:///var/lib/extenddb/backups/snapshots/a/t/1" + )); + assert!(!uri_is_under_base( + "local:///var/lib/extenddb/backups", + "local:///var/lib/extenddb/backups-other/snapshots/a/t/1" + )); + } + + #[test] + fn 
local_backup_path_accepts_only_absolute_local_uris() { + assert_eq!( + local_backup_path("local:///tmp/extenddb-backups/a").as_deref(), + Some(std::path::Path::new("/tmp/extenddb-backups/a")) + ); + assert!(local_backup_path("s3://bucket/backup").is_none()); + assert!(local_backup_path("local://relative/path").is_none()); + } +} diff --git a/crates/storage-tidb/src/bootstrapper.rs b/crates/storage-tidb/src/bootstrapper.rs new file mode 100755 index 0000000..c4b0c89 --- /dev/null +++ b/crates/storage-tidb/src/bootstrapper.rs @@ -0,0 +1,609 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! TiDB implementation of `Bootstrapper`. +//! +//! Handles `CREATE DATABASE`, schema migrations, user provisioning, and +//! teardown using TiDB-specific DDL. Connection pools are created +//! lazily as needed during the bootstrap sequence. + +use async_trait::async_trait; +use extenddb_storage::bootstrapper::{ + AdminBootstrapResult, BootstrapConfig, BootstrapOptions, Bootstrapper, +}; +use extenddb_storage::management_store::{OpError, OpResult}; +use sqlx::MySqlPool; +use sqlx::mysql::{MySqlConnectOptions, MySqlPoolOptions}; +use tokio::sync::OnceCell; + +use crate::CATALOG_VERSION; +use crate::migrations; + +/// Utilities for bootstrapping a TiDB backend store. +/// +/// Holds the bootstrap configuration and lazily-created connection pools. +/// The admin pool connects without selecting a database and is created lazily +/// on first use. Commands that only need the catalog database (e.g. `migrate`) +/// never open an admin connection. +pub struct TidbBootstrapper { + config: BootstrapConfig, + admin_pool: OnceCell, +} + +impl TidbBootstrapper { + /// Create a new bootstrapper. The admin pool is created lazily on + /// first use, so this constructor never opens a database connection. + pub fn new(config: BootstrapConfig) -> Self { + Self { + config, + admin_pool: OnceCell::new(), + } + } + + /// Connect to TiDB as the admin user eagerly. 
+ /// Equivalent to `new()` followed by an immediate admin pool init. + pub async fn connect(config: BootstrapConfig) -> OpResult { + let store = Self::new(config); + // Force admin pool creation to fail fast on connection errors. + store.admin_pool().await?; + Ok(store) + } + + /// Get or create the admin pool. + async fn admin_pool(&self) -> OpResult<&MySqlPool> { + self.admin_pool + .get_or_try_init(|| async { + let opts = MySqlConnectOptions::new() + .host(&self.config.host) + .port(self.config.port) + .username(&self.config.admin_user); + let opts = if let Some(ref pass) = self.config.admin_password { + opts.password(pass) + } else { + opts + }; + MySqlPoolOptions::new() + .max_connections(1) + .connect_with(opts) + .await + .map_err(|e| OpError::Internal(format!("Cannot connect as admin: {e}"))) + }) + .await + } + + /// Build `MySqlConnectOptions` for the application user connecting to a named database. + fn app_connect_opts(&self, database: &str) -> MySqlConnectOptions { + MySqlConnectOptions::new() + .host(&self.config.host) + .port(self.config.port) + .username(&self.config.app_user) + .password(&self.config.app_password) + .database(database) + } + + /// Build the connection URL for the application user and a named database. + fn app_connection_url(&self, database: &str) -> String { + crate::config::connection_url( + &self.config.app_user, + &self.config.app_password, + &self.config.host, + self.config.port, + database, + ) + } + + /// Open a one-shot pool to the given database as the application user. + async fn app_pool(&self, database: &str) -> OpResult { + MySqlPoolOptions::new() + .max_connections(1) + .connect_with(self.app_connect_opts(database)) + .await + .map_err(|e| OpError::Internal(format!("Cannot connect to {database}: {e}"))) + } + + /// Return the catalog connection URL (for config file generation). 
+ pub fn catalog_connection_url(&self) -> String { + self.app_connection_url(&self.config.catalog_db) + } +} + +#[async_trait] +impl Bootstrapper for TidbBootstrapper { + async fn ensure_app_user(&self) -> OpResult<()> { + let user = &self.config.app_user; + let password = &self.config.app_password; + let admin = self.admin_pool().await?; + + println!("--- Ensuring application user '{user}' exists..."); + let exists: bool = + sqlx::query_scalar("SELECT EXISTS(SELECT 1 FROM mysql.user WHERE User = ?)") + .bind(user) + .fetch_one(admin) + .await + .map_err(|e| OpError::Internal(format!("Check user exists: {e}")))?; + + if exists { + println!(" User '{user}' already exists."); + return Ok(()); + } + + // CREATE USER doesn't support parameterized account names/passwords in + // TiDB/MySQL DDL, so keep a strict allowlist before formatting. + // Strict allowlist prevents SQL injection via backslash, NUL, semicolon, newline. + if !user + .chars() + .all(|c| c.is_ascii_alphanumeric() || matches!(c, '_' | '-')) + { + return Err(OpError::Validation( + "Application user contains disallowed characters. \ + Only ASCII letters, digits, underscore, and hyphen are permitted." + .to_owned(), + )); + } + if !password + .chars() + .all(|c| c.is_ascii_alphanumeric() || "-_.,!@#$%^&*()+=~` ".contains(c)) + { + return Err(OpError::Validation( + "Application password contains disallowed characters. \ + Only ASCII letters, digits, and -_.,!@#$%^&*()+=~` space are permitted." 
+ .to_owned(), + )); + } + let sql = format!("CREATE USER IF NOT EXISTS '{user}'@'%' IDENTIFIED BY '{password}'"); + sqlx::query(&sql) + .execute(admin) + .await + .map_err(|e| OpError::Internal(format!("Create user: {e}")))?; + println!(" Created user '{user}'."); + Ok(()) + } + + async fn grant_app_role_to_admin(&self) -> OpResult<()> { + Ok(()) + } + + async fn create_catalog_db(&self) -> OpResult<()> { + create_database( + self.admin_pool().await?, + &self.config.catalog_db, + &self.config.app_user, + ) + .await + } + + async fn create_data_db(&self) -> OpResult<()> { + create_database( + self.admin_pool().await?, + &self.config.data_db, + &self.config.app_user, + ) + .await + } + + async fn run_catalog_migrations(&self) -> OpResult<()> { + let pool = self.app_pool(&self.config.catalog_db).await?; + migrations::run_catalog_migrations(&pool).await + } + + async fn run_data_migrations(&self) -> OpResult<()> { + let pool = self.app_pool(&self.config.data_db).await?; + migrations::run_data_migrations(&pool).await + } + + async fn record_data_connection(&self) -> OpResult<()> { + let pool = self.app_pool(&self.config.catalog_db).await?; + let data_conn = self.app_connection_url(&self.config.data_db); + + println!("--- Recording data database connection in catalog..."); + sqlx::query( + "INSERT INTO settings (`key`, value) VALUES ('data_database_connection_string', ?) \ + ON DUPLICATE KEY UPDATE value = VALUES(value)", + ) + .bind(&data_conn) + .execute(&pool) + .await + .map_err(|e| OpError::Internal(format!("Record data connection: {e}")))?; + + sqlx::query( + "INSERT INTO settings (`key`, value) VALUES ('data_database_name', ?) 
\ + ON DUPLICATE KEY UPDATE value = VALUES(value)", + ) + .bind(&self.config.data_db) + .execute(&pool) + .await + .map_err(|e| OpError::Internal(format!("Record data db name: {e}")))?; + + Ok(()) + } + + async fn bootstrap_encryption_key(&self) -> OpResult<()> { + use aes_gcm::KeyInit; + use base64::Engine; + + let pool = self.app_pool(&self.config.catalog_db).await?; + let exists: bool = sqlx::query_scalar( + "SELECT EXISTS(SELECT 1 FROM settings WHERE `key` = 'encryption_key')", + ) + .fetch_one(&pool) + .await + .map_err(|e| OpError::Internal(format!("Check encryption key: {e}")))?; + + if exists { + println!("--- Encryption key already exists, skipping."); + return Ok(()); + } + + println!("--- Generating AES-256-GCM encryption key..."); + let key = aes_gcm::Aes256Gcm::generate_key(&mut aes_gcm::aead::OsRng); + let key_b64 = base64::engine::general_purpose::STANDARD.encode(key); + + sqlx::query("INSERT IGNORE INTO settings (`key`, value) VALUES ('encryption_key', ?)") + .bind(&key_b64) + .execute(&pool) + .await + .map_err(|e| OpError::Internal(format!("Store encryption key: {e}")))?; + + println!(" Encryption key stored."); + Ok(()) + } + + async fn bootstrap_default_account(&self) -> OpResult<()> { + let pool = self.app_pool(&self.config.catalog_db).await?; + let exists: bool = sqlx::query_scalar("SELECT EXISTS(SELECT 1 FROM accounts)") + .fetch_one(&pool) + .await + .map_err(|e| OpError::Internal(format!("Check accounts: {e}")))?; + + if exists { + println!("--- Default account already exists, skipping."); + return Ok(()); + } + + let account_id = generate_account_id(); + println!("--- Creating default account '{account_id}'..."); + sqlx::query( + "INSERT INTO accounts (account_id, account_name) VALUES (?, ?) 
\ + ON DUPLICATE KEY UPDATE account_id = account_id", + ) + .bind(&account_id) + .bind("default") + .execute(&pool) + .await + .map_err(|e| OpError::Internal(format!("Create account: {e}")))?; + + println!(" Account ID: {account_id}"); + Ok(()) + } + + async fn bootstrap_admin_user( + &self, + env_user: Option<&str>, + env_password: Option<&str>, + ) -> OpResult { + let pool = self.app_pool(&self.config.catalog_db).await?; + let admin_name = env_user.filter(|s| !s.is_empty()).unwrap_or("admin"); + + let exists: bool = + sqlx::query_scalar("SELECT EXISTS(SELECT 1 FROM admin_users WHERE admin_name = ?)") + .bind(admin_name) + .fetch_one(&pool) + .await + .map_err(|e| OpError::Internal(format!("Check admin user: {e}")))?; + + if exists { + println!("--- Admin user '{admin_name}' already exists, skipping."); + return Ok(AdminBootstrapResult { + username: admin_name.to_owned(), + generated_password: None, + already_existed: true, + from_env: false, + }); + } + + println!("--- Creating admin user '{admin_name}'..."); + let (password, from_env) = match env_password { + Some(p) if !p.is_empty() => (p.to_owned(), true), + _ => (generate_random_password(), false), + }; + let pw_clone = password.clone(); + let hash = + tokio::task::spawn_blocking(move || bcrypt::hash(pw_clone, bcrypt::DEFAULT_COST)) + .await + .map_err(|e| OpError::Internal(format!("bcrypt hash task failed: {e}")))? + .map_err(|e| OpError::Internal(format!("bcrypt hash failed: {e}")))?; + + sqlx::query( + "INSERT INTO admin_users (admin_name, password_hash) VALUES (?, ?) 
\ + ON DUPLICATE KEY UPDATE admin_name = admin_name", + ) + .bind(admin_name) + .bind(&hash) + .execute(&pool) + .await + .map_err(|e| OpError::Internal(format!("Create admin user: {e}")))?; + + Ok(AdminBootstrapResult { + username: admin_name.to_owned(), + generated_password: if from_env { None } else { Some(password) }, + already_existed: false, + from_env, + }) + } + + async fn is_catalog_initialized(&self) -> OpResult { + let pool = self.app_pool(&self.config.catalog_db).await?; + migrations::table_exists(&pool, "settings").await + } + + async fn list_table_names(&self) -> OpResult> { + let pool = match self.app_pool(&self.config.catalog_db).await { + Ok(p) => p, + Err(_) => return Ok(Vec::new()), + }; + let tables: Vec<(String,)> = + sqlx::query_as("SELECT table_name FROM tables ORDER BY table_name") + .fetch_all(&pool) + .await + .unwrap_or_default(); + Ok(tables.into_iter().map(|(n,)| n).collect()) + } + + async fn get_data_db_name(&self) -> OpResult> { + let pool = match self.app_pool(&self.config.catalog_db).await { + Ok(p) => p, + Err(_) => return Ok(None), + }; + let row = sqlx::query_as::<_, (String,)>( + "SELECT value FROM settings WHERE `key` = 'data_database_name'", + ) + .fetch_optional(&pool) + .await + .unwrap_or(None); + Ok(row.map(|(v,)| v)) + } + + async fn drop_databases(&self, data_db: &str) -> OpResult<()> { + let admin = self.admin_pool().await?; + if !data_db.is_empty() { + println!("--- Dropping data database '{data_db}'..."); + let sql = format!("DROP DATABASE IF EXISTS {}", quote_identifier(data_db)?); + sqlx::query(&sql) + .execute(admin) + .await + .map_err(|e| OpError::Internal(format!("Drop data database: {e}")))?; + } + + let catalog = &self.config.catalog_db; + println!("--- Dropping catalog database '{catalog}'..."); + let sql = format!("DROP DATABASE IF EXISTS {}", quote_identifier(catalog)?); + sqlx::query(&sql) + .execute(admin) + .await + .map_err(|e| OpError::Internal(format!("Drop catalog database: {e}")))?; + + Ok(()) + } 
+ + async fn read_catalog_version(&self) -> OpResult> { + let pool = self.app_pool(&self.config.catalog_db).await?; + + if !migrations::table_exists(&pool, "settings").await? { + return Ok(None); + } + + let row = sqlx::query_as::<_, (String,)>( + "SELECT value FROM settings WHERE `key` = 'catalog_version'", + ) + .fetch_optional(&pool) + .await + .map_err(|e| OpError::Internal(format!("Read catalog version: {e}")))?; + + Ok(row.map(|(v,)| v)) + } + + fn expected_catalog_version(&self) -> String { + CATALOG_VERSION.to_string() + } + + fn catalog_database_name(&self) -> String { + self.config.catalog_db.clone() + } + + fn endpoint_info(&self) -> String { + format!("{}:{}", self.config.host, self.config.port) + } + + fn catalog_connection_url(&self) -> String { + self.app_connection_url(&self.config.catalog_db) + } +} + +// ── Helpers ──────────────────────────────────────────────────────────── + +/// Create a database, aborting if it already exists. +async fn create_database(pool: &MySqlPool, name: &str, owner: &str) -> OpResult<()> { + println!("--- Creating database '{name}'..."); + let exists: bool = sqlx::query_scalar( + "SELECT EXISTS(SELECT 1 FROM information_schema.schemata WHERE schema_name = ?)", + ) + .bind(name) + .fetch_one(pool) + .await + .map_err(|e| OpError::Internal(format!("Check database exists: {e}")))?; + + if exists { + return Err(OpError::AlreadyExists(format!( + "Database '{name}' already exists. Run 'destroy' first, then re-run 'init'." + ))); + } + + // CREATE DATABASE doesn't support parameterized names. 
+ let sql = format!("CREATE DATABASE {}", quote_identifier(name)?); + sqlx::query(&sql) + .execute(pool) + .await + .map_err(|e| OpError::Internal(format!("Create database '{name}': {e}")))?; + if owner + .chars() + .all(|c| c.is_ascii_alphanumeric() || matches!(c, '_' | '-')) + { + let grant_sql = format!( + "GRANT ALL PRIVILEGES ON {}.* TO '{}'@'%'", + quote_identifier(name)?, + owner + ); + sqlx::query(&grant_sql) + .execute(pool) + .await + .map_err(|e| OpError::Internal(format!("Grant database '{name}': {e}")))?; + } + println!(" Created."); + Ok(()) +} + +fn quote_identifier(name: &str) -> OpResult { + if name.contains('`') || name.contains('\0') || !name.is_ascii() { + return Err(OpError::Validation( + "Database name contains invalid characters for TiDB identifiers".to_owned(), + )); + } + Ok(format!("`{name}`")) +} + +/// Generate a random 12-digit numeric account ID (matches AWS account ID format). +fn generate_account_id() -> String { + use rand::Rng; + let mut rng = rand::rng(); + let id: u64 = rng.random_range(100_000_000_000..1_000_000_000_000); + id.to_string() +} + +/// Generate a 24-character random password using alphanumeric characters only. +/// +/// Restricted to `[a-zA-Z0-9]` to avoid URL-encoding issues in form submissions, +/// shell copy-paste problems, and other contexts where special characters break. +/// At 24 characters from a 62-char alphabet, entropy is ~143 bits — more than sufficient. +fn generate_random_password() -> String { + use rand::Rng; + const CHARSET: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; + let mut rng = rand::rng(); + (0..24) + .map(|_| CHARSET[rng.random_range(0..CHARSET.len())] as char) + .collect() +} + +impl TidbBootstrapper { + /// Create a bootstrapper from config file and typed CLI options. 
+ pub async fn from_config( + config_path: &str, + options: BootstrapOptions, + ) -> Result { + use extenddb_storage::error::StorageError; + + // Load config file if it exists + let (host, port, user, password, catalog_db_name) = if std::path::Path::new(config_path) + .exists() + { + println!("--- Loading defaults from {}", config_path); + + // Parse connection string from config + let config_content = std::fs::read_to_string(config_path) + .map_err(|e| StorageError::Internal(format!("Failed to read config: {e}")))?; + let app_config: toml::Value = toml::from_str(&config_content) + .map_err(|e| StorageError::Internal(format!("Failed to parse config: {e}")))?; + + let conn_str = app_config + .get("storage") + .and_then(|s| s.get("tidb")) + .and_then(|p| p.get("connection_string")) + .and_then(|c| c.as_str()) + .ok_or_else(|| { + StorageError::Internal("Missing storage.tidb.connection_string".into()) + })?; + + let parts = crate::config::parse_connection_string(conn_str) + .map_err(|e| StorageError::Internal(format!("Invalid connection string: {e}")))?; + + // Check for conflicts between CLI args and config values + check_conflict(options.storage_host.as_ref(), &parts.host, "--storage-host")?; + check_conflict(options.storage_port.as_ref(), &parts.port, "--storage-port")?; + check_conflict(options.app_user.as_ref(), &parts.user, "--extenddb-user")?; + check_conflict( + options.app_password.as_ref(), + &parts.password, + "--extenddb-pass", + )?; + + if let Some(ref cli_catalog) = options.catalog_db { + if cli_catalog != &parts.database { + return Err(StorageError::Internal(format!( + "--catalog-db '{}' conflicts with config file catalog database '{}'", + cli_catalog, parts.database + ))); + } + } + + ( + parts.host, + parts.port, + parts.user, + parts.password, + parts.database, + ) + } else { + // No config file - use defaults + ( + "localhost".to_string(), + 4000, + "extenddb".to_string(), + "extenddb-local-dev".to_string(), + "extenddb_catalog".to_string(), + ) + 
}; + + // CLI args override config (or use config values if no CLI arg provided) + let resolved_host = options.storage_host.unwrap_or(host); + let resolved_port = options.storage_port.unwrap_or(port); + let resolved_admin_user = options.admin_user.unwrap_or_else(|| "root".to_owned()); + let resolved_catalog_db = options.catalog_db.unwrap_or(catalog_db_name); + let final_data_db = options.data_db.unwrap_or_else(|| { + resolved_catalog_db + .strip_suffix("_catalog") + .unwrap_or(&resolved_catalog_db) + .to_owned() + }); + let resolved_app_user = options.app_user.unwrap_or(user); + let resolved_app_password = options.app_password.unwrap_or(password); + + let config = BootstrapConfig { + host: resolved_host, + port: resolved_port, + admin_user: resolved_admin_user, + admin_password: options.admin_password, + app_user: resolved_app_user, + app_password: resolved_app_password, + catalog_db: resolved_catalog_db, + data_db: final_data_db, + }; + + Ok(Self::new(config)) + } +} + +/// Check that a CLI arg, if provided, matches the config value. +fn check_conflict( + cli_val: Option<&T>, + config_val: &T, + flag: &str, +) -> Result<(), extenddb_storage::error::StorageError> { + if let Some(v) = cli_val { + if v != config_val { + return Err(extenddb_storage::error::StorageError::Internal(format!( + "{} value '{}' conflicts with config file value '{}'", + flag, v, config_val + ))); + } + } + Ok(()) +} diff --git a/crates/storage-tidb/src/catalog_store.rs b/crates/storage-tidb/src/catalog_store.rs new file mode 100755 index 0000000..9533994 --- /dev/null +++ b/crates/storage-tidb/src/catalog_store.rs @@ -0,0 +1,399 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! TiDB implementations of `SettingsStore`, `MetricsStore`, and +//! `RateLimitStore`. +//! +//! `TidbCatalogStore` wraps a `MySqlPool` connected to the catalog database +//! and implements the three operational traits defined in `extenddb_storage`. +//! 
This decouples callers from direct `sqlx::MySqlPool` usage, enabling +//! alternative storage backends. + +use std::sync::Arc; + +use extenddb_storage::management_store::{MetricsRow, OpError, OpResult}; +use futures::future::BoxFuture; +use sqlx::MySqlPool; + +/// TiDB-backed catalog store for settings, metrics, and rate limiting. +/// +/// Holds a connection pool to the catalog database. Created once at startup +/// and shared (via `Arc`) across management API handlers and background workers. +pub struct TidbCatalogStore { + pool: MySqlPool, + /// P119: Cached encryption key (immutable after bootstrap). Avoids + /// per-request DB query on access key and assume-role operations. + encryption_key: Option>, +} + +impl TidbCatalogStore { + /// Create a new catalog store wrapping the given pool. + pub fn new(pool: MySqlPool) -> Self { + Self { + pool, + encryption_key: None, + } + } + + /// Create a new catalog store with a pre-loaded encryption key (P119). + pub fn with_encryption_key(pool: MySqlPool, encryption_key: String) -> Self { + Self { + pool, + encryption_key: Some(Arc::from(encryption_key.as_str())), + } + } + + /// Borrow the underlying pool (escape hatch for callers not yet migrated). + pub fn pool(&self) -> &MySqlPool { + &self.pool + } + + /// Get the cached encryption key. Returns `None` if not loaded at startup. 
+ pub fn encryption_key(&self) -> Option<&Arc> { + self.encryption_key.as_ref() + } +} + +// ── SettingsStore ────────────────────────────────────────────────────── + +impl extenddb_storage::management_store::SettingsStore for TidbCatalogStore { + fn get_setting(&self, key: &str) -> futures::future::BoxFuture<'_, OpResult>> { + let key = key.to_string(); + let pool = self.pool.clone(); + Box::pin(async move { + let row: Option<(String,)> = + sqlx::query_as("SELECT value FROM settings WHERE `key` = ?") + .bind(&key) + .fetch_optional(&pool) + .await + .map_err(|e| { + tracing::error!("get_setting: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + Ok(row.map(|(v,)| v)) + }) + } + + fn set_setting(&self, key: &str, value: &str) -> futures::future::BoxFuture<'_, OpResult<()>> { + let key = key.to_string(); + let value = value.to_string(); + let pool = self.pool.clone(); + Box::pin(async move { + sqlx::query( + "INSERT INTO settings (`key`, value) VALUES (?, ?) \ + ON DUPLICATE KEY UPDATE value = VALUES(value)", + ) + .bind(&key) + .bind(&value) + .execute(&pool) + .await + .map_err(|e| { + tracing::error!("set_setting: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + Ok(()) + }) + } + + fn list_settings(&self) -> futures::future::BoxFuture<'_, OpResult>> { + let pool = self.pool.clone(); + Box::pin(async move { + sqlx::query_as("SELECT `key`, value FROM settings ORDER BY `key`") + .fetch_all(&pool) + .await + .map_err(|e| { + tracing::error!("list_settings: {e}"); + OpError::Internal("Database error".to_owned()) + }) + }) + } + + fn cached_encryption_key(&self) -> Option { + self.encryption_key.as_ref().map(|k| k.to_string()) + } +} + +// ── DiagnosticsStore ─────────────────────────────────────────────────── + +impl extenddb_storage::diagnostics::DiagnosticsStore for TidbCatalogStore { + fn count_tables( + &self, + ) -> futures::future::BoxFuture<'_, extenddb_storage::diagnostics::DiagResult> { + let pool = self.pool.clone(); + 
Box::pin(async move { + let (count,): (i64,) = sqlx::query_as("SELECT COUNT(*) FROM tables") + .fetch_one(&pool) + .await + .map_err(|e| { + extenddb_storage::diagnostics::DiagError::QueryFailed(e.to_string()) + })?; + Ok(count) + }) + } + + fn count_indexes( + &self, + ) -> futures::future::BoxFuture<'_, extenddb_storage::diagnostics::DiagResult> { + let pool = self.pool.clone(); + Box::pin(async move { + let (count,): (i64,) = sqlx::query_as("SELECT COUNT(*) FROM indexes") + .fetch_one(&pool) + .await + .map_err(|e| { + extenddb_storage::diagnostics::DiagError::QueryFailed(e.to_string()) + })?; + Ok(count) + }) + } + + fn test_data_database_connection( + &self, + ) -> futures::future::BoxFuture<'_, extenddb_storage::diagnostics::DiagResult> { + let pool = self.pool.clone(); + Box::pin(async move { + // Get data database connection string and name from settings + let conn_row: Option<(String,)> = sqlx::query_as( + "SELECT value FROM settings WHERE `key` = 'data_database_connection_string'", + ) + .fetch_optional(&pool) + .await + .map_err(|e| extenddb_storage::diagnostics::DiagError::QueryFailed(e.to_string()))?; + + let name_row: Option<(String,)> = + sqlx::query_as("SELECT value FROM settings WHERE `key` = 'data_database_name'") + .fetch_optional(&pool) + .await + .map_err(|e| { + extenddb_storage::diagnostics::DiagError::QueryFailed(e.to_string()) + })?; + + match (conn_row, name_row) { + (Some((conn,)), Some((name,))) => { + // Test connection + sqlx::mysql::MySqlPoolOptions::new() + .max_connections(1) + .connect(&conn) + .await + .map_err(|e| { + extenddb_storage::diagnostics::DiagError::ConnectionFailed( + e.to_string(), + ) + })?; + Ok(name) + } + _ => Err(extenddb_storage::diagnostics::DiagError::QueryFailed( + "Data database not configured".to_string(), + )), + } + }) + } +} + +// ── MetricsStore ─────────────────────────────────────────────────────── + +impl extenddb_storage::management_store::MetricsStore for TidbCatalogStore { + fn 
insert_metrics(&self, rows: &[MetricsRow]) -> BoxFuture<'_, OpResult<()>> { + let rows = rows.to_vec(); + Box::pin(async move { + for row in &rows { + let result = sqlx::query( + "INSERT INTO metrics \ + (bucket, metric, table_name, index_name, operation, sum, count, min, max) \ + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) \ + ON DUPLICATE KEY UPDATE sum = metrics.sum + VALUES(sum), \ + count = metrics.count + VALUES(count), \ + min = LEAST(metrics.min, VALUES(min)), \ + max = GREATEST(metrics.max, VALUES(max))", + ) + .bind(row.bucket) + .bind(&row.metric) + .bind(row.table_name.as_deref().unwrap_or("")) + .bind(row.index_name.as_deref().unwrap_or("")) + .bind(row.operation.as_deref().unwrap_or("")) + .bind(row.sum) + .bind(row.count) + .bind(row.min) + .bind(row.max) + .execute(&self.pool) + .await; + if let Err(e) = result { + tracing::warn!("Failed to upsert metrics row: {e}"); + } + } + Ok(()) + }) + } + + fn query_metrics( + &self, + start: time::OffsetDateTime, + end: time::OffsetDateTime, + table_name: Option<&str>, + metric: Option<&str>, + ) -> BoxFuture<'_, OpResult>> { + let table_name = table_name.map(|s| s.to_owned()); + let metric = metric.map(|s| s.to_owned()); + Box::pin(async move { + let mut sql = String::from( + "SELECT bucket, metric, table_name, index_name, operation, \ + sum, count, min, max \ + FROM metrics WHERE bucket >= ? AND bucket <= ?", + ); + + let table_filter = table_name.as_deref().filter(|s| !s.is_empty()); + if table_filter.is_some() { + sql.push_str(" AND table_name = ?"); + } + if metric.is_some() { + sql.push_str(" AND metric = ?"); + } + sql.push_str(" ORDER BY bucket"); + + // Build the query with dynamic binds. 
+ let mut query = sqlx::query_as::<_, DbMetricsRow>(&sql) + .bind(start) + .bind(end); + if let Some(tn) = table_filter { + query = query.bind(tn); + } + if let Some(mn) = metric.as_deref() { + query = query.bind(mn); + } + + let rows = query.fetch_all(&self.pool).await.map_err(|e| { + tracing::warn!("query_metrics: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + Ok(rows + .into_iter() + .map(|r| MetricsRow { + bucket: r.bucket, + metric: r.metric, + table_name: if r.table_name.is_empty() { + None + } else { + Some(r.table_name) + }, + index_name: if r.index_name.is_empty() { + None + } else { + Some(r.index_name) + }, + operation: if r.operation.is_empty() { + None + } else { + Some(r.operation) + }, + sum: r.sum, + count: r.count, + min: r.min, + max: r.max, + }) + .collect()) + }) + } + + fn prune_metrics(&self, _retention: std::time::Duration) -> BoxFuture<'_, OpResult<()>> { + Box::pin(async move { + // TiDB native TTL owns fixed 24-hour metrics retention. + Ok(()) + }) + } +} + +/// Internal row type for `sqlx::FromRow` derivation. +#[derive(sqlx::FromRow)] +struct DbMetricsRow { + bucket: time::OffsetDateTime, + metric: String, + table_name: String, + index_name: String, + operation: String, + sum: f64, + count: i64, + min: f64, + max: f64, +} + +// ── RateLimitStore ───────────────────────────────────────────────────── + +impl extenddb_storage::management_store::RateLimitStore for TidbCatalogStore { + fn count_principal_failures( + &self, + principal: &str, + window_seconds: i64, + ) -> BoxFuture<'_, OpResult> { + let principal = principal.to_owned(); + Box::pin(async move { + let row: (i64,) = sqlx::query_as( + "SELECT COUNT(*) FROM login_attempts \ + WHERE principal = ? AND success = false \ + AND attempted_at > DATE_SUB(CURRENT_TIMESTAMP(6), INTERVAL ? 
SECOND)", + ) + .bind(&principal) + .bind(window_seconds) + .fetch_one(&self.pool) + .await + .map_err(|e| { + tracing::error!("count_principal_failures: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + Ok(row.0) + }) + } + + fn count_ip_failures( + &self, + source_ip: &str, + window_seconds: i64, + ) -> BoxFuture<'_, OpResult> { + let source_ip = source_ip.to_owned(); + Box::pin(async move { + let row: (i64,) = sqlx::query_as( + "SELECT COUNT(*) FROM login_attempts \ + WHERE source_ip = ? AND success = false \ + AND attempted_at > DATE_SUB(CURRENT_TIMESTAMP(6), INTERVAL ? SECOND)", + ) + .bind(&source_ip) + .bind(window_seconds) + .fetch_one(&self.pool) + .await + .map_err(|e| { + tracing::error!("count_ip_failures: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + Ok(row.0) + }) + } + + fn record_failed_login(&self, principal: &str, source_ip: Option<&str>) -> BoxFuture<'_, ()> { + let principal = principal.to_owned(); + let source_ip = source_ip.map(|s| s.to_owned()); + Box::pin(async move { + let result = sqlx::query( + "INSERT INTO login_attempts (principal, success, source_ip) VALUES (?, false, ?)", + ) + .bind(&principal) + .bind(source_ip.as_deref()) + .execute(&self.pool) + .await; + if let Err(e) = result { + tracing::error!("Failed to record login attempt: {e}"); + } + }) + } + + fn cleanup_old_attempts(&self, _max_age_seconds: i64) -> BoxFuture<'_, ()> { + Box::pin(async move { + // TiDB native TTL owns fixed 24-hour login-attempt retention. 
+ }) + } +} + +// Implement CatalogStore supertrait +impl extenddb_storage::CatalogStore for TidbCatalogStore { + fn cached_encryption_key(&self) -> Option { + self.encryption_key.as_ref().map(|arc| arc.to_string()) + } +} diff --git a/crates/storage-tidb/src/config.rs b/crates/storage-tidb/src/config.rs new file mode 100755 index 0000000..28ebd88 --- /dev/null +++ b/crates/storage-tidb/src/config.rs @@ -0,0 +1,257 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! TiDB connection configuration. + +use percent_encoding::{NON_ALPHANUMERIC, percent_decode_str, utf8_percent_encode}; +use serde::Deserialize; +use url::Url; + +#[derive(Debug, Clone, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct TidbStorageConfig { + #[serde(default = "default_connection_string")] + pub connection_string: String, + #[serde(default = "default_pool_size")] + pub pool_size: u32, + /// Maximum connections for the management/catalog pool (authz, IAM, console). + /// Defaults to `pool_size` if not set. + #[serde(default)] + pub catalog_pool_size: Option, + #[serde(default)] + pub backup: TidbBackupConfig, +} + +#[derive(Debug, Clone, Default, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct TidbBackupConfig { + /// Backup executable. Defaults to `tiup`; use `br` with `component = ""` + /// when the BR binary is installed directly. + #[serde(default)] + pub binary: Option, + /// Optional component/subcommand after `binary`. Defaults to `br`. + #[serde(default)] + pub component: Option, + /// PD endpoint passed to BR, for example `127.0.0.1:2379`. + #[serde(default)] + pub pd_endpoint: Option, + /// Base external storage URI for snapshot backups. + #[serde(default)] + pub storage_uri: Option, + /// Base external storage URI for log backup / PITR. + #[serde(default)] + pub log_storage_uri: Option, + /// Maps to BR's `--send-credentials-to-tikv` flag. 
+ #[serde(default)] + pub send_credentials_to_tikv: Option, +} + +impl Default for TidbStorageConfig { + fn default() -> Self { + Self { + connection_string: default_connection_string(), + pool_size: default_pool_size(), + catalog_pool_size: None, + backup: TidbBackupConfig::default(), + } + } +} + +fn default_connection_string() -> String { + "mysql://extenddb:extenddb-local-dev@localhost:4000/extenddb_catalog".to_owned() +} + +fn default_pool_size() -> u32 { + 20 +} + +/// Parsed components of a `TiDB` connection string. +pub struct ConnParts { + pub user: String, + pub password: String, + pub host: String, + pub port: u16, + pub database: String, +} + +fn decode_url_component(value: &str, label: &str) -> anyhow::Result { + percent_decode_str(value) + .decode_utf8() + .map(|decoded| decoded.into_owned()) + .map_err(|e| anyhow::anyhow!("Invalid percent-encoding in {label}: {e}")) +} + +fn encode_url_component(value: &str) -> String { + utf8_percent_encode(value, NON_ALPHANUMERIC).to_string() +} + +pub(crate) fn connection_url( + user: &str, + password: &str, + host: &str, + port: u16, + database: &str, +) -> String { + format!( + "mysql://{}:{}@{}:{}/{}", + encode_url_component(user), + encode_url_component(password), + host, + port, + encode_url_component(database), + ) +} + +/// Parse host, port, user, password, and database from a `TiDB` connection string. +/// +/// Handles the standard `mysql://user:pass@host:port/db` format used by TiDB's +/// MySQL-compatible wire protocol. `tidb://` is accepted for CLI parsing, but +/// generated configs use `mysql://` so `sqlx` can connect directly. +/// +/// # Errors +/// +/// Returns an error if the connection string doesn't match the expected format. 
+pub fn parse_connection_string(conn: &str) -> anyhow::Result { + let normalized = sqlx_connection_string(conn); + let url = Url::parse(&normalized) + .map_err(|e| anyhow::anyhow!("Invalid TiDB connection string: {e}"))?; + + if url.scheme() != "mysql" { + return Err(anyhow::anyhow!( + "Connection string must start with mysql:// or tidb://" + )); + } + + let host = url + .host_str() + .ok_or_else(|| anyhow::anyhow!("Connection string missing host"))? + .to_owned(); + let port = url + .port() + .ok_or_else(|| anyhow::anyhow!("Connection string missing :port"))?; + let database = url.path().strip_prefix('/').unwrap_or(url.path()); + if database.is_empty() { + return Err(anyhow::anyhow!("Connection string missing /database")); + } + if database.contains('/') { + return Err(anyhow::anyhow!("Database name must not contain '/'")); + } + + let user = decode_url_component(url.username(), "username")?; + let password = decode_url_component(url.password().unwrap_or_default(), "password")?; + let database = decode_url_component(database, "database")?; + + Ok(ConnParts { + user, + password, + host, + port, + database, + }) +} + +/// Convert a TiDB-friendly URL into the `mysql://` URL scheme expected by sqlx. +pub fn sqlx_connection_string(conn: &str) -> String { + conn.strip_prefix("tidb://") + .map_or_else(|| conn.to_owned(), |rest| format!("mysql://{rest}")) +} + +/// Redact the password in a TiDB/MySQL connection string without exposing +/// credentials that contain `@` or `:` characters. 
+pub fn redact_connection_string(conn: &str) -> String { + let uses_tidb_scheme = conn.starts_with("tidb://"); + let normalized = sqlx_connection_string(conn); + if let Ok(mut url) = Url::parse(&normalized) { + if url.password().is_some() { + let _ = url.set_password(Some("***")); + } + let redacted = url.to_string(); + return if uses_tidb_scheme { + redacted + .strip_prefix("mysql://") + .map_or(redacted.clone(), |rest| format!("tidb://{rest}")) + } else { + redacted + }; + } + + let Some(scheme_end) = conn.find("://").map(|i| i + 3) else { + return conn.to_owned(); + }; + let Some(at) = conn.rfind('@') else { + return conn.to_owned(); + }; + let Some(colon) = conn[scheme_end..at].rfind(':').map(|i| scheme_end + i) else { + return conn.to_owned(); + }; + format!("{}:***@{}", &conn[..colon], &conn[at + 1..]) +} + +#[cfg(test)] +mod tests { + use super::{connection_url, parse_connection_string, redact_connection_string}; + + #[test] + fn parses_percent_encoded_credentials() { + let parts = + parse_connection_string("mysql://extend%40db:p%40ss%2Fword@localhost:4000/db%2D1") + .expect("connection string should parse"); + assert_eq!(parts.user, "extend@db"); + assert_eq!(parts.password, "p@ss/word"); + assert_eq!(parts.host, "localhost"); + assert_eq!(parts.port, 4000); + assert_eq!(parts.database, "db-1"); + } + + #[test] + fn generated_urls_round_trip_special_credentials() { + let url = connection_url( + "extend@db", + "p@ss/word:#", + "127.0.0.1", + 4000, + "extenddb_data", + ); + let parts = parse_connection_string(&url).expect("generated URL should parse"); + assert_eq!(parts.user, "extend@db"); + assert_eq!(parts.password, "p@ss/word:#"); + assert_eq!(parts.database, "extenddb_data"); + } + + #[test] + fn redaction_uses_the_last_userinfo_separator() { + let redacted = redact_connection_string("mysql://extenddb:p@ss@localhost:4000/db"); + assert_eq!(redacted, "mysql://extenddb:***@localhost:4000/db"); + } +} + +// ── StorageConfig trait implementation 
──────────────────────────────── + +impl extenddb_storage::config::StorageConfig for TidbStorageConfig { + fn connection_config(&self) -> &str { + &self.connection_string + } + + fn max_connections(&self) -> u32 { + self.pool_size + } + + fn max_catalog_connections(&self) -> u32 { + self.catalog_pool_size.unwrap_or(self.pool_size) + } + + fn native_backup_config(&self) -> Option { + Some(extenddb_storage::config::NativeBackupConfig { + binary: self.backup.binary.clone(), + component: self.backup.component.clone(), + coordinator_endpoint: self.backup.pd_endpoint.clone(), + storage_uri: self.backup.storage_uri.clone(), + log_storage_uri: self.backup.log_storage_uri.clone(), + send_credentials_to_storage_nodes: self.backup.send_credentials_to_tikv, + }) + } + + fn clone_box(&self) -> Box { + Box::new(self.clone()) + } +} diff --git a/crates/storage-tidb/src/create_table.rs b/crates/storage-tidb/src/create_table.rs new file mode 100755 index 0000000..e644a8a --- /dev/null +++ b/crates/storage-tidb/src/create_table.rs @@ -0,0 +1,337 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! `create_table` implementation for `TidbEngine`. 
+ +use extenddb_core::types::{ + BillingMode, BillingModeSummary, CreateTableInput, GsiDescription, GsiInput, KeySchemaElement, + LsiDescription, LsiInput, Projection, ProvisionedThroughputDescription, TableDescription, + TableStatus, +}; +use extenddb_storage::error::StorageError; +use extenddb_storage::util::{index_arn, stream_arn, table_arn}; + +use crate::TidbEngine; +use crate::throughput::provisioned_throughput_description; +use crate::tidb_util::is_unique_violation; + +#[derive(Clone, Copy, Eq, PartialEq)] +pub(crate) enum CreateTableActivation { + Standard, + Deferred, +} + +enum SecondaryIndexCreateRef<'a> { + Global(&'a GsiInput), + Local(&'a LsiInput), +} + +impl SecondaryIndexCreateRef<'_> { + fn api_type(&self) -> &'static str { + match self { + Self::Global(_) => "GSI", + Self::Local(_) => "LSI", + } + } + + fn index_name(&self) -> &str { + match self { + Self::Global(index) => &index.index_name, + Self::Local(index) => &index.index_name, + } + } + + fn key_schema(&self) -> &[KeySchemaElement] { + match self { + Self::Global(index) => &index.key_schema, + Self::Local(index) => &index.key_schema, + } + } + + fn projection(&self) -> &Projection { + match self { + Self::Global(index) => &index.projection, + Self::Local(index) => &index.projection, + } + } + + fn provisioned_throughput_description(&self) -> Option { + match self { + Self::Global(index) => index + .provisioned_throughput + .as_ref() + .map(provisioned_throughput_description), + Self::Local(_) => None, + } + } +} + +impl TidbEngine { + pub(crate) async fn create_table_impl( + &self, + account_id: &str, + input: CreateTableInput, + ) -> Result { + self.create_table_impl_with_activation(account_id, input, CreateTableActivation::Standard) + .await + } + + /// Core implementation of `create_table`. 
+ pub(crate) async fn create_table_impl_with_activation( + &self, + account_id: &str, + input: CreateTableInput, + activation: CreateTableActivation, + ) -> Result { + Self::validate_account_id(account_id)?; + let table_id = uuid::Uuid::new_v4().to_string(); + let table_arn = table_arn(&self.region, account_id, &input.table_name); + let billing_mode = input.billing_mode.unwrap_or(BillingMode::Provisioned); + let key_schema_json = serde_json::to_value(&input.key_schema) + .map_err(|e| StorageError::Internal(e.to_string()))?; + let attr_defs_json = serde_json::to_value(&input.attribute_definitions) + .map_err(|e| StorageError::Internal(e.to_string()))?; + let billing_str = match billing_mode { + BillingMode::Provisioned => "PROVISIONED", + BillingMode::PayPerRequest => "PAY_PER_REQUEST", + }; + let pt_json = input + .provisioned_throughput + .as_ref() + .map(serde_json::to_value) + .transpose() + .map_err(|e| StorageError::Internal(e.to_string()))?; + let stream_json = input + .stream_specification + .as_ref() + .map(serde_json::to_value) + .transpose() + .map_err(|e| StorageError::Internal(e.to_string()))?; + let deletion_protection = input.deletion_protection_enabled.unwrap_or(false); + + let mut tx = self + .pool + .begin() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + // Insert catalog metadata as CREATING first. TiDB data DDL is owned by + // the control-plane reconciler, so a crash after this commit leaves a + // durable, retryable transition instead of a half-finished table. 
+ let delay_secs: f64 = sqlx::query_scalar( + "SELECT COALESCE((SELECT CAST(value AS DOUBLE) FROM settings WHERE `key` = 'control_plane_delay_seconds'), 0.25)", + ) + .fetch_one(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + let now = time::OffsetDateTime::now_utc(); + let creation_epoch = + now.unix_timestamp() as f64 + f64::from(now.nanosecond()) / 1_000_000_000.0; + let stream_label = input + .stream_specification + .as_ref() + .is_some_and(|s| s.stream_enabled) + .then(Self::new_stream_label); + + sqlx::query( + r"INSERT INTO tables + (account_id, table_name, key_schema, attribute_definitions, billing_mode, + provisioned_throughput, stream_specification, table_status, + creation_date_time, table_arn, table_id, deletion_protection_enabled, + status_transition_at, stream_label) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP(6), ?, ?, ?, + CASE WHEN ? THEN DATE_ADD(CURRENT_TIMESTAMP(6), INTERVAL ? SECOND) ELSE NULL END, + ?)", + ) + .bind(account_id) + .bind(&input.table_name) + .bind(&key_schema_json) + .bind(&attr_defs_json) + .bind(billing_str) + .bind(&pt_json) + .bind(&stream_json) + .bind("CREATING") + .bind(&table_arn) + .bind(&table_id) + .bind(deletion_protection) + .bind(activation == CreateTableActivation::Standard) + .bind(delay_secs.max(0.0)) + .bind(&stream_label) + .execute(&mut *tx) + .await + .map_err(|e| { + if is_unique_violation(&e) { + StorageError::TableAlreadyExists(input.table_name.clone()) + } else { + StorageError::Internal(e.to_string()) + } + })?; + + // TiDB has one physical secondary-index mechanism. The GSI/LSI split is + // DynamoDB API metadata, not a separate storage path. 
+ let global_indexes = input + .global_secondary_indexes + .iter() + .flatten() + .map(SecondaryIndexCreateRef::Global); + let local_indexes = input + .local_secondary_indexes + .iter() + .flatten() + .map(SecondaryIndexCreateRef::Local); + for index in global_indexes.chain(local_indexes) { + let key_schema_json = serde_json::to_value(index.key_schema()) + .map_err(|e| StorageError::Internal(e.to_string()))?; + let projection_json = serde_json::to_value(index.projection()) + .map_err(|e| StorageError::Internal(e.to_string()))?; + let provisioned_throughput = index.provisioned_throughput_description(); + let provisioned_throughput_json = provisioned_throughput + .as_ref() + .map(serde_json::to_value) + .transpose() + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let index_id = uuid::Uuid::new_v4().to_string(); + sqlx::query( + r"INSERT INTO indexes + (table_id, index_name, index_id, index_type, key_schema, projection, + index_status, provisioned_throughput) + VALUES (?, ?, ?, ?, ?, ?, 'ACTIVE', ?)", + ) + .bind(&table_id) + .bind(index.index_name()) + .bind(&index_id) + .bind(index.api_type()) + .bind(&key_schema_json) + .bind(&projection_json) + .bind(&provisioned_throughput_json) + .execute(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } + + // Insert tags + if let Some(tags) = &input.tags { + for tag in tags { + sqlx::query("INSERT INTO tags (resource_arn, tag_key, tag_value) VALUES (?, ?, ?)") + .bind(&table_arn) + .bind(&tag.key) + .bind(&tag.value) + .execute(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } + } + + tx.commit() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let response_status = TableStatus::Creating; + + // Wake the control plane poller so it processes the CREATING → ACTIVE + // transition without waiting for the idle timeout. + // If the server crashes between commit and notify, the startup recovery + // and defensive sweep recover the transition. 
+ if activation == CreateTableActivation::Standard { + self.control_plane_notify.notify_one(); + } + + // Build response from in-scope data — avoids post-commit read race + // (another request could delete the table between commit and read). + let (rcu, wcu) = input.provisioned_throughput.as_ref().map_or((0, 0), |pt| { + (pt.read_capacity_units, pt.write_capacity_units) + }); + + let gsis = input.global_secondary_indexes.as_ref().map(|gs| { + gs.iter() + .map(|g| GsiDescription { + index_name: g.index_name.clone(), + key_schema: g.key_schema.clone(), + projection: g.projection.clone(), + index_status: "ACTIVE".to_owned(), + provisioned_throughput: Some(ProvisionedThroughputDescription { + read_capacity_units: g + .provisioned_throughput + .as_ref() + .map_or(0, |pt| pt.read_capacity_units), + write_capacity_units: g + .provisioned_throughput + .as_ref() + .map_or(0, |pt| pt.write_capacity_units), + number_of_decreases_today: 0, + last_increase_date_time: None, + last_decrease_date_time: None, + }), + index_size_bytes: 0, + item_count: 0, + index_arn: index_arn( + &self.region, + account_id, + &input.table_name, + &g.index_name, + ), + }) + .collect() + }); + + let lsis = input.local_secondary_indexes.as_ref().map(|ls| { + ls.iter() + .map(|l| LsiDescription { + index_name: l.index_name.clone(), + key_schema: l.key_schema.clone(), + projection: l.projection.clone(), + index_size_bytes: 0, + item_count: 0, + index_arn: index_arn( + &self.region, + account_id, + &input.table_name, + &l.index_name, + ), + }) + .collect() + }); + + let billing_mode_summary = if billing_mode == BillingMode::PayPerRequest { + Some(BillingModeSummary { + billing_mode: BillingMode::PayPerRequest, + last_update_to_pay_per_request_date_time: Some(creation_epoch), + }) + } else { + None + }; + + let latest_stream_arn = stream_label + .as_ref() + .map(|label| stream_arn(&self.region, account_id, &input.table_name, label)); + + Ok(TableDescription { + table_name: input.table_name, + key_schema: 
input.key_schema, + attribute_definitions: input.attribute_definitions, + table_status: response_status, + creation_date_time: creation_epoch, + table_size_bytes: 0, + item_count: 0, + table_arn, + table_id, + provisioned_throughput: ProvisionedThroughputDescription { + read_capacity_units: rcu, + write_capacity_units: wcu, + number_of_decreases_today: 0, + last_increase_date_time: None, + last_decrease_date_time: None, + }, + billing_mode_summary, + global_secondary_indexes: gsis, + local_secondary_indexes: lsis, + stream_specification: input.stream_specification, + latest_stream_arn, + latest_stream_label: stream_label, + deletion_protection_enabled: input.deletion_protection_enabled.unwrap_or(false), + sse_description: None, + table_class_summary: None, + }) + } +} diff --git a/crates/storage-tidb/src/credential_store.rs b/crates/storage-tidb/src/credential_store.rs new file mode 100755 index 0000000..921f951 --- /dev/null +++ b/crates/storage-tidb/src/credential_store.rs @@ -0,0 +1,200 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! Database-backed credential store for SigV4 authentication. +//! +//! Implements `extenddb_auth::CredentialStore` by looking up access keys and +//! session credentials from the catalog database, decrypting secrets with +//! AES-256-GCM. + +use extenddb_auth::{CredentialStore, StoredCredential}; +use extenddb_core::error::DynamoDbError; +use sqlx::MySqlPool; +use zeroize::{Zeroize, ZeroizeOnDrop}; + +/// Decrypt a secret key from `nonce || ciphertext` using the base64-encoded encryption key. +/// +/// `aad` must match the value used during encryption (CB-11). Falls back to +/// decryption without AAD for secrets encrypted before the CB-11 fix. 
///
/// # Errors
///
/// Returns a descriptive message when the input is shorter than a nonce plus
/// an auth tag, when the key is not valid base64 or does not decode to exactly
/// 32 bytes, when decryption fails both with and without AAD, or when the
/// plaintext is not valid UTF-8.
fn decrypt_secret(encrypted: &[u8], key_b64: &str, aad: &str) -> Result<String, String> {
    use aes_gcm::Aes256Gcm;
    use aes_gcm::KeyInit;
    use aes_gcm::aead::Aead;
    use aes_gcm::aead::Payload;
    use base64::Engine;

    // Layout of `encrypted`: 12-byte nonce, then ciphertext ending in a 16-byte tag.
    const NONCE_LEN: usize = 12;
    const TAG_LEN: usize = 16;

    if encrypted.len() < NONCE_LEN + TAG_LEN {
        return Err(
            "ciphertext too short (need at least 12-byte nonce + 16-byte auth tag)".to_owned(),
        );
    }
    let (nonce_bytes, ciphertext) = encrypted.split_at(NONCE_LEN);

    let mut key_bytes = base64::engine::general_purpose::STANDARD
        .decode(key_b64)
        .map_err(|e| format!("decode encryption key: {e}"))?;

    // `Key::from_slice` panics unless the slice is exactly 32 bytes, so a
    // malformed key in the settings table would crash the auth path.
    // `new_from_slice` reports the bad length as an error instead.
    let key_len = key_bytes.len();
    let cipher = Aes256Gcm::new_from_slice(&key_bytes);
    // The raw key is no longer needed once the cipher exists (or failed to build).
    key_bytes.zeroize();
    let cipher =
        cipher.map_err(|_| format!("encryption key must decode to 32 bytes, got {key_len}"))?;

    // `nonce_bytes` is exactly NONCE_LEN long thanks to the split above.
    let nonce = aes_gcm::Nonce::from_slice(nonce_bytes);

    // Try with AAD first (CB-11 format).
    let with_aad = Payload {
        msg: ciphertext,
        aad: aad.as_bytes(),
    };
    let plaintext_bytes = match cipher.decrypt(nonce, with_aad) {
        Ok(bytes) => bytes,
        Err(_) => {
            // Fall back to without AAD (pre-CB-11 format).
            tracing::debug!("Decrypting secret without AAD (pre-CB-11 format) for {aad}");
            cipher
                .decrypt(nonce, ciphertext)
                .map_err(|e| format!("decrypt: {e}"))?
        }
    };

    String::from_utf8(plaintext_bytes)
        .map_err(|e| format!("decrypted secret is not valid UTF-8: {e}"))
}

/// Credential store backed by the catalog TiDB database.
///
/// The `encryption_key` is zeroed from memory on drop.
#[derive(Zeroize, ZeroizeOnDrop)]
pub struct DbCredentialStore {
    #[zeroize(skip)]
    pool: MySqlPool,
    /// Base64-encoded AES-256-GCM encryption key from the settings table.
    encryption_key: String,
}

impl DbCredentialStore {
    /// Create a new credential store.
    ///
    /// `encryption_key` is the base64-encoded 32-byte key from the `settings` table.
+ pub fn new(pool: MySqlPool, encryption_key: String) -> Self { + Self { + pool, + encryption_key, + } + } +} + +#[async_trait::async_trait] +impl CredentialStore for DbCredentialStore { + async fn lookup_credential( + &self, + access_key_id: &str, + ) -> Result, DynamoDbError> { + // Try long-lived access key first (AKIA*). + if access_key_id.starts_with("AKIA") { + return self.lookup_user_credential(access_key_id).await; + } + + // Try session credential (ASIA*). + if access_key_id.starts_with("ASIA") { + return self.lookup_session_credential(access_key_id).await; + } + + // S-4: Normalize error for all unrecognized access key prefixes. + Ok(None) + } +} + +impl DbCredentialStore { + async fn lookup_user_credential( + &self, + access_key_id: &str, + ) -> Result, DynamoDbError> { + let row: Option<(Vec, String, String, bool)> = sqlx::query_as( + "SELECT secret_key_encrypted, account_id, user_name, is_active \ + FROM access_keys WHERE access_key_id = ?", + ) + .bind(access_key_id) + .fetch_optional(&self.pool) + .await + .map_err(|e| { + tracing::error!("Credential lookup failed for access key {access_key_id}: {e}"); + DynamoDbError::InternalServerError("Internal error during authentication".to_owned()) + })?; + + let Some((encrypted, account_id, user_name, is_active)) = row else { + return Ok(None); + }; + + let secret_key = + decrypt_secret(&encrypted, &self.encryption_key, access_key_id).map_err(|e| { + tracing::error!("Secret key decryption failed for access key {access_key_id}: {e}"); + DynamoDbError::InternalServerError( + "Internal error during authentication".to_owned(), + ) + })?; + + Ok(Some(StoredCredential { + secret_key, + account_id, + principal_name: user_name, + session_name: None, + is_session: false, + session_token: None, + is_active, + })) + } + + async fn lookup_session_credential( + &self, + access_key_id: &str, + ) -> Result, DynamoDbError> { + let row: Option<( + Vec, + String, + String, + String, + String, + time::OffsetDateTime, + )> = 
sqlx::query_as( + "SELECT secret_key_encrypted, account_id, role_name, session_name, \ + session_token, expires_at \ + FROM iam_sessions WHERE access_key_id = ?", + ) + .bind(access_key_id) + .fetch_optional(&self.pool) + .await + .map_err(|e| { + tracing::error!("Session credential lookup failed for access key {access_key_id}: {e}"); + DynamoDbError::InternalServerError("Internal error during authentication".to_owned()) + })?; + + let Some((encrypted, account_id, role_name, session_name, session_token, expires_at)) = row + else { + return Ok(None); + }; + + // CB-12: Fail-closed on expired sessions. + if expires_at < time::OffsetDateTime::now_utc() { + return Err(DynamoDbError::ExpiredTokenException( + "The security token included in the request is expired".to_owned(), + )); + } + + let secret_key = + decrypt_secret(&encrypted, &self.encryption_key, access_key_id).map_err(|e| { + tracing::error!( + "Session secret key decryption failed for access key {access_key_id}: {e}" + ); + DynamoDbError::InternalServerError( + "Internal error during authentication".to_owned(), + ) + })?; + + Ok(Some(StoredCredential { + secret_key, + account_id, + principal_name: role_name, + session_name: Some(session_name), + is_session: true, + session_token: Some(session_token), + is_active: true, + })) + } +} diff --git a/crates/storage-tidb/src/data/data_engine.rs b/crates/storage-tidb/src/data/data_engine.rs new file mode 100755 index 0000000..2bd9347 --- /dev/null +++ b/crates/storage-tidb/src/data/data_engine.rs @@ -0,0 +1,384 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! Thin `DataEngine` trait implementation that delegates to `impl TidbEngine` +//! methods in sibling modules. 
+ +use extenddb_core::expression::{Expr, ExpressionMaps, KeyCondition, UpdateAction}; +use extenddb_core::types::{Item, ReturnValuesOnConditionCheckFailure, TableKeyInfo}; +use extenddb_storage::error::StorageError; +use extenddb_storage::{DataEngine, StreamCapture, TransactGetOp, TransactWriteOp}; +use futures::future::BoxFuture; + +use crate::TidbEngine; + +enum OwnedTransactWriteOp { + Put { + key_info: TableKeyInfo, + item: Item, + condition: Option, + maps: ExpressionMaps, + return_values_on_ccf: ReturnValuesOnConditionCheckFailure, + stream: Option, + }, + Delete { + key_info: TableKeyInfo, + key: Item, + condition: Option, + maps: ExpressionMaps, + return_values_on_ccf: ReturnValuesOnConditionCheckFailure, + stream: Option, + }, + Update { + key_info: TableKeyInfo, + key: Item, + actions: Vec, + condition: Option, + maps: ExpressionMaps, + return_values_on_ccf: ReturnValuesOnConditionCheckFailure, + stream: Option, + }, + ConditionCheck { + key_info: TableKeyInfo, + key: Item, + condition: Expr, + maps: ExpressionMaps, + return_values_on_ccf: ReturnValuesOnConditionCheckFailure, + }, +} + +impl OwnedTransactWriteOp { + fn from_borrowed(op: &TransactWriteOp<'_>) -> Self { + match op { + TransactWriteOp::Put { + key_info, + item, + condition, + maps, + return_values_on_ccf, + stream, + } => Self::Put { + key_info: (*key_info).clone(), + item: (*item).clone(), + condition: condition.cloned(), + maps: (*maps).clone(), + return_values_on_ccf: *return_values_on_ccf, + stream: stream.clone(), + }, + TransactWriteOp::Delete { + key_info, + key, + condition, + maps, + return_values_on_ccf, + stream, + } => Self::Delete { + key_info: (*key_info).clone(), + key: (*key).clone(), + condition: condition.cloned(), + maps: (*maps).clone(), + return_values_on_ccf: *return_values_on_ccf, + stream: stream.clone(), + }, + TransactWriteOp::Update { + key_info, + key, + actions, + condition, + maps, + return_values_on_ccf, + stream, + } => Self::Update { + key_info: 
(*key_info).clone(), + key: (*key).clone(), + actions: actions.to_vec(), + condition: condition.cloned(), + maps: (*maps).clone(), + return_values_on_ccf: *return_values_on_ccf, + stream: stream.clone(), + }, + TransactWriteOp::ConditionCheck { + key_info, + key, + condition, + maps, + return_values_on_ccf, + } => Self::ConditionCheck { + key_info: (*key_info).clone(), + key: (*key).clone(), + condition: (*condition).clone(), + maps: (*maps).clone(), + return_values_on_ccf: *return_values_on_ccf, + }, + } + } + + fn as_borrowed(&self) -> TransactWriteOp<'_> { + match self { + Self::Put { + key_info, + item, + condition, + maps, + return_values_on_ccf, + stream, + } => TransactWriteOp::Put { + key_info, + item, + condition: condition.as_ref(), + maps, + return_values_on_ccf: *return_values_on_ccf, + stream: stream.clone(), + }, + Self::Delete { + key_info, + key, + condition, + maps, + return_values_on_ccf, + stream, + } => TransactWriteOp::Delete { + key_info, + key, + condition: condition.as_ref(), + maps, + return_values_on_ccf: *return_values_on_ccf, + stream: stream.clone(), + }, + Self::Update { + key_info, + key, + actions, + condition, + maps, + return_values_on_ccf, + stream, + } => TransactWriteOp::Update { + key_info, + key, + actions, + condition: condition.as_ref(), + maps, + return_values_on_ccf: *return_values_on_ccf, + stream: stream.clone(), + }, + Self::ConditionCheck { + key_info, + key, + condition, + maps, + return_values_on_ccf, + } => TransactWriteOp::ConditionCheck { + key_info, + key, + condition, + maps, + return_values_on_ccf: *return_values_on_ccf, + }, + } + } +} + +impl DataEngine for TidbEngine { + fn put_item( + &self, + key_info: &TableKeyInfo, + item: Item, + return_old: bool, + condition: Option<&Expr>, + maps: &ExpressionMaps, + stream: Option<&StreamCapture>, + ) -> BoxFuture<'_, Result, StorageError>> { + let key_info = key_info.clone(); + let condition = condition.cloned(); + let maps = maps.clone(); + let stream = 
stream.cloned(); + Box::pin(async move { + self.put_item_impl( + &key_info, + item, + return_old, + condition.as_ref(), + &maps, + stream.as_ref(), + ) + .await + }) + } + + fn get_item( + &self, + key_info: &TableKeyInfo, + key: &Item, + ) -> BoxFuture<'_, Result, StorageError>> { + let key_info = key_info.clone(); + let key = key.clone(); + Box::pin(async move { self.get_item_impl(&key_info, &key).await }) + } + + fn delete_item( + &self, + key_info: &TableKeyInfo, + key: &Item, + return_old: bool, + condition: Option<&Expr>, + maps: &ExpressionMaps, + stream: Option<&StreamCapture>, + ) -> BoxFuture<'_, Result, StorageError>> { + let key_info = key_info.clone(); + let key = key.clone(); + let condition = condition.cloned(); + let maps = maps.clone(); + let stream = stream.cloned(); + Box::pin(async move { + self.delete_item_impl( + &key_info, + &key, + return_old, + condition.as_ref(), + &maps, + stream.as_ref(), + ) + .await + }) + } + + fn update_item( + &self, + key_info: &TableKeyInfo, + key: &Item, + actions: &[UpdateAction], + return_old: bool, + return_new: bool, + condition: Option<&Expr>, + maps: &ExpressionMaps, + stream: Option<&StreamCapture>, + ) -> BoxFuture<'_, Result<(Option, Option), StorageError>> { + let key_info = key_info.clone(); + let key = key.clone(); + let actions = actions.to_vec(); + let condition = condition.cloned(); + let maps = maps.clone(); + let stream = stream.cloned(); + Box::pin(async move { + self.update_item_impl( + &key_info, + &key, + &actions, + return_old, + return_new, + condition.as_ref(), + &maps, + stream.as_ref(), + ) + .await + }) + } + + fn query( + &self, + key_info: &TableKeyInfo, + key_condition: &KeyCondition, + maps: &ExpressionMaps, + forward: bool, + limit: Option, + exclusive_start_key: Option<&Item>, + index_name: Option<&str>, + ) -> BoxFuture<'_, Result<(Vec, Option), StorageError>> { + let key_info = key_info.clone(); + let key_condition = key_condition.clone(); + let maps = maps.clone(); + let 
exclusive_start_key = exclusive_start_key.cloned(); + let index_name = index_name.map(|s| s.to_string()); + Box::pin(async move { + self.query_impl( + &key_info, + &key_condition, + &maps, + forward, + limit, + exclusive_start_key.as_ref(), + index_name.as_deref(), + ) + .await + }) + } + + fn scan( + &self, + key_info: &TableKeyInfo, + limit: Option, + exclusive_start_key: Option<&Item>, + segment: Option, + total_segments: Option, + index_name: Option<&str>, + ) -> BoxFuture<'_, Result<(Vec, Option), StorageError>> { + let key_info = key_info.clone(); + let exclusive_start_key = exclusive_start_key.cloned(); + let index_name = index_name.map(|s| s.to_string()); + Box::pin(async move { + self.scan_impl( + &key_info, + limit, + exclusive_start_key.as_ref(), + segment, + total_segments, + index_name.as_deref(), + ) + .await + }) + } + + fn transact_get_items( + &self, + ops: &[TransactGetOp<'_>], + ) -> BoxFuture<'_, Result>, StorageError>> { + // Clone ops to owned data to satisfy lifetime requirements + let owned_ops: Vec<_> = ops + .iter() + .map(|op| (op.key_info.clone(), op.key.clone())) + .collect(); + Box::pin(async move { + // Reconstruct borrowed ops from owned data + let borrowed_ops: Vec = owned_ops + .iter() + .map(|(key_info, key)| TransactGetOp { key_info, key }) + .collect(); + self.transact_get_items_impl(&borrowed_ops).await + }) + } + + fn transact_write_items( + &self, + ops: &[TransactWriteOp<'_>], + token: Option<(&str, &str)>, + ) -> BoxFuture<'_, Result<(), StorageError>> { + let owned_ops: Vec<_> = ops + .iter() + .map(OwnedTransactWriteOp::from_borrowed) + .collect(); + let token = token.map(|(a, b)| (a.to_string(), b.to_string())); + + Box::pin(async move { + let borrowed_ops: Vec = owned_ops + .iter() + .map(OwnedTransactWriteOp::as_borrowed) + .collect(); + self.transact_write_items_impl( + &borrowed_ops, + token.as_ref().map(|(a, b)| (a.as_str(), b.as_str())), + ) + .await + }) + } + + fn cleanup_expired_idempotency_tokens( + &self, + 
max_age_seconds: i64, + ) -> BoxFuture<'_, Result> { + Box::pin(async move { + self.cleanup_expired_idempotency_tokens_impl(max_age_seconds) + .await + }) + } +} diff --git a/crates/storage-tidb/src/data/ddl.rs b/crates/storage-tidb/src/data/ddl.rs new file mode 100755 index 0000000..5847bc7 --- /dev/null +++ b/crates/storage-tidb/src/data/ddl.rs @@ -0,0 +1,406 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! DDL helpers for creating and dropping per-DynamoDB-table data tables in `TiDB`. + +use extenddb_core::types::{ + AttributeDefinition, IndexInfo, IndexType, KeySchemaElement, Projection, StreamSpecification, + TableKeyInfo, +}; +use extenddb_storage::error::StorageError; +use extenddb_storage::util::{sk_column, sk_column_n}; + +use super::index::{create_native_secondary_index, drop_native_secondary_index}; +use super::{all_sort_key_info, data_table_name}; +use crate::TidbEngine; + +/// Row shape returned by the table-info query: (key_schema, attr_defs, status, table_id, stream_spec, has_lsi). 
+type TableInfoRow = ( + serde_json::Value, + serde_json::Value, + String, + String, + Option, + Option, +); + +fn table_accepts_data_plane(status: &str) -> bool { + matches!(status, "ACTIVE" | "UPDATING") +} + +fn data_table_ddl( + table_id: &str, + key_schema: &[KeySchemaElement], + attr_defs: &[AttributeDefinition], +) -> String { + let ddb_table = data_table_name(table_id); + let sk_infos = all_sort_key_info(key_schema, attr_defs); + + if sk_infos.is_empty() { + format!( + r"CREATE TABLE IF NOT EXISTS {ddb_table} ( + pk VARBINARY(2048) NOT NULL PRIMARY KEY CLUSTERED, + item_data JSON NOT NULL + )" + ) + } else if sk_infos.len() == 1 { + let sk_col = sk_column(sk_infos[0].1); + format!( + r"CREATE TABLE IF NOT EXISTS {ddb_table} ( + pk VARBINARY(2048) NOT NULL, + sk_s VARBINARY(1024), + sk_n DECIMAL(65, 30), + sk_b VARBINARY(1024), + item_data JSON NOT NULL, + PRIMARY KEY (pk, {sk_col}) CLUSTERED + )" + ) + } else { + let mut col_defs = vec!["pk VARBINARY(2048) NOT NULL".to_owned()]; + let mut pk_cols = vec!["pk".to_owned()]; + for (i, &(_, sk_type)) in sk_infos.iter().enumerate() { + let col = sk_column_n(i, sk_type); + if i == 0 { + col_defs.push("sk_s VARBINARY(1024)".to_owned()); + col_defs.push("sk_n DECIMAL(65, 30)".to_owned()); + col_defs.push("sk_b VARBINARY(1024)".to_owned()); + } else { + let n = i + 1; + col_defs.push(format!("sk{n}_s VARBINARY(1024)")); + col_defs.push(format!("sk{n}_n DECIMAL(65, 30)")); + col_defs.push(format!("sk{n}_b VARBINARY(1024)")); + } + pk_cols.push(col); + } + col_defs.push("item_data JSON NOT NULL".to_owned()); + format!( + "CREATE TABLE IF NOT EXISTS {ddb_table} (\n {},\n PRIMARY KEY ({}) CLUSTERED\n)", + col_defs.join(",\n "), + pk_cols.join(", ") + ) + } +} + +impl TidbEngine { + /// Create the per-DynamoDB-table data table in `TiDB`. + /// + /// The DDL is dynamically + /// generated based on the key schema — the primary key constraint uses + /// the sort key column matching the sort key's scalar type. 
+ /// + /// # Errors + /// + /// Returns [`StorageError::Internal`] if the DDL execution fails. + /// + /// # Safety (SQL injection) + /// + /// Table names are validated at the engine layer to contain only `[a-zA-Z0-9_.-]`. + /// Column names are compile-time constants. No user input is interpolated + /// into the DDL beyond the validated table name. + pub(crate) async fn create_data_table( + pool: &sqlx::MySqlPool, + table_id: &str, + key_schema: &[KeySchemaElement], + attr_defs: &[AttributeDefinition], + ) -> Result<(), StorageError> { + let ddl = data_table_ddl(table_id, key_schema, attr_defs); + + sqlx::query(&ddl) + .execute(pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + Ok(()) + } + + /// Drop the per-DynamoDB-table data table. + /// + /// Called when a table deletion transition completes. + /// + /// # Errors + /// + /// Returns [`StorageError::Internal`] if the DDL execution fails. + pub(crate) async fn drop_data_table( + pool: &sqlx::MySqlPool, + table_id: &str, + ) -> Result<(), StorageError> { + let ddb_table = data_table_name(table_id); + let ddl = format!("DROP TABLE IF EXISTS {ddb_table}"); + sqlx::query(&ddl) + .execute(pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + Ok(()) + } + + /// Create a DynamoDB secondary index as native TiDB generated columns plus + /// one native secondary index. + /// + /// TiDB has no separate local-index physical path; GSI versus LSI is + /// DynamoDB API/catalog metadata. + // S2: Parameters mirror the SQL schema dimensions (account, table, index, + // key schemas, attribute defs). A wrapper struct would obscure the call + // site without adding clarity. 
+ #[allow(clippy::too_many_arguments)] + pub(crate) async fn create_index_artifacts( + pool: &sqlx::MySqlPool, + table_id: &str, + index_id: &str, + index_key_schema: &[KeySchemaElement], + attr_defs: &[AttributeDefinition], + base_key_schema: &[KeySchemaElement], + base_attr_defs: &[AttributeDefinition], + ) -> Result<(), StorageError> { + create_native_secondary_index( + pool, + table_id, + index_id, + index_key_schema, + attr_defs, + base_key_schema, + base_attr_defs, + ) + .await + } + + /// Drop a native TiDB secondary index and its generated key columns. + pub(crate) async fn drop_index_artifacts( + pool: &sqlx::MySqlPool, + table_id: &str, + index_id: &str, + index_key_schema: &[KeySchemaElement], + attr_defs: &[AttributeDefinition], + ) -> Result<(), StorageError> { + drop_native_secondary_index(pool, table_id, index_id, index_key_schema, attr_defs).await + } + + /// Fetch key schema and attribute definitions for a table from the catalog. + /// + /// Uses a single query that combines the table row with an LSI API-metadata + /// existence subquery for ItemCollectionMetrics. TiDB secondary indexes use + /// the same native physical path for GSI and LSI definitions. + /// + /// # Errors + /// + /// Returns [`StorageError::TableNotFound`] if the table doesn't exist. + /// Returns [`StorageError::TableNotActive`] if the table cannot serve data-plane requests. + /// Returns [`StorageError::Internal`] on query or deserialization failure. + pub(crate) async fn fetch_table_key_info( + &self, + account_id: &str, + table_name: &str, + ) -> Result { + let row: Option = sqlx::query_as( + "SELECT key_schema, attribute_definitions, table_status, table_id, \ + stream_specification, \ + EXISTS(SELECT 1 FROM indexes WHERE table_id = tables.table_id AND index_type = 'LSI') AS has_lsi \ + FROM tables \ + WHERE account_id = ? 
AND table_name = ?", + ) + .bind(account_id) + .bind(table_name) + .fetch_optional(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let (ks_json, ad_json, status, table_id, stream_spec_json, has_lsi) = + row.ok_or_else(|| StorageError::TableNotFound(table_name.to_owned()))?; + + if !table_accepts_data_plane(&status) { + return Err(StorageError::TableNotActive(table_name.to_owned())); + } + + let key_schema: Vec = + serde_json::from_value(ks_json).map_err(|e| StorageError::Internal(e.to_string()))?; + let attribute_definitions: Vec = + serde_json::from_value(ad_json).map_err(|e| StorageError::Internal(e.to_string()))?; + + let stream_specification: Option = stream_spec_json + .map(serde_json::from_value) + .transpose() + .map_err(|e| StorageError::Internal(e.to_string()))?; + + Ok(TableKeyInfo { + table_name: table_name.to_owned(), + account_id: account_id.to_owned(), + table_id, + key_schema, + attribute_definitions, + has_lsi: has_lsi.unwrap_or(false), + stream_specification, + }) + } + + /// Fetch metadata for a secondary index from the catalog. + /// + /// This variant looks up `table_id` from the tables catalog. Prefer + /// `fetch_index_info_by_table_id` when `TableKeyInfo` is already available + /// (P118 optimization #4). + pub(crate) async fn fetch_index_info( + &self, + account_id: &str, + table_name: &str, + index_name: &str, + ) -> Result { + // First get the table_id and verify the table can serve data-plane reads. + let row: Option<(String, String)> = sqlx::query_as( + "SELECT table_id, table_status FROM tables WHERE account_id = ? 
AND table_name = ?", + ) + .bind(account_id) + .bind(table_name) + .fetch_optional(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let (table_id, status) = + row.ok_or_else(|| StorageError::TableNotFound(table_name.to_owned()))?; + + if !table_accepts_data_plane(&status) { + return Err(StorageError::TableNotActive(table_name.to_owned())); + } + + self.fetch_index_info_by_table_id(&table_id, index_name) + .await + } + + /// Fetch metadata for a secondary index using a known `table_id`. + /// + /// Saves one catalog roundtrip vs `fetch_index_info` when the caller + /// already has `TableKeyInfo` (P118 optimization #4). + pub(crate) async fn fetch_index_info_by_table_id( + &self, + table_id: &str, + index_name: &str, + ) -> Result { + let idx_row: Option<(String, String, serde_json::Value, serde_json::Value)> = + sqlx::query_as( + "SELECT index_type, index_id, key_schema, projection \ + FROM indexes \ + WHERE table_id = ? AND index_name = ? AND index_status = 'ACTIVE'", + ) + .bind(table_id) + .bind(index_name) + .fetch_optional(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let (idx_type_str, idx_id, ks_json, proj_json) = + idx_row.ok_or_else(|| StorageError::IndexNotFound(index_name.to_owned()))?; + + let index_type = match idx_type_str.as_str() { + "GSI" => IndexType::Gsi, + "LSI" => IndexType::Lsi, + other => { + return Err(StorageError::Internal(format!( + "unknown index type in database: {other}" + ))); + } + }; + + let key_schema: Vec = + serde_json::from_value(ks_json).map_err(|e| StorageError::Internal(e.to_string()))?; + let projection: Projection = + serde_json::from_value(proj_json).map_err(|e| StorageError::Internal(e.to_string()))?; + + Ok(IndexInfo { + index_name: index_name.to_owned(), + index_id: idx_id, + index_type, + key_schema, + projection, + }) + } + + pub(crate) async fn fetch_base_key_schema_by_table_id( + &self, + table_id: &str, + ) -> Result<(Vec, Vec), StorageError> { + let 
row: Option<(serde_json::Value, serde_json::Value, String)> = sqlx::query_as( + "SELECT key_schema, attribute_definitions, table_status \ + FROM tables WHERE table_id = ?", + ) + .bind(table_id) + .fetch_optional(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let (ks_json, ad_json, status) = + row.ok_or_else(|| StorageError::TableNotFound(table_id.to_owned()))?; + if !table_accepts_data_plane(&status) { + return Err(StorageError::TableNotActive(table_id.to_owned())); + } + + let key_schema = + serde_json::from_value(ks_json).map_err(|e| StorageError::Internal(e.to_string()))?; + let attr_defs = + serde_json::from_value(ad_json).map_err(|e| StorageError::Internal(e.to_string()))?; + Ok((key_schema, attr_defs)) + } +} + +#[cfg(test)] +mod tests { + use extenddb_core::types::{ + AttributeDefinition, KeySchemaElement, KeyType, ScalarAttributeType, + }; + + use super::data_table_ddl; + + fn attr(name: &str, ty: ScalarAttributeType) -> AttributeDefinition { + AttributeDefinition { + attribute_name: name.to_owned(), + attribute_type: ty, + } + } + + fn key(name: &str, key_type: KeyType) -> KeySchemaElement { + KeySchemaElement { + attribute_name: name.to_owned(), + key_type, + } + } + + #[test] + fn hash_only_tables_use_explicit_clustered_primary_key() { + let ddl = data_table_ddl( + "tableid", + &[key("pk", KeyType::Hash)], + &[attr("pk", ScalarAttributeType::S)], + ); + + assert!(ddl.contains("PRIMARY KEY CLUSTERED")); + } + + #[test] + fn range_key_tables_use_explicit_clustered_primary_key() { + let ddl = data_table_ddl( + "tableid", + &[key("pk", KeyType::Hash), key("sk", KeyType::Range)], + &[ + attr("pk", ScalarAttributeType::S), + attr("sk", ScalarAttributeType::S), + ], + ); + + assert!(ddl.contains("PRIMARY KEY (pk, sk_s) CLUSTERED")); + } + + #[test] + fn multipart_range_key_tables_use_explicit_clustered_primary_key() { + let ddl = data_table_ddl( + "tableid", + &[ + key("pk", KeyType::Hash), + key("sk", KeyType::Range), + 
key("sk2", KeyType::Range), + ], + &[ + attr("pk", ScalarAttributeType::S), + attr("sk", ScalarAttributeType::S), + attr("sk2", ScalarAttributeType::N), + ], + ); + + assert!(ddl.contains("PRIMARY KEY (pk, sk_s, sk2_n) CLUSTERED")); + } +} diff --git a/crates/storage-tidb/src/data/delete_item.rs b/crates/storage-tidb/src/data/delete_item.rs new file mode 100755 index 0000000..f026371 --- /dev/null +++ b/crates/storage-tidb/src/data/delete_item.rs @@ -0,0 +1,244 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! `delete_item` implementation for the `TiDB` backend. + +use extenddb_core::expression::{Expr, ExpressionMaps}; +use extenddb_core::types::{Item, TableKeyInfo}; +use extenddb_storage::StreamCapture; +use extenddb_storage::error::StorageError; +use extenddb_storage::util::{SortKeyValue, parse_sk, pk_to_text, sk_column, sk_info}; + +use super::query::check_condition; +use super::tx_helpers::write_stream_record_in_tx; +use super::{data_table_name, json_to_item}; +use crate::TidbEngine; + +impl TidbEngine { + /// Implementation of `DataEngine::delete_item`. 
+ pub(crate) async fn delete_item_impl( + &self, + key_info: &TableKeyInfo, + key: &Item, + return_old: bool, + condition: Option<&Expr>, + maps: &ExpressionMaps, + stream: Option<&StreamCapture>, + ) -> Result, StorageError> { + let ddb_table = data_table_name(&key_info.table_id); + + let pk_name = &key_info.key_schema[0].attribute_name; + let pk_value = key + .get(pk_name) + .ok_or_else(|| StorageError::Internal("missing partition key".to_owned()))?; + let pk_text = pk_to_text(pk_value)?; + + let needs_tx = condition.is_some() || return_old || stream.is_some(); + + if let Some((sk_name, sk_type)) = + sk_info(&key_info.key_schema, &key_info.attribute_definitions) + { + let sk_value = key + .get(sk_name) + .ok_or_else(|| StorageError::Internal("missing sort key".to_owned()))?; + let sk = parse_sk(sk_value, sk_type)?; + let sk_col = sk_column(sk_type); + + if needs_tx { + let select_sql = format!( + "SELECT item_data FROM {ddb_table} WHERE pk = ? AND {sk_col} = ? FOR UPDATE" + ); + let delete_sql = format!("DELETE FROM {ddb_table} WHERE pk = ? 
AND {sk_col} = ?"); + + let mut tx = self + .data_pool + .begin() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let old: Option<(serde_json::Value,)> = + bind_sk_fetch_optional!(&select_sql, pk_text.as_ref(), &sk, &mut *tx)?; + + if let Some((ref old_json,)) = old { + let old_item: Item = json_to_item(old_json.clone())?; + match check_condition(condition, &old_item, maps) { + Ok(()) => {} + Err(StorageError::ConditionFailed(_)) => { + return Err(StorageError::ConditionFailed(Some(old_item))); + } + Err(e) => return Err(e), + } + } else { + // No existing item — condition checks against empty item + let empty = std::collections::BTreeMap::new(); + match check_condition(condition, &empty, maps) { + Ok(()) => {} + Err(StorageError::ConditionFailed(_)) => { + return Err(StorageError::ConditionFailed(None)); + } + Err(e) => return Err(e), + } + // Nothing to delete + return Ok(None); + } + + // Delete the row + match &sk { + SortKeyValue::S(s) => { + sqlx::query(&delete_sql) + .bind(pk_text.as_ref()) + .bind(s.as_bytes().to_vec()) + .execute(&mut *tx) + .await + } + SortKeyValue::N(n) => { + sqlx::query(&delete_sql) + .bind(pk_text.as_ref()) + .bind(n) + .execute(&mut *tx) + .await + } + SortKeyValue::B(b) => { + sqlx::query(&delete_sql) + .bind(pk_text.as_ref()) + .bind(b) + .execute(&mut *tx) + .await + } + } + .map_err(|e| StorageError::Internal(e.to_string()))?; + + // Write stream record atomically within the transaction. + if let Some(capture) = stream { + let old_for_stream = old + .as_ref() + .map(|(v,)| json_to_item(v.clone())) + .transpose()?; + write_stream_record_in_tx( + &mut tx, + key_info, + capture, + old_for_stream.as_ref(), + None, + ) + .await?; + } + tx.commit() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + if return_old { + old.map(|(v,)| json_to_item(v)).transpose() + } else { + Ok(None) + } + } else { + let delete_sql = format!("DELETE FROM {ddb_table} WHERE pk = ? 
AND {sk_col} = ?"); + match &sk { + SortKeyValue::S(s) => { + sqlx::query(&delete_sql) + .bind(pk_text.as_ref()) + .bind(s.as_bytes().to_vec()) + .execute(&self.data_pool) + .await + } + SortKeyValue::N(n) => { + sqlx::query(&delete_sql) + .bind(pk_text.as_ref()) + .bind(n) + .execute(&self.data_pool) + .await + } + SortKeyValue::B(b) => { + sqlx::query(&delete_sql) + .bind(pk_text.as_ref()) + .bind(b) + .execute(&self.data_pool) + .await + } + } + .map_err(|e| StorageError::Internal(e.to_string()))?; + Ok(None) + } + } else { + // PK-only table + if needs_tx { + let select_sql = + format!("SELECT item_data FROM {ddb_table} WHERE pk = ? FOR UPDATE"); + let delete_sql = format!("DELETE FROM {ddb_table} WHERE pk = ?"); + + let mut tx = self + .data_pool + .begin() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let old: Option<(serde_json::Value,)> = sqlx::query_as(&select_sql) + .bind(pk_text.as_ref()) + .fetch_optional(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + if let Some((ref old_json,)) = old { + let old_item: Item = json_to_item(old_json.clone())?; + match check_condition(condition, &old_item, maps) { + Ok(()) => {} + Err(StorageError::ConditionFailed(_)) => { + return Err(StorageError::ConditionFailed(Some(old_item))); + } + Err(e) => return Err(e), + } + } else { + let empty = std::collections::BTreeMap::new(); + match check_condition(condition, &empty, maps) { + Ok(()) => {} + Err(StorageError::ConditionFailed(_)) => { + return Err(StorageError::ConditionFailed(None)); + } + Err(e) => return Err(e), + } + return Ok(None); + } + + sqlx::query(&delete_sql) + .bind(pk_text.as_ref()) + .execute(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + // Write stream record atomically within the transaction. 
+ if let Some(capture) = stream { + let old_for_stream = old + .as_ref() + .map(|(v,)| json_to_item(v.clone())) + .transpose()?; + write_stream_record_in_tx( + &mut tx, + key_info, + capture, + old_for_stream.as_ref(), + None, + ) + .await?; + } + tx.commit() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + if return_old { + old.map(|(v,)| json_to_item(v)).transpose() + } else { + Ok(None) + } + } else { + let delete_sql = format!("DELETE FROM {ddb_table} WHERE pk = ?"); + sqlx::query(&delete_sql) + .bind(pk_text.as_ref()) + .execute(&self.data_pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + Ok(None) + } + } + } +} diff --git a/crates/storage-tidb/src/data/index.rs b/crates/storage-tidb/src/data/index.rs new file mode 100644 index 0000000..37bb2ce --- /dev/null +++ b/crates/storage-tidb/src/data/index.rs @@ -0,0 +1,434 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! Native secondary-index helpers for the `TiDB` backend. +//! +//! `TiDB` owns secondary-index maintenance. ExtendDB stores every item once in +//! the base `_ddb_*` table, exposes DynamoDB index keys as generated columns, +//! and creates a native TiDB secondary index over those generated columns plus +//! the base table key for stable pagination. + +use extenddb_core::types::{ + AttributeDefinition, AttributeValue, Item, KeySchemaElement, KeyType, ScalarAttributeType, +}; +use extenddb_storage::error::StorageError; +use extenddb_storage::util::sk_column_n; + +use super::{all_sort_key_info, data_table_name}; + +pub(crate) struct WriteIndexKeys { + key_schema: Vec, +} + +struct GeneratedColumn { + name: String, + ddl_type: &'static str, + expression: String, +} + +/// Fetch secondary-index key schemas that must be validated on writes. +/// +/// CREATING indexes are included because TiDB's online ADD INDEX backfill will +/// observe existing base rows. 
Letting malformed index-key attributes into the +/// base table during that window would create permanently sparse index entries. +pub(crate) async fn fetch_write_index_key_schemas( + table_id: &str, + pool: &sqlx::MySqlPool, +) -> Result, StorageError> { + let rows: Vec<(serde_json::Value,)> = sqlx::query_as( + "SELECT key_schema FROM indexes \ + WHERE table_id = ? AND index_status IN ('ACTIVE', 'CREATING')", + ) + .bind(table_id) + .fetch_all(pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + rows.into_iter() + .map(|(ks_json,)| { + let key_schema = serde_json::from_value(ks_json) + .map_err(|e| StorageError::Internal(e.to_string()))?; + Ok(WriteIndexKeys { key_schema }) + }) + .collect() +} + +pub(crate) fn validate_item_index_key_types( + item: &Item, + indexes: &[WriteIndexKeys], + attr_defs: &[AttributeDefinition], +) -> Result<(), StorageError> { + for index in indexes { + for key in &index.key_schema { + let Some(value) = item.get(&key.attribute_name) else { + continue; + }; + let expected = attr_defs + .iter() + .find(|ad| ad.attribute_name == key.attribute_name) + .map(|ad| ad.attribute_type) + .ok_or_else(|| { + StorageError::Internal(format!( + "missing attribute definition for index key {}", + key.attribute_name + )) + })?; + if !attribute_value_matches_type(value, expected) { + return Err(StorageError::Validation(format!( + "One or more parameter values were invalid: Type mismatch for key attribute {}: expected: {}", + key.attribute_name, + scalar_type_name(expected) + ))); + } + } + } + Ok(()) +} + +pub(crate) async fn create_native_secondary_index( + pool: &sqlx::MySqlPool, + table_id: &str, + index_id: &str, + index_key_schema: &[KeySchemaElement], + attr_defs: &[AttributeDefinition], + base_key_schema: &[KeySchemaElement], + base_attr_defs: &[AttributeDefinition], +) -> Result<(), StorageError> { + let table = data_table_name(table_id); + for column in generated_columns(index_id, index_key_schema, attr_defs)? 
{ + let ddl = format!( + "ALTER TABLE {table} ADD COLUMN IF NOT EXISTS `{}` {} AS ({}) VIRTUAL", + column.name, column.ddl_type, column.expression + ); + sqlx::query(&ddl) + .execute(pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } + + let index_name = native_index_name(index_id); + let index_columns = native_index_physical_columns( + index_id, + index_key_schema, + attr_defs, + base_key_schema, + base_attr_defs, + ); + let ddl = format!( + "CREATE INDEX IF NOT EXISTS `{index_name}` ON {table} ({})", + index_columns.join(", ") + ); + sqlx::query(&ddl) + .execute(pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + Ok(()) +} + +pub(crate) async fn drop_native_secondary_index( + pool: &sqlx::MySqlPool, + table_id: &str, + index_id: &str, + index_key_schema: &[KeySchemaElement], + attr_defs: &[AttributeDefinition], +) -> Result<(), StorageError> { + let table = data_table_name(table_id); + let index_name = native_index_name(index_id); + let drop_index = format!("DROP INDEX IF EXISTS `{index_name}` ON {table}"); + sqlx::query(&drop_index) + .execute(pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + for column in native_index_key_tuple_columns(index_id, index_key_schema, attr_defs) { + let ddl = format!("ALTER TABLE {table} DROP COLUMN IF EXISTS `{column}`"); + sqlx::query(&ddl) + .execute(pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } + + Ok(()) +} + +pub(crate) fn native_index_hash_column(index_id: &str) -> String { + format!("{}_pk", native_index_prefix(index_id)) +} + +pub(crate) fn native_index_sort_columns( + index_id: &str, + key_schema: &[KeySchemaElement], + attr_defs: &[AttributeDefinition], +) -> Vec { + all_sort_key_info(key_schema, attr_defs) + .into_iter() + .enumerate() + .map(|(i, (_, sk_type))| { + format!( + "{}_{}", + native_index_prefix(index_id), + sk_column_n(i, sk_type) + ) + }) + .collect() +} + +pub(crate) fn native_index_key_tuple_columns( + 
index_id: &str, + key_schema: &[KeySchemaElement], + attr_defs: &[AttributeDefinition], +) -> Vec { + let mut columns = vec![native_index_hash_column(index_id)]; + columns.extend(native_index_sort_columns(index_id, key_schema, attr_defs)); + columns +} + +pub(crate) fn native_index_non_null_predicates( + index_id: &str, + key_schema: &[KeySchemaElement], + attr_defs: &[AttributeDefinition], +) -> Vec { + native_index_key_tuple_columns(index_id, key_schema, attr_defs) + .into_iter() + .map(|column| format!("{column} IS NOT NULL")) + .collect() +} + +fn native_index_physical_columns( + index_id: &str, + index_key_schema: &[KeySchemaElement], + attr_defs: &[AttributeDefinition], + base_key_schema: &[KeySchemaElement], + base_attr_defs: &[AttributeDefinition], +) -> Vec { + let mut columns = native_index_key_tuple_columns(index_id, index_key_schema, attr_defs); + columns.push("pk".to_owned()); + columns.extend( + all_sort_key_info(base_key_schema, base_attr_defs) + .into_iter() + .enumerate() + .map(|(i, (_, sk_type))| sk_column_n(i, sk_type)), + ); + columns +} + +fn generated_columns( + index_id: &str, + key_schema: &[KeySchemaElement], + attr_defs: &[AttributeDefinition], +) -> Result, StorageError> { + let mut columns = vec![GeneratedColumn { + name: native_index_hash_column(index_id), + ddl_type: "VARBINARY(2048)", + expression: hash_key_expression(key_schema, attr_defs)?, + }]; + + for (i, (attr_name, attr_type)) in all_sort_key_info(key_schema, attr_defs) + .into_iter() + .enumerate() + { + columns.push(GeneratedColumn { + name: format!( + "{}_{}", + native_index_prefix(index_id), + sk_column_n(i, attr_type) + ), + ddl_type: generated_sort_column_type(attr_type), + expression: sort_key_expression(attr_name, attr_type), + }); + } + + Ok(columns) +} + +fn hash_key_expression( + key_schema: &[KeySchemaElement], + attr_defs: &[AttributeDefinition], +) -> Result { + let hash_parts = key_schema + .iter() + .filter(|ks| ks.key_type == KeyType::Hash) + .map(|ks| { + let 
attr_type = attribute_type(&ks.attribute_name, attr_defs)?; + Ok(key_scalar_expression(&ks.attribute_name, attr_type)) + }) + .collect::, StorageError>>()?; + + if hash_parts.len() == 1 { + return Ok(format!("CAST({} AS BINARY)", hash_parts[0])); + } + + let mut concat_parts = Vec::with_capacity(hash_parts.len() * 4); + for part in hash_parts { + concat_parts.push(format!("OCTET_LENGTH({part})")); + concat_parts.push("':'".to_owned()); + concat_parts.push(part); + concat_parts.push("','".to_owned()); + } + Ok(format!("CONCAT({})", concat_parts.join(", "))) +} + +fn sort_key_expression(attr_name: &str, attr_type: ScalarAttributeType) -> String { + let scalar = key_scalar_expression(attr_name, attr_type); + match attr_type { + ScalarAttributeType::S => format!("CAST({scalar} AS BINARY)"), + ScalarAttributeType::N => format!("CAST({scalar} AS DECIMAL(65, 30))"), + ScalarAttributeType::B => format!("FROM_BASE64({scalar})"), + } +} + +fn key_scalar_expression(attr_name: &str, attr_type: ScalarAttributeType) -> String { + format!( + "JSON_UNQUOTE(JSON_EXTRACT(item_data, {}))", + sql_string_literal(&json_attribute_type_path(attr_name, attr_type)) + ) +} + +fn generated_sort_column_type(attr_type: ScalarAttributeType) -> &'static str { + match attr_type { + ScalarAttributeType::S | ScalarAttributeType::B => "VARBINARY(1024)", + ScalarAttributeType::N => "DECIMAL(65, 30)", + } +} + +fn attribute_type( + attr_name: &str, + attr_defs: &[AttributeDefinition], +) -> Result { + attr_defs + .iter() + .find(|ad| ad.attribute_name == attr_name) + .map(|ad| ad.attribute_type) + .ok_or_else(|| { + StorageError::Internal(format!( + "missing attribute definition for index key {attr_name}" + )) + }) +} + +fn attribute_value_matches_type(value: &AttributeValue, attr_type: ScalarAttributeType) -> bool { + matches!( + (attr_type, value), + (ScalarAttributeType::S, AttributeValue::S(_)) + | (ScalarAttributeType::N, AttributeValue::N(_)) + | (ScalarAttributeType::B, AttributeValue::B(_)) + ) 
/// Prefix shared by all generated columns of a native index.
///
/// The result is `edbidx_` followed by the ASCII alphanumeric characters of
/// `index_id`; every other character (for example the dashes of a UUID) is dropped.
fn native_index_prefix(index_id: &str) -> String {
    let mut prefix = String::from("edbidx_");
    prefix.extend(index_id.chars().filter(|ch| ch.is_ascii_alphanumeric()));
    prefix
}

/// Name of the native index itself.
///
/// The result is `idx_` followed by the ASCII alphanumeric characters of `index_id`.
fn native_index_name(index_id: &str) -> String {
    let mut name = String::from("idx_");
    name.extend(index_id.chars().filter(|ch| ch.is_ascii_alphanumeric()));
    name
}
native_index_key_tuple_columns("idx-1", &ks, &attrs), + vec!["edbidx_idx1_pk".to_owned(), "edbidx_idx1_sk_n".to_owned()] + ); + } + + #[test] + fn multipart_hash_expression_matches_netstring_shape() { + let ks = vec![ + KeySchemaElement { + attribute_name: "a".to_owned(), + key_type: KeyType::Hash, + }, + KeySchemaElement { + attribute_name: "b".to_owned(), + key_type: KeyType::Hash, + }, + ]; + let attrs = vec![ + AttributeDefinition { + attribute_name: "a".to_owned(), + attribute_type: ScalarAttributeType::S, + }, + AttributeDefinition { + attribute_name: "b".to_owned(), + attribute_type: ScalarAttributeType::B, + }, + ]; + + let expr = hash_key_expression(&ks, &attrs).expect("hash expression"); + assert!(expr.starts_with("CONCAT(")); + assert!(expr.contains("OCTET_LENGTH(JSON_UNQUOTE(JSON_EXTRACT")); + assert!(expr.contains("$.\"a\".\"S\"")); + assert!(expr.contains("$.\"b\".\"B\"")); + } +} diff --git a/crates/storage-tidb/src/data/mod.rs b/crates/storage-tidb/src/data/mod.rs new file mode 100755 index 0000000..6f360eb --- /dev/null +++ b/crates/storage-tidb/src/data/mod.rs @@ -0,0 +1,158 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! Per-DynamoDB-table DDL and item CRUD for the `TiDB` backend. +//! +//! Each Virtual `DynamoDB` table maps to a `TiDB` table named `_ddb_`. +//! Partition keys are stored as bytes. Sort keys use typed columns (`sk_s`, `sk_n`, `sk_b`) +//! for correct ordering. The full item is stored as JSON in `item_data`. + +use extenddb_core::types::{AttributeDefinition, Item, KeySchemaElement, ScalarAttributeType}; +use extenddb_storage::error::StorageError; + +/// SQL table name for a Virtual `DynamoDB` table. +/// +/// Uses `_ddb_` prefix to avoid collisions with catalog metadata tables. +/// Includes `account_id` for multi-account isolation (Phase 12a). +/// Table names are validated at the engine layer (alphanumeric + `_.-`), +/// so this is safe for identifier construction. 
/// Backtick-quoted SQL identifier of the TiDB table backing a Virtual `DynamoDB` table.
///
/// The physical name is `_ddb_<table_id>` (see [`physical_data_table_name`]); the
/// `_ddb_` prefix avoids collisions with catalog metadata tables.
///
/// Any backtick inside the name is doubled, so the result is always one well-formed
/// quoted identifier and never relies solely on upstream validation of `table_id`.
///
/// NOTE(review): earlier documentation said the name includes an `account_id`; this
/// function only receives `table_id` — confirm that account isolation is carried by
/// the id itself.
pub(crate) fn data_table_name(table_id: &str) -> String {
    format!(
        "`{}`",
        physical_data_table_name(table_id).replace('`', "``")
    )
}

/// Raw (unquoted) TiDB table name for a Virtual `DynamoDB` table: `_ddb_<table_id>`.
pub(crate) fn physical_data_table_name(table_id: &str) -> String {
    format!("_ddb_{table_id}")
}
/// Bind the item JSON, the partition key and a `SortKeyValue` — in that order — to a
/// sqlx statement and execute it.
///
/// Intended for statements whose placeholders are ordered `item_data, pk, sort key`
/// (e.g. `UPDATE ... SET item_data = ? WHERE pk = ? AND <sk_col> = ?`). This differs
/// from `bind_sk_execute!`, which binds `pk, sort key, item_data`.
///
/// String sort keys are bound as raw bytes (`s.as_bytes().to_vec()`); numeric and
/// binary sort keys are bound as-is. Any sqlx error is converted into
/// `StorageError::Internal` carrying the error text. The expansion contains `.await`,
/// so the macro can only be used inside an async context.
macro_rules! bind_sk_update_execute {
    ($sql:expr, $item_json:expr, $pk:expr, $sk:expr, $executor:expr) => {
        // One arm per sort-key variant: each variant binds a different Rust type, so
        // the query has to be built separately in every arm.
        match $sk {
            extenddb_storage::util::SortKeyValue::S(s) => {
                sqlx::query($sql)
                    .bind($item_json)
                    .bind($pk)
                    .bind(s.as_bytes().to_vec())
                    .execute($executor)
                    .await
            }
            extenddb_storage::util::SortKeyValue::N(n) => {
                sqlx::query($sql)
                    .bind($item_json)
                    .bind($pk)
                    .bind(n)
                    .execute($executor)
                    .await
            }
            extenddb_storage::util::SortKeyValue::B(b) => {
                sqlx::query($sql)
                    .bind($item_json)
                    .bind($pk)
                    .bind(b)
                    .execute($executor)
                    .await
            }
        }
        .map_err(|e| extenddb_storage::error::StorageError::Internal(e.to_string()))
    };
}
`put_item` and `get_item` implementations for the `TiDB` backend. + +use extenddb_core::expression::{Expr, ExpressionMaps}; +use extenddb_core::types::{Item, TableKeyInfo}; +use extenddb_storage::StreamCapture; +use extenddb_storage::error::StorageError; +use extenddb_storage::util::{composite_pk_to_text, parse_sk, pk_to_text, sk_column, sk_info}; + +use super::index::{fetch_write_index_key_schemas, validate_item_index_key_types}; +use super::query::check_condition; +use super::tx_helpers::write_stream_record_in_tx; +use super::{data_table_name, json_to_item}; +use crate::TidbEngine; + +impl TidbEngine { + /// Implementation of `DataEngine::put_item`. + pub(crate) async fn put_item_impl( + &self, + key_info: &TableKeyInfo, + item: Item, + return_old: bool, + condition: Option<&Expr>, + maps: &ExpressionMaps, + stream: Option<&StreamCapture>, + ) -> Result, StorageError> { + let ddb_table = data_table_name(&key_info.table_id); + + let pk_text = composite_pk_to_text(&item, &key_info.key_schema)?; + + let item_json = + serde_json::to_value(&item).map_err(|e| StorageError::Internal(e.to_string()))?; + + let index_keys = fetch_write_index_key_schemas(&key_info.table_id, &self.pool).await?; + validate_item_index_key_types(&item, &index_keys, &key_info.attribute_definitions)?; + + // When there's a condition, return_old, or stream capture, we need a transaction. + let needs_tx = condition.is_some() || return_old || stream.is_some(); + + if let Some((sk_name, sk_type)) = + sk_info(&key_info.key_schema, &key_info.attribute_definitions) + { + let sk_value = item + .get(sk_name) + .ok_or_else(|| StorageError::Internal("missing sort key".to_owned()))?; + let sk = parse_sk(sk_value, sk_type)?; + let sk_col = sk_column(sk_type); + + if needs_tx { + let select_sql = format!( + "SELECT item_data FROM {ddb_table} WHERE pk = ? AND {sk_col} = ? 
FOR UPDATE" + ); + + let mut tx = self + .data_pool + .begin() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let old: Option<(serde_json::Value,)> = + bind_sk_fetch_optional!(&select_sql, pk_text.as_str(), &sk, &mut *tx)?; + + if let Some((ref old_json,)) = old { + let old_item: Item = json_to_item(old_json.clone())?; + match check_condition(condition, &old_item, maps) { + Ok(()) => {} + Err(StorageError::ConditionFailed(_)) => { + return Err(StorageError::ConditionFailed(Some(old_item))); + } + Err(e) => return Err(e), + } + // Row exists, condition passed — update in place. + let update_sql = format!( + "UPDATE {ddb_table} SET item_data = ? WHERE pk = ? AND {sk_col} = ?" + ); + bind_sk_update_execute!( + &update_sql, + &item_json, + pk_text.as_str(), + &sk, + &mut *tx + )?; + } else { + // No existing item — condition checks against empty item + let empty = std::collections::BTreeMap::new(); + match check_condition(condition, &empty, maps) { + Ok(()) => {} + Err(StorageError::ConditionFailed(_)) => { + return Err(StorageError::ConditionFailed(None)); + } + Err(e) => return Err(e), + } + // Condition passed against empty — atomic insert, fail if someone beat us. + let insert_sql = format!( + "INSERT INTO {ddb_table} (pk, {sk_col}, item_data) VALUES (?, ?, ?) \ + ON DUPLICATE KEY UPDATE pk = pk" + ); + let result = + bind_sk_execute!(&insert_sql, pk_text.as_str(), &sk, &item_json, &mut *tx)?; + if result.rows_affected() == 0 { + // Another transaction inserted between our SELECT and INSERT. + // Fetch the winner to return with ConditionFailed. + let winner: Option<(serde_json::Value,)> = + bind_sk_fetch_optional!(&select_sql, pk_text.as_str(), &sk, &mut *tx)?; + let winner_item = winner.map(|(v,)| json_to_item(v)).transpose()?; + return Err(StorageError::ConditionFailed(winner_item)); + } + } + + // Write stream record atomically within the transaction. 
+ if let Some(capture) = stream { + let old_for_stream = old + .as_ref() + .map(|(v,)| json_to_item(v.clone())) + .transpose()?; + write_stream_record_in_tx( + &mut tx, + key_info, + capture, + old_for_stream.as_ref(), + Some(&item), + ) + .await?; + } + tx.commit() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + if return_old { + old.map(|(v,)| json_to_item(v)).transpose() + } else { + Ok(None) + } + } else { + let upsert_sql = format!( + "INSERT INTO {ddb_table} (pk, {sk_col}, item_data) VALUES (?, ?, ?) \ + ON DUPLICATE KEY UPDATE item_data = VALUES(item_data)" + ); + bind_sk_execute!( + &upsert_sql, + pk_text.as_str(), + &sk, + &item_json, + &self.data_pool + )?; + Ok(None) + } + } else { + // No sort key — PK-only table + if needs_tx { + let select_sql = + format!("SELECT item_data FROM {ddb_table} WHERE pk = ? FOR UPDATE"); + + let mut tx = self + .data_pool + .begin() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let old: Option<(serde_json::Value,)> = sqlx::query_as(&select_sql) + .bind(pk_text.as_str()) + .fetch_optional(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + if let Some((ref old_json,)) = old { + let old_item: Item = json_to_item(old_json.clone())?; + match check_condition(condition, &old_item, maps) { + Ok(()) => {} + Err(StorageError::ConditionFailed(_)) => { + return Err(StorageError::ConditionFailed(Some(old_item))); + } + Err(e) => return Err(e), + } + // Row exists, condition passed — update in place. + let update_sql = format!("UPDATE {ddb_table} SET item_data = ? 
WHERE pk = ?"); + sqlx::query(&update_sql) + .bind(&item_json) + .bind(pk_text.as_str()) + .execute(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } else { + let empty = std::collections::BTreeMap::new(); + match check_condition(condition, &empty, maps) { + Ok(()) => {} + Err(StorageError::ConditionFailed(_)) => { + return Err(StorageError::ConditionFailed(None)); + } + Err(e) => return Err(e), + } + // Condition passed against empty — atomic insert, fail if someone beat us. + let insert_sql = format!( + "INSERT INTO {ddb_table} (pk, item_data) VALUES (?, ?) \ + ON DUPLICATE KEY UPDATE pk = pk" + ); + let result = sqlx::query(&insert_sql) + .bind(pk_text.as_str()) + .bind(&item_json) + .execute(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + if result.rows_affected() == 0 { + // Another transaction inserted between our SELECT and INSERT. + let winner: Option<(serde_json::Value,)> = sqlx::query_as(&select_sql) + .bind(pk_text.as_str()) + .fetch_optional(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + let winner_item = winner.map(|(v,)| json_to_item(v)).transpose()?; + return Err(StorageError::ConditionFailed(winner_item)); + } + } + + // Write stream record atomically within the transaction. + if let Some(capture) = stream { + let old_for_stream = old + .as_ref() + .map(|(v,)| json_to_item(v.clone())) + .transpose()?; + write_stream_record_in_tx( + &mut tx, + key_info, + capture, + old_for_stream.as_ref(), + Some(&item), + ) + .await?; + } + tx.commit() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + if return_old { + old.map(|(v,)| json_to_item(v)).transpose() + } else { + Ok(None) + } + } else { + let upsert_sql = format!( + "INSERT INTO {ddb_table} (pk, item_data) VALUES (?, ?) 
\ + ON DUPLICATE KEY UPDATE item_data = VALUES(item_data)" + ); + sqlx::query(&upsert_sql) + .bind(pk_text.as_str()) + .bind(&item_json) + .execute(&self.data_pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + Ok(None) + } + } + } + + /// Implementation of `DataEngine::get_item`. + pub(crate) async fn get_item_impl( + &self, + key_info: &TableKeyInfo, + key: &Item, + ) -> Result, StorageError> { + let ddb_table = data_table_name(&key_info.table_id); + + let pk_name = &key_info.key_schema[0].attribute_name; + let pk_value = key + .get(pk_name) + .ok_or_else(|| StorageError::Internal("missing partition key".to_owned()))?; + let pk_text = pk_to_text(pk_value)?; + + let json_opt = if let Some((sk_name, sk_type)) = + sk_info(&key_info.key_schema, &key_info.attribute_definitions) + { + let sk_value = key + .get(sk_name) + .ok_or_else(|| StorageError::Internal("missing sort key".to_owned()))?; + let sk = parse_sk(sk_value, sk_type)?; + let sk_col = sk_column(sk_type); + let sql = format!("SELECT item_data FROM {ddb_table} WHERE pk = ? AND {sk_col} = ?"); + let row: Option<(serde_json::Value,)> = + bind_sk_fetch_optional!(&sql, pk_text.as_ref(), &sk, &self.data_pool)?; + row.map(|(v,)| v) + } else { + let sql = format!("SELECT item_data FROM {ddb_table} WHERE pk = ?"); + let row: Option<(serde_json::Value,)> = sqlx::query_as(&sql) + .bind(pk_text.as_ref()) + .fetch_optional(&self.data_pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + row.map(|(v,)| v) + }; + + json_opt.map(json_to_item).transpose() + } +} diff --git a/crates/storage-tidb/src/data/query.rs b/crates/storage-tidb/src/data/query.rs new file mode 100755 index 0000000..dc56a4d --- /dev/null +++ b/crates/storage-tidb/src/data/query.rs @@ -0,0 +1,321 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! Query and scan SQL helpers for the `TiDB` backend. +//! +//! 
Contains condition evaluation, sort-key SQL generation, and dynamic +//! parameter binding for `Query` and `Scan` operations. + +use extenddb_core::expression::{self, Expr, ExpressionMaps, KeyCondition, SortKeyCondition}; +use extenddb_core::types::{ + AttributeDefinition, AttributeValue, Item, KeySchemaElement, ScalarAttributeType, extract_key, +}; +use extenddb_storage::error::StorageError; +use extenddb_storage::util::SortKeyValue; +use extenddb_storage::util::{composite_pk_to_text, parse_sk}; + +use super::all_sort_key_info; + +type JsonRowsQuery<'q> = + sqlx::query::QueryAs<'q, sqlx::MySql, (serde_json::Value,), sqlx::mysql::MySqlArguments>; + +/// Evaluate a condition expression against an item inside a transaction. +/// +/// Returns `Ok(())` if the condition passes or is `None`. +/// Returns `Err(StorageError::ConditionFailed)` if the condition fails. +pub(crate) fn check_condition( + condition: Option<&Expr>, + item: &std::collections::BTreeMap, + maps: &ExpressionMaps, +) -> Result<(), StorageError> { + if let Some(cond) = condition { + let passed = expression::evaluate_condition(cond, item, maps) + .map_err(|e| StorageError::Validation(e.to_string()))?; + if !passed { + return Err(StorageError::ConditionFailed(None)); + } + } + Ok(()) +} + +/// Resolve an expression (placeholder) to an `AttributeValue`. +pub(crate) fn resolve_expr_to_av( + expr: &expression::Expr, + maps: &ExpressionMaps, +) -> Result { + match expr { + expression::Expr::Placeholder(name) => maps + .resolve_value(name) + .cloned() + .map_err(|e| StorageError::Validation(e.to_string())), + _ => Err(StorageError::Internal( + "expected placeholder in key condition".to_owned(), + )), + } +} + +/// SQL fragment for a sort key condition. 
/// Smallest byte string that sorts after every string starting with `prefix`.
///
/// The last byte that is not `0xff` is incremented and everything after it is
/// dropped, giving the exclusive upper bound of the half-open range
/// `[prefix, upper)`. Returns `None` when no such bound exists: `prefix` is empty or
/// consists solely of `0xff` bytes.
fn next_prefix_bytes(prefix: &[u8]) -> Option<Vec<u8>> {
    // Position of the right-most byte that can still be incremented.
    let pivot = prefix.iter().rposition(|&byte| byte != u8::MAX)?;

    let mut upper = prefix[..=pivot].to_vec();
    upper[pivot] += 1;
    Some(upper)
}
} => { + let (_lower, upper) = begins_with_prefix_bounds(prefix, sk_type, maps)?; + let frag = if upper.is_some() { + *param_idx += 2; + format!(" AND {sk_col} >= ? AND {sk_col} < ?") + } else { + *param_idx += 1; + format!(" AND {sk_col} >= ?") + }; + Ok(SkSqlInfo { fragment: frag }) + } + } +} + +/// Execute a query SQL statement with dynamic parameter binding. +#[allow(clippy::too_many_arguments)] +pub(crate) async fn execute_query_sql( + sql: &str, + pk_text: &str, + key_condition: &KeyCondition, + maps: &ExpressionMaps, + key_schema: &[KeySchemaElement], + attr_defs: &[AttributeDefinition], + sk_info: Option<(&str, ScalarAttributeType)>, + extra_sk_col_indices: &[(usize, ScalarAttributeType)], + exclusive_start_key: Option<&Item>, + base_table_key: Option<(&[KeySchemaElement], &[AttributeDefinition])>, + pool: &sqlx::MySqlPool, +) -> Result, StorageError> { + let mut query = sqlx::query_as::<_, (serde_json::Value,)>(sql); + query = query.bind(pk_text.to_owned()); + + // Bind sort key condition values + if let (Some(sk_cond), Some((_, sk_type))) = (&key_condition.sk_condition, sk_info) { + query = bind_sk_condition(query, sk_cond, sk_type, maps)?; + } + + // Bind extra RANGE key equality values + for (i, &(_pos, sk_type)) in extra_sk_col_indices.iter().enumerate() { + if let Some((_, value)) = key_condition.extra_sk_conditions.get(i) { + let av = resolve_expr_to_av(value, maps)?; + let sk = parse_sk(&av, sk_type)?; + query = bind_sk_value(query, &sk); + } + } + + if let Some(start_key) = exclusive_start_key { + query = bind_sort_key_tuple(query, start_key, key_schema, attr_defs)?; + if let Some((base_key_schema, base_attr_defs)) = base_table_key { + query = bind_key_tuple(query, start_key, base_key_schema, base_attr_defs)?; + } + } + + let rows: Vec<(serde_json::Value,)> = query + .fetch_all(pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + Ok(rows.into_iter().map(|(v,)| v).collect()) +} + +/// Bind sort key condition values to a query. 
+fn bind_sk_condition<'q>( + query: JsonRowsQuery<'q>, + sk_cond: &SortKeyCondition, + sk_type: ScalarAttributeType, + maps: &ExpressionMaps, +) -> Result, StorageError> { + match sk_cond { + SortKeyCondition::Compare { value, .. } => { + let av = resolve_expr_to_av(value, maps)?; + let sk = parse_sk(&av, sk_type)?; + Ok(bind_sk_value(query, &sk)) + } + SortKeyCondition::BeginsWith { prefix: value, .. } => { + let (lower, upper) = begins_with_prefix_bounds(value, sk_type, maps)?; + let q = query.bind(lower); + Ok(match upper { + Some(upper) => q.bind(upper), + None => q, + }) + } + SortKeyCondition::Between { low, high, .. } => { + let lo_av = resolve_expr_to_av(low, maps)?; + let hi_av = resolve_expr_to_av(high, maps)?; + let lo_sk = parse_sk(&lo_av, sk_type)?; + let hi_sk = parse_sk(&hi_av, sk_type)?; + let q = bind_sk_value(query, &lo_sk); + Ok(bind_sk_value(q, &hi_sk)) + } + } +} + +/// Bind a single `SortKeyValue` to a query. +pub(crate) fn bind_sk_value<'q>(query: JsonRowsQuery<'q>, sk: &SortKeyValue) -> JsonRowsQuery<'q> { + match sk { + SortKeyValue::S(s) => query.bind(s.as_bytes().to_vec()), + SortKeyValue::N(n) => query.bind(n.clone()), + SortKeyValue::B(b) => query.bind(b.clone()), + } +} + +pub(crate) fn bind_key_tuple<'q>( + query: JsonRowsQuery<'q>, + key: &Item, + key_schema: &[KeySchemaElement], + attr_defs: &[AttributeDefinition], +) -> Result, StorageError> { + let mut query = query.bind(composite_pk_to_text(key, key_schema)?); + query = bind_sort_key_tuple(query, key, key_schema, attr_defs)?; + Ok(query) +} + +fn bind_sort_key_tuple<'q>( + mut query: JsonRowsQuery<'q>, + key: &Item, + key_schema: &[KeySchemaElement], + attr_defs: &[AttributeDefinition], +) -> Result, StorageError> { + for (sk_name, sk_type) in all_sort_key_info(key_schema, attr_defs) { + let sk_val = key.get(sk_name).ok_or_else(|| { + StorageError::Internal(format!("missing sort key in start key: {sk_name}")) + })?; + let sk = parse_sk(sk_val, sk_type)?; + query = 
bind_sk_value(query, &sk); + } + Ok(query) +} + +/// Execute a scan SQL statement with dynamic parameter binding. +pub(crate) async fn execute_scan_sql( + sql: &str, + exclusive_start_key: Option<&Item>, + key_schema: &[KeySchemaElement], + attr_defs: &[AttributeDefinition], + base_table_key: Option<(&[KeySchemaElement], &[AttributeDefinition])>, + pool: &sqlx::MySqlPool, +) -> Result, StorageError> { + let mut query = sqlx::query_as::<_, (serde_json::Value,)>(sql); + + if let Some(start_key) = exclusive_start_key { + query = bind_key_tuple(query, start_key, key_schema, attr_defs)?; + if let Some((base_key_schema, base_attr_defs)) = base_table_key { + query = bind_key_tuple(query, start_key, base_key_schema, base_attr_defs)?; + } + } + + let rows: Vec<(serde_json::Value,)> = query + .fetch_all(pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + Ok(rows.into_iter().map(|(v,)| v).collect()) +} + +/// Build a `LastEvaluatedKey` from an item by extracting key attributes. 
/// Build a `LastEvaluatedKey` for `item`.
///
/// Thin wrapper that delegates to `extract_key(item, key_schema)`; presumably the
/// result holds only the attributes named in `key_schema` — confirm against
/// `extract_key`.
pub(crate) fn build_key(item: &Item, key_schema: &[KeySchemaElement]) -> Item {
    extract_key(item, key_schema)
}
/// Render a comparison of `columns` against the same number of `?` placeholders.
///
/// A single column yields `col <op> ?`; several columns yield a row-value
/// comparison `(a, b) <op> (?, ?)`. `columns` must not be empty (checked in debug
/// builds only).
fn tuple_comparison(columns: &[String], op: &str) -> String {
    debug_assert!(!columns.is_empty());
    match columns {
        [single] => format!("{single} {op} ?"),
        _ => {
            let column_list = columns.join(", ");
            let placeholders = vec!["?"; columns.len()].join(", ");
            format!("({column_list}) {op} ({placeholders})")
        }
    }
}
+ #[allow(clippy::too_many_arguments)] + pub(crate) async fn query_impl( + &self, + key_info: &TableKeyInfo, + key_condition: &KeyCondition, + maps: &ExpressionMaps, + forward: bool, + limit: Option, + exclusive_start_key: Option<&Item>, + index_name: Option<&str>, + ) -> Result<(Vec, Option), StorageError> { + use std::fmt::Write; + + let index_info = if let Some(idx_name) = index_name { + Some( + self.fetch_index_info_by_table_id(&key_info.table_id, idx_name) + .await?, + ) + } else { + None + }; + + let base_table_key = if index_info.is_some() { + Some( + self.fetch_base_key_schema_by_table_id(&key_info.table_id) + .await?, + ) + } else { + None + }; + + let ddb_table = data_table_name(&key_info.table_id); + let pk_column = index_info.as_ref().map_or_else( + || "pk".to_owned(), + |idx| native_index_hash_column(&idx.index_id), + ); + let sort_columns = index_info.as_ref().map_or_else( + || sort_key_columns(&key_info.key_schema, &key_info.attribute_definitions), + |idx| { + native_index_sort_columns( + &idx.index_id, + &key_info.key_schema, + &key_info.attribute_definitions, + ) + }, + ); + + // Resolve partition key value(s) — for multi-part keys, encode + // all HASH attribute values into a single composite PK text using + // netstring encoding (matching the write path in composite_pk_to_text). 
+ let pk_text = if key_condition.extra_pk_conditions.is_empty() { + let pk_expr_val = resolve_expr_to_av(&key_condition.pk_value, maps)?; + pk_to_text(&pk_expr_val)?.into_owned() + } else { + let mut parts = Vec::with_capacity(1 + key_condition.extra_pk_conditions.len()); + let first_val = resolve_expr_to_av(&key_condition.pk_value, maps)?; + parts.push(pk_to_text(&first_val)?.into_owned()); + for (_, value) in &key_condition.extra_pk_conditions { + let val = resolve_expr_to_av(value, maps)?; + parts.push(pk_to_text(&val)?.into_owned()); + } + encode_netstring_composite(&parts) + }; + + let sk_info_val = sk_info(&key_info.key_schema, &key_info.attribute_definitions); + let all_sks = all_sort_key_info(&key_info.key_schema, &key_info.attribute_definitions); + + // Build SQL query + let mut sql = format!("SELECT item_data FROM {ddb_table} WHERE {pk_column} = ?"); + let mut param_idx: u32 = 2; + + if let Some(idx) = &index_info { + for predicate in native_index_non_null_predicates( + &idx.index_id, + &key_info.key_schema, + &key_info.attribute_definitions, + ) { + sql.push_str(" AND "); + sql.push_str(&predicate); + } + } + + // Sort key condition SQL fragment (first RANGE key). + let sk_sql_info = if let (Some(sk_cond), Some((_, sk_type))) = + (&key_condition.sk_condition, sk_info_val) + { + let sk_col = index_info + .as_ref() + .and_then(|_| sort_columns.first().cloned()) + .unwrap_or_else(|| sk_column(sk_type).to_owned()); + Some(build_sk_sql( + sk_cond, + &sk_col, + sk_type, + maps, + &mut param_idx, + )?) + } else { + None + }; + + if let Some(ref info) = sk_sql_info { + sql.push_str(&info.fragment); + } + + // Extra RANGE key equality conditions (multi-RANGE key schemas). + // Each extra SK condition is an equality on an additional RANGE attribute. 
+ let mut extra_sk_col_indices: Vec<(usize, ScalarAttributeType)> = Vec::new(); + for (path, _value) in &key_condition.extra_sk_conditions { + let attr_name = match path.first() { + Some(extenddb_core::expression::PathElement::Attribute(name)) => { + if let Some(ref_name) = name.strip_prefix('#') { + match maps.names.get(ref_name) { + Some(resolved) => resolved.clone(), + None => { + tracing::warn!(name_ref = %ref_name, "unresolved expression attribute name in extra SK condition, skipping"); + continue; + } + } + } else { + name.clone() + } + } + _ => continue, + }; + // Find which RANGE key index this attribute corresponds to + if let Some(pos) = all_sks + .iter() + .position(|(sk_name, _)| *sk_name == attr_name) + { + // Skip index 0 — that's the primary SK handled above + if pos > 0 { + let (_, sk_type) = all_sks[pos]; + let col = index_info + .as_ref() + .and_then(|_| sort_columns.get(pos).cloned()) + .unwrap_or_else(|| sk_column_n(pos, sk_type)); + let _ = write!(sql, " AND {col} = ?"); + param_idx += 1; + extra_sk_col_indices.push((pos, sk_type)); + } + } + } + + let cursor_columns = if let Some((base_key_schema, base_attr_defs)) = &base_table_key { + let mut columns = sort_columns.clone(); + columns.extend(key_tuple_columns(base_key_schema, base_attr_defs)); + columns + } else { + sort_columns.clone() + }; + + if exclusive_start_key.is_some() { + if cursor_columns.is_empty() { + return Ok((Vec::new(), None)); + } + let op = if forward { ">" } else { "<" }; + let _ = write!(sql, " AND {}", tuple_comparison(&cursor_columns, op)); + } + + push_order_by(&mut sql, &cursor_columns, forward); + + // LIMIT — fetch one extra to detect pagination + let fetch_limit = limit.map_or(1_000_001, |l| l + 1); + let _ = write!(sql, " LIMIT {fetch_limit}"); + + // Execute with dynamic bindings + let rows = execute_query_sql( + &sql, + &pk_text, + key_condition, + maps, + &key_info.key_schema, + &key_info.attribute_definitions, + sk_info_val, + &extra_sk_col_indices, + 
exclusive_start_key, + base_table_key + .as_ref() + .map(|(key_schema, attr_defs)| (key_schema.as_slice(), attr_defs.as_slice())), + &self.data_pool, + ) + .await?; + + #[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)] + let actual_limit = limit.map_or(1_000_000_usize, |l| l.max(0) as usize); + let has_more = rows.len() > actual_limit; + let items: Vec = rows + .into_iter() + .take(actual_limit) + .map(json_to_item) + .collect::, _>>()?; + + let last_key = if has_more { + items + .last() + .map(|item| build_key(item, &key_info.key_schema)) + } else { + None + }; + + Ok((items, last_key)) + } + + /// Implementation of `DataEngine::scan`. + pub(crate) async fn scan_impl( + &self, + key_info: &TableKeyInfo, + limit: Option, + exclusive_start_key: Option<&Item>, + segment: Option, + total_segments: Option, + index_name: Option<&str>, + ) -> Result<(Vec, Option), StorageError> { + use std::fmt::Write; + + let index_info = if let Some(idx_name) = index_name { + Some( + self.fetch_index_info_by_table_id(&key_info.table_id, idx_name) + .await?, + ) + } else { + None + }; + + let base_table_key = if index_info.is_some() { + Some( + self.fetch_base_key_schema_by_table_id(&key_info.table_id) + .await?, + ) + } else { + None + }; + + let ddb_table = data_table_name(&key_info.table_id); + + let mut sql = format!("SELECT item_data FROM {ddb_table}"); + let mut conditions: Vec = Vec::new(); + if let Some(idx) = &index_info { + conditions.extend(native_index_non_null_predicates( + &idx.index_id, + &key_info.key_schema, + &key_info.attribute_definitions, + )); + } + // Parallel scan: hash-based segment assignment. 
+ if let (Some(seg), Some(total)) = (segment, total_segments) { + let segment_column = index_info.as_ref().map_or_else( + || "pk".to_owned(), + |idx| native_index_hash_column(&idx.index_id), + ); + conditions.push(format!("CRC32({segment_column}) % {total} = {seg}")); + } + + let order_columns = if let Some((base_key_schema, base_attr_defs)) = &base_table_key { + let idx = index_info + .as_ref() + .ok_or_else(|| StorageError::Internal("missing index metadata".to_owned()))?; + let mut columns = native_index_key_tuple_columns( + &idx.index_id, + &key_info.key_schema, + &key_info.attribute_definitions, + ); + columns.extend(key_tuple_columns(base_key_schema, base_attr_defs)); + columns + } else { + key_tuple_columns(&key_info.key_schema, &key_info.attribute_definitions) + }; + + if let Some(start_key) = exclusive_start_key { + let pk_name = &key_info.key_schema[0].attribute_name; + if !start_key.contains_key(pk_name) { + return Err(StorageError::Validation( + "The provided starting key is invalid: The provided key element does not match the schema".to_owned(), + )); + } + conditions.push(tuple_comparison(&order_columns, ">")); + } + + if !conditions.is_empty() { + sql.push_str(" WHERE "); + sql.push_str(&conditions.join(" AND ")); + } + + let _ = write!(sql, " ORDER BY {}", order_columns.join(", ")); + + let fetch_limit = limit.map_or(1_000_001, |l| l + 1); + let _ = write!(sql, " LIMIT {fetch_limit}"); + + // Execute + let rows = execute_scan_sql( + &sql, + exclusive_start_key, + &key_info.key_schema, + &key_info.attribute_definitions, + base_table_key + .as_ref() + .map(|(key_schema, attr_defs)| (key_schema.as_slice(), attr_defs.as_slice())), + &self.data_pool, + ) + .await?; + + #[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)] + let actual_limit = limit.map_or(1_000_000_usize, |l| l.max(0) as usize); + let has_more = rows.len() > actual_limit; + let items: Vec = rows + .into_iter() + .take(actual_limit) + .map(json_to_item) + .collect::, _>>()?; + 
+ let last_key = if has_more { + items + .last() + .map(|item| build_key(item, &key_info.key_schema)) + } else { + None + }; + + Ok((items, last_key)) + } +} + +#[cfg(test)] +mod tests { + use super::tuple_comparison; + + #[test] + fn tuple_comparison_uses_all_cursor_columns() { + let cols = vec![ + "sk_s".to_owned(), + "base_pk".to_owned(), + "base_sk_n".to_owned(), + ]; + assert_eq!( + tuple_comparison(&cols, ">"), + "(sk_s, base_pk, base_sk_n) > (?, ?, ?)" + ); + } + + #[test] + fn tuple_comparison_keeps_single_column_syntax_simple() { + let cols = vec!["base_pk".to_owned()]; + assert_eq!(tuple_comparison(&cols, "<"), "base_pk < ?"); + } +} diff --git a/crates/storage-tidb/src/data/transactions.rs b/crates/storage-tidb/src/data/transactions.rs new file mode 100755 index 0000000..6230738 --- /dev/null +++ b/crates/storage-tidb/src/data/transactions.rs @@ -0,0 +1,399 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! Transactional read/write implementations for the `TiDB` backend. + +use std::collections::HashMap; + +use extenddb_core::expression::{self, ExpressionMaps}; +use extenddb_core::types::{ + AttributeValue, CancellationReason, Item, ReturnValuesOnConditionCheckFailure, +}; +use extenddb_core::validation; +use extenddb_storage::error::StorageError; +use extenddb_storage::{TransactGetOp, TransactWriteOp}; + +use super::index::{WriteIndexKeys, fetch_write_index_key_schemas, validate_item_index_key_types}; +use super::tx_helpers::{ + check_idempotency_token_in_tx, delete_item_in_tx, fetch_item_for_update, fetch_item_in_tx, + upsert_item_in_tx, write_stream_record_in_tx, +}; +use crate::TidbEngine; + +impl TidbEngine { + /// Implementation of `DataEngine::transact_get_items`. 
+ pub(crate) async fn transact_get_items_impl( + &self, + ops: &[TransactGetOp<'_>], + ) -> Result>, StorageError> { + // Validate key types inside the transaction so mismatches produce + // TransactionCanceledException with ValidationError cancellation + // reasons, matching real DynamoDB behavior. + let mut reasons: Vec = Vec::with_capacity(ops.len()); + let mut any_failed = false; + for op in ops { + match validation::validate_key_only( + op.key, + &op.key_info.key_schema, + &op.key_info.attribute_definitions, + ) { + Ok(()) => reasons.push(CancellationReason::none()), + Err(e) => { + any_failed = true; + reasons.push(CancellationReason::validation_error(e.to_string())); + } + } + } + if any_failed { + return Err(StorageError::TransactionCanceled(reasons)); + } + + let mut tx = self + .data_pool + .begin() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let mut results = Vec::with_capacity(ops.len()); + for op in ops { + let item = fetch_item_in_tx(&mut tx, op.key_info, op.key).await?; + results.push(item); + } + + tx.commit() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + Ok(results) + } + + /// Implementation of `DataEngine::transact_write_items`. + pub(crate) async fn transact_write_items_impl( + &self, + ops: &[TransactWriteOp<'_>], + token: Option<(&str, &str)>, + ) -> Result<(), StorageError> { + // Pre-fetch secondary-index key schemas for each unique table involved in the transaction. + let mut table_indexes: HashMap> = HashMap::new(); + for op in ops { + let name = transact_op_table_name(op); + if !table_indexes.contains_key(name) { + let tid = transact_op_table_id(op); + let indexes = fetch_write_index_key_schemas(tid, &self.pool).await?; + table_indexes.insert(name.to_owned(), indexes); + } + } + + let mut tx = self + .data_pool + .begin() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + // Check idempotency token within the transaction (BLOCKER #2 fix). 
+ if let Some((tok, fp)) = token { + check_idempotency_token_in_tx(&mut tx, tok, fp).await?; + } + + let mut reasons: Vec = Vec::with_capacity(ops.len()); + // Collect old/new items from each op for stream records. + let mut op_items: Vec<(Option, Option)> = Vec::with_capacity(ops.len()); + let mut any_failed = false; + + for op in ops { + let indexes = &table_indexes[transact_op_table_name(op)]; + let reason = + execute_transact_write_op(&mut tx, op, indexes, self.max_item_size_bytes).await; + match reason { + Ok(items) => { + op_items.push(items); + reasons.push(CancellationReason::none()); + } + Err(TxnOpError::Cancel(r)) => { + op_items.push((None, None)); + any_failed = true; + reasons.push(r); + } + Err(TxnOpError::Storage(e)) => { + // Infrastructure error — abort the entire transaction + // without leaking internal details into cancellation reasons. + return Err(StorageError::Internal(e.to_string())); + } + } + } + + if any_failed { + return Err(StorageError::TransactionCanceled(reasons)); + } + + // Write stream records atomically within the transaction (BLOCKER #1 fix). + for (op, (old_item, new_item)) in ops.iter().zip(op_items.iter()) { + let capture = match op { + TransactWriteOp::Put { stream, .. } + | TransactWriteOp::Delete { stream, .. } + | TransactWriteOp::Update { stream, .. } => stream.as_ref(), + TransactWriteOp::ConditionCheck { .. } => None, + }; + if let Some(capture) = capture { + write_stream_record_in_tx( + &mut tx, + match op { + TransactWriteOp::Put { key_info, .. } + | TransactWriteOp::Delete { key_info, .. } + | TransactWriteOp::Update { key_info, .. } + | TransactWriteOp::ConditionCheck { key_info, .. } => key_info, + }, + capture, + old_item.as_ref(), + new_item.as_ref(), + ) + .await?; + } + } + + tx.commit() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + Ok(()) + } + + /// Implementation of `DataEngine::cleanup_expired_idempotency_tokens`. 
+ pub(crate) async fn cleanup_expired_idempotency_tokens_impl( + &self, + _max_age_seconds: i64, + ) -> Result { + // TiDB native TTL owns background retention for this table. The + // transaction write path still handles same-token expiry so client + // idempotency semantics do not depend on TTL job timing. + Ok(0) + } +} + +/// Extract the table name from a transactional write operation. +fn transact_op_table_name<'a>(op: &'a TransactWriteOp<'_>) -> &'a str { + match op { + TransactWriteOp::Put { key_info, .. } + | TransactWriteOp::Delete { key_info, .. } + | TransactWriteOp::Update { key_info, .. } + | TransactWriteOp::ConditionCheck { key_info, .. } => &key_info.table_name, + } +} + +/// Extract the table_id from a transactional write operation. +fn transact_op_table_id<'a>(op: &'a TransactWriteOp<'_>) -> &'a str { + match op { + TransactWriteOp::Put { key_info, .. } + | TransactWriteOp::Delete { key_info, .. } + | TransactWriteOp::Update { key_info, .. } + | TransactWriteOp::ConditionCheck { key_info, .. } => &key_info.table_id, + } +} + +/// Error type for individual transactional write operations. +/// +/// Separates user-driven cancellations (condition failures, validation errors) +/// from infrastructure errors (PG connection failures, serialization errors). +/// This prevents internal error details from leaking into client-visible +/// cancellation reasons (BLOCKER #3 fix). +enum TxnOpError { + /// User-driven failure — becomes a per-item cancellation reason. + Cancel(CancellationReason), + /// Infrastructure failure — bubbles up as `StorageError::Internal`. + Storage(StorageError), +} + +impl From for TxnOpError { + fn from(r: CancellationReason) -> Self { + Self::Cancel(r) + } +} + +/// Execute a single transactional write operation, including native index-key validation. +/// Returns `(old_item, new_item)` on success for stream capture. 
+async fn execute_transact_write_op( + tx: &mut sqlx::Transaction<'_, sqlx::MySql>, + op: &TransactWriteOp<'_>, + indexes: &[WriteIndexKeys], + max_item_size_bytes: usize, +) -> Result<(Option, Option), TxnOpError> { + match op { + TransactWriteOp::Put { + key_info, + item, + condition, + maps, + return_values_on_ccf, + .. + } => { + // Key type validation inside the transaction so mismatches produce + // TransactionCanceledException with ValidationError cancellation + // reasons, matching real DynamoDB behavior. + validation::validate_item_keys( + item, + &key_info.key_schema, + &key_info.attribute_definitions, + ) + .map_err(|e| TxnOpError::Cancel(CancellationReason::validation_error(e.to_string())))?; + validate_txn_index_key_types(item, indexes, &key_info.attribute_definitions)?; + let existing = fetch_item_for_update(tx, key_info, item) + .await + .map_err(TxnOpError::Storage)?; + let empty = Item::new(); + eval_condition( + *condition, + existing.as_ref().unwrap_or(&empty), + maps, + *return_values_on_ccf, + existing.as_ref(), + )?; + upsert_item_in_tx(tx, key_info, item) + .await + .map_err(TxnOpError::Storage)?; + Ok((existing, Some((*item).clone()))) + } + TransactWriteOp::Delete { + key_info, + key, + condition, + maps, + return_values_on_ccf, + .. + } => { + validation::validate_key_only( + key, + &key_info.key_schema, + &key_info.attribute_definitions, + ) + .map_err(|e| TxnOpError::Cancel(CancellationReason::validation_error(e.to_string())))?; + let existing = fetch_item_for_update(tx, key_info, key) + .await + .map_err(TxnOpError::Storage)?; + let empty = Item::new(); + eval_condition( + *condition, + existing.as_ref().unwrap_or(&empty), + maps, + *return_values_on_ccf, + existing.as_ref(), + )?; + delete_item_in_tx(tx, key_info, key) + .await + .map_err(TxnOpError::Storage)?; + Ok((existing, None)) + } + TransactWriteOp::Update { + key_info, + key, + actions, + condition, + maps, + return_values_on_ccf, + .. 
+ } => { + validation::validate_key_only( + key, + &key_info.key_schema, + &key_info.attribute_definitions, + ) + .map_err(|e| TxnOpError::Cancel(CancellationReason::validation_error(e.to_string())))?; + let existing = fetch_item_for_update(tx, key_info, key) + .await + .map_err(TxnOpError::Storage)?; + let mut item = existing.clone().unwrap_or_else(|| (*key).clone()); + // Evaluate condition against empty item if non-existent (DynamoDB semantics) + let condition_item = if existing.is_some() { + &item + } else { + &std::collections::BTreeMap::new() + }; + eval_condition( + *condition, + condition_item, + maps, + *return_values_on_ccf, + existing.as_ref(), + )?; + expression::apply_update(actions, &mut item, maps).map_err(|e| { + TxnOpError::Cancel(CancellationReason::validation_error(e.to_string())) + })?; + // Validate post-update item size + validation::validate_item_size(&item, max_item_size_bytes).map_err(|e| { + TxnOpError::Cancel(CancellationReason::validation_error(e.to_string())) + })?; + validate_txn_index_key_types(&item, indexes, &key_info.attribute_definitions)?; + upsert_item_in_tx(tx, key_info, &item) + .await + .map_err(TxnOpError::Storage)?; + Ok((existing, Some(item))) + } + TransactWriteOp::ConditionCheck { + key_info, + key, + condition, + maps, + return_values_on_ccf, + } => { + validation::validate_key_only( + key, + &key_info.key_schema, + &key_info.attribute_definitions, + ) + .map_err(|e| TxnOpError::Cancel(CancellationReason::validation_error(e.to_string())))?; + let existing = fetch_item_for_update(tx, key_info, key) + .await + .map_err(TxnOpError::Storage)?; + let empty = Item::new(); + let check_against = existing.as_ref().unwrap_or(&empty); + eval_condition( + Some(condition), + check_against, + maps, + *return_values_on_ccf, + existing.as_ref(), + )?; + Ok((None, None)) + } + } +} + +fn validate_txn_index_key_types( + item: &Item, + indexes: &[WriteIndexKeys], + attr_defs: &[extenddb_core::types::AttributeDefinition], +) -> Result<(), 
TxnOpError> { + validate_item_index_key_types(item, indexes, attr_defs).map_err(|err| match err { + StorageError::Validation(message) => { + TxnOpError::Cancel(CancellationReason::validation_error(message)) + } + other => TxnOpError::Storage(other), + }) +} + +/// Evaluate a condition expression, returning a `CancellationReason` on failure. +/// +/// When `return_values_on_ccf` is `AllOld`, the existing item is included in the +/// cancellation reason so the client can see what caused the condition to fail. +fn eval_condition( + condition: Option<&extenddb_core::expression::Expr>, + item: &std::collections::BTreeMap, + maps: &ExpressionMaps, + return_values_on_ccf: ReturnValuesOnConditionCheckFailure, + existing: Option<&Item>, +) -> Result<(), CancellationReason> { + if let Some(cond) = condition { + let passed = expression::evaluate_condition(cond, item, maps) + .map_err(|e| CancellationReason::validation_error(e.to_string()))?; + if !passed { + let item_to_return = + if return_values_on_ccf == ReturnValuesOnConditionCheckFailure::AllOld { + existing.cloned() + } else { + None + }; + return Err(CancellationReason::condition_check_failed_with_item( + item_to_return, + )); + } + } + Ok(()) +} diff --git a/crates/storage-tidb/src/data/tx_helpers.rs b/crates/storage-tidb/src/data/tx_helpers.rs new file mode 100755 index 0000000..f92cfba --- /dev/null +++ b/crates/storage-tidb/src/data/tx_helpers.rs @@ -0,0 +1,402 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! Transaction helper functions: item fetch/upsert/delete within a transaction, +//! stream record writing, and idempotency token checking. 
+ +use base64::Engine; +use base64::engine::general_purpose::STANDARD as BASE64; +use extenddb_core::types::{ + AttributeValue, Item, StreamEventName, StreamRecord, StreamRecordData, StreamViewType, + TableKeyInfo, item_size_bytes, +}; +use extenddb_storage::StreamCapture; +use extenddb_storage::error::StorageError; +use extenddb_storage::util::{SortKeyValue, parse_sk, pk_to_text, sk_column, sk_info}; + +use super::{data_table_name, json_to_item}; + +/// Fetch a single item within an existing transaction. +pub(super) async fn fetch_item_in_tx( + tx: &mut sqlx::Transaction<'_, sqlx::MySql>, + key_info: &TableKeyInfo, + key: &Item, +) -> Result, StorageError> { + let ddb_table = data_table_name(&key_info.table_id); + let pk_name = &key_info.key_schema[0].attribute_name; + let pk_value = key + .get(pk_name) + .ok_or_else(|| StorageError::Internal("missing partition key".to_owned()))?; + let pk_text = pk_to_text(pk_value)?; + + let json_opt = if let Some((sk_name, sk_type)) = + sk_info(&key_info.key_schema, &key_info.attribute_definitions) + { + let sk_value = key + .get(sk_name) + .ok_or_else(|| StorageError::Internal("missing sort key".to_owned()))?; + let sk = parse_sk(sk_value, sk_type)?; + let sk_col = sk_column(sk_type); + let sql = format!("SELECT item_data FROM {ddb_table} WHERE pk = ? AND {sk_col} = ?"); + let row: Option<(serde_json::Value,)> = + bind_sk_fetch_optional!(&sql, pk_text.as_ref(), &sk, &mut **tx)?; + row.map(|(v,)| v) + } else { + let sql = format!("SELECT item_data FROM {ddb_table} WHERE pk = ?"); + let row: Option<(serde_json::Value,)> = sqlx::query_as(&sql) + .bind(pk_text.as_ref()) + .fetch_optional(&mut **tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + row.map(|(v,)| v) + }; + + json_opt.map(json_to_item).transpose() +} + +/// Fetch a single item with `FOR UPDATE` lock within a transaction. 
+pub(super) async fn fetch_item_for_update( + tx: &mut sqlx::Transaction<'_, sqlx::MySql>, + key_info: &TableKeyInfo, + key: &Item, +) -> Result, StorageError> { + let ddb_table = data_table_name(&key_info.table_id); + let pk_name = &key_info.key_schema[0].attribute_name; + let pk_value = key + .get(pk_name) + .ok_or_else(|| StorageError::Internal("missing partition key".to_owned()))?; + let pk_text = pk_to_text(pk_value)?; + + let json_opt = if let Some((sk_name, sk_type)) = + sk_info(&key_info.key_schema, &key_info.attribute_definitions) + { + let sk_value = key + .get(sk_name) + .ok_or_else(|| StorageError::Internal("missing sort key".to_owned()))?; + let sk = parse_sk(sk_value, sk_type)?; + let sk_col = sk_column(sk_type); + let sql = + format!("SELECT item_data FROM {ddb_table} WHERE pk = ? AND {sk_col} = ? FOR UPDATE"); + let row: Option<(serde_json::Value,)> = + bind_sk_fetch_optional!(&sql, pk_text.as_ref(), &sk, &mut **tx)?; + row.map(|(v,)| v) + } else { + let sql = format!("SELECT item_data FROM {ddb_table} WHERE pk = ? FOR UPDATE"); + let row: Option<(serde_json::Value,)> = sqlx::query_as(&sql) + .bind(pk_text.as_ref()) + .fetch_optional(&mut **tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + row.map(|(v,)| v) + }; + + json_opt.map(json_to_item).transpose() +} + +/// Upsert an item within a transaction. 
+pub(super) async fn upsert_item_in_tx( + tx: &mut sqlx::Transaction<'_, sqlx::MySql>, + key_info: &TableKeyInfo, + item: &Item, +) -> Result<(), StorageError> { + let ddb_table = data_table_name(&key_info.table_id); + let pk_name = &key_info.key_schema[0].attribute_name; + let pk_value = item + .get(pk_name) + .ok_or_else(|| StorageError::Internal("missing partition key".to_owned()))?; + let pk_text = pk_to_text(pk_value)?; + let item_json = + serde_json::to_value(item).map_err(|e| StorageError::Internal(e.to_string()))?; + + if let Some((sk_name, sk_type)) = sk_info(&key_info.key_schema, &key_info.attribute_definitions) + { + let sk_value = item + .get(sk_name) + .ok_or_else(|| StorageError::Internal("missing sort key".to_owned()))?; + let sk = parse_sk(sk_value, sk_type)?; + let sk_col = sk_column(sk_type); + let sql = format!( + "INSERT INTO {ddb_table} (pk, {sk_col}, item_data) VALUES (?, ?, ?) \ + ON DUPLICATE KEY UPDATE item_data = VALUES(item_data)" + ); + bind_sk_execute!(&sql, pk_text.as_ref(), &sk, &item_json, &mut **tx)?; + } else { + let sql = format!( + "INSERT INTO {ddb_table} (pk, item_data) VALUES (?, ?) \ + ON DUPLICATE KEY UPDATE item_data = VALUES(item_data)" + ); + sqlx::query(&sql) + .bind(pk_text.as_ref()) + .bind(&item_json) + .execute(&mut **tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } + Ok(()) +} + +/// Delete an item by key within a transaction. 
+pub(super) async fn delete_item_in_tx( + tx: &mut sqlx::Transaction<'_, sqlx::MySql>, + key_info: &TableKeyInfo, + key: &Item, +) -> Result<(), StorageError> { + let ddb_table = data_table_name(&key_info.table_id); + let pk_name = &key_info.key_schema[0].attribute_name; + let pk_value = key + .get(pk_name) + .ok_or_else(|| StorageError::Internal("missing partition key".to_owned()))?; + let pk_text = pk_to_text(pk_value)?; + + if let Some((sk_name, sk_type)) = sk_info(&key_info.key_schema, &key_info.attribute_definitions) + { + let sk_value = key + .get(sk_name) + .ok_or_else(|| StorageError::Internal("missing sort key".to_owned()))?; + let sk = parse_sk(sk_value, sk_type)?; + let sk_col = sk_column(sk_type); + let sql = format!("DELETE FROM {ddb_table} WHERE pk = ? AND {sk_col} = ?"); + match &sk { + SortKeyValue::S(s) => { + sqlx::query(&sql) + .bind(pk_text.as_ref()) + .bind(s.as_bytes().to_vec()) + .execute(&mut **tx) + .await + } + SortKeyValue::N(n) => { + sqlx::query(&sql) + .bind(pk_text.as_ref()) + .bind(n) + .execute(&mut **tx) + .await + } + SortKeyValue::B(b) => { + sqlx::query(&sql) + .bind(pk_text.as_ref()) + .bind(b) + .execute(&mut **tx) + .await + } + } + .map_err(|e| StorageError::Internal(e.to_string()))?; + } else { + let sql = format!("DELETE FROM {ddb_table} WHERE pk = ?"); + sqlx::query(&sql) + .bind(pk_text.as_ref()) + .execute(&mut **tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } + Ok(()) +} + +/// Write a stream record within an existing transaction. +/// +/// Builds the stream record from the old/new items and the `StreamCapture` +/// parameters, assigns a shard, generates a sequence number, and inserts +/// the record — all within the caller's transaction. +/// +/// The event type is determined from the old/new items: +/// - old=None, new=Some → Insert +/// - old=Some, new=Some → Modify +/// - old=Some, new=None → Remove +/// +/// For Delete operations where the item didn't exist, no stream record is written. 
+#[allow(clippy::too_many_arguments)] +pub(super) async fn write_stream_record_in_tx( + tx: &mut sqlx::Transaction<'_, sqlx::MySql>, + key_info: &TableKeyInfo, + capture: &StreamCapture, + old_item: Option<&Item>, + new_item: Option<&Item>, +) -> Result<(), StorageError> { + // No stream record if nothing changed (e.g., delete of non-existent item). + let source_item = new_item.or(old_item); + let Some(source) = source_item else { + return Ok(()); + }; + + // Determine the correct event type from old/new state. + let event = match (old_item, new_item) { + (None, Some(_)) => StreamEventName::Insert, + (Some(_), Some(_)) => StreamEventName::Modify, + (Some(_), None) => StreamEventName::Remove, + // Unreachable: early return above handles (None, None). + (None, None) => return Ok(()), + }; + + // Extract key attributes. + let keys: std::collections::BTreeMap = key_info + .key_schema + .iter() + .filter_map(|ks| { + source + .get(&ks.attribute_name) + .map(|v| (ks.attribute_name.clone(), v.clone())) + }) + .collect(); + + // Build images based on view type. + let new_image = match capture.view_type { + StreamViewType::NewImage | StreamViewType::NewAndOldImages => new_item.cloned(), + _ => None, + }; + let old_image = match capture.view_type { + StreamViewType::OldImage | StreamViewType::NewAndOldImages => old_item.cloned(), + _ => None, + }; + + let size = source_item.map_or(0, |i| i64::try_from(item_size_bytes(i)).unwrap_or(i64::MAX)); + + // Assign shard within the transaction. + let pk_name = &key_info.key_schema[0].attribute_name; + let pk_str = source + .get(pk_name) + .map(|v| match v { + AttributeValue::S(s) => s.clone(), + AttributeValue::N(n) => n.clone(), + AttributeValue::B(b) => BASE64.encode(b), + _ => String::new(), + }) + .unwrap_or_default(); + + let shards: Vec<(String,)> = sqlx::query_as( + "SELECT shard_id FROM stream_shards \ + WHERE table_id = ? 
\ + ORDER BY shard_id", + ) + .bind(&key_info.table_id) + .fetch_all(&mut **tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + if shards.is_empty() { + return Err(StorageError::Internal(format!( + "stream is enabled but no shards exist for table {}", + key_info.table_name + ))); + } + + let hash = crc32fast::hash(pk_str.as_bytes()); + #[allow(clippy::cast_possible_truncation)] + let idx = (hash as usize) % shards.len(); + let shard_id = &shards[idx].0; + + // Generate monotonic sequence number within the transaction (CB-21). + let seq = next_shard_sequence_in_tx(tx, shard_id).await?; + + let record = StreamRecord { + event_id: uuid::Uuid::new_v4().to_string(), + event_name: event, + event_version: "1.1".to_owned(), + event_source: "aws:dynamodb".to_owned(), + aws_region: capture.region.to_string(), + dynamodb: StreamRecordData { + approximate_creation_date_time: i64::try_from( + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(), + ) + .unwrap_or(i64::MAX), + keys, + new_image, + old_image, + sequence_number: seq, + size_bytes: size, + stream_view_type: capture.view_type, + }, + user_identity: capture.user_identity.clone(), + }; + + let record_json = + serde_json::to_value(&record).map_err(|e| StorageError::Internal(e.to_string()))?; + + sqlx::query( + "INSERT INTO stream_records (sequence_number, shard_id, table_id, event_name, record_data) \ + VALUES (?, ?, ?, ?, ?)", + ) + .bind(&record.dynamodb.sequence_number) + .bind(shard_id) + .bind(&key_info.table_id) + .bind(format!("{:?}", record.event_name)) + .bind(&record_json) + .execute(&mut **tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + Ok(()) +} + +pub(crate) async fn next_shard_sequence_in_tx( + tx: &mut sqlx::Transaction<'_, sqlx::MySql>, + shard_id: &str, +) -> Result { + let result = sqlx::query( + "UPDATE stream_shards \ + SET next_sequence_number = LAST_INSERT_ID(next_sequence_number + 1) \ + WHERE 
shard_id = ?", + ) + .bind(shard_id) + .execute(&mut **tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + if result.rows_affected() != 1 { + return Err(StorageError::Internal(format!( + "stream shard not found: {shard_id}" + ))); + } + + let seq_val: i64 = sqlx::query_scalar("SELECT LAST_INSERT_ID()") + .fetch_one(&mut **tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + Ok(format!("{seq_val:021}")) +} + +/// Check an idempotency token within an existing transaction. +/// +/// Returns `Ok(())` for new tokens (inserted), `Err(IdempotentReplay)` for +/// matching replays, `Err(IdempotentMismatch)` for fingerprint conflicts. +pub(super) async fn check_idempotency_token_in_tx( + tx: &mut sqlx::Transaction<'_, sqlx::MySql>, + token: &str, + fingerprint: &str, +) -> Result<(), StorageError> { + let row: Option<(String,)> = sqlx::query_as( + "SELECT fingerprint FROM idempotency_tokens \ + WHERE token = ? AND created_at > DATE_SUB(CURRENT_TIMESTAMP(6), INTERVAL 600 SECOND) \ + FOR UPDATE", + ) + .bind(token) + .fetch_optional(&mut **tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + match row { + Some((stored,)) if stored == fingerprint => return Err(StorageError::IdempotentReplay), + Some(_) => return Err(StorageError::IdempotentMismatch), + None => {} + } + + sqlx::query( + "DELETE FROM idempotency_tokens \ + WHERE token = ? 
AND created_at <= DATE_SUB(CURRENT_TIMESTAMP(6), INTERVAL 600 SECOND)", + ) + .bind(token) + .execute(&mut **tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + sqlx::query("INSERT INTO idempotency_tokens (token, fingerprint) VALUES (?, ?)") + .bind(token) + .bind(fingerprint) + .execute(&mut **tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + Ok(()) +} diff --git a/crates/storage-tidb/src/data/update_item.rs b/crates/storage-tidb/src/data/update_item.rs new file mode 100755 index 0000000..346a7fc --- /dev/null +++ b/crates/storage-tidb/src/data/update_item.rs @@ -0,0 +1,211 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! `update_item` implementation for the `TiDB` backend. + +use extenddb_core::expression::{self, Expr, ExpressionMaps, UpdateAction}; +use extenddb_core::types::{Item, KeyType, TableKeyInfo}; +use extenddb_core::validation; +use extenddb_storage::StreamCapture; +use extenddb_storage::error::StorageError; +use extenddb_storage::util::{parse_sk, pk_to_text, sk_column, sk_info}; + +use super::index::{fetch_write_index_key_schemas, validate_item_index_key_types}; +use super::query::check_condition; +use super::tx_helpers::write_stream_record_in_tx; +use super::{data_table_name, json_to_item}; +use crate::TidbEngine; + +impl TidbEngine { + /// Implementation of `DataEngine::update_item`. 
+ #[allow(clippy::too_many_arguments)] + pub(crate) async fn update_item_impl( + &self, + key_info: &TableKeyInfo, + key: &Item, + actions: &[UpdateAction], + return_old: bool, + return_new: bool, + condition: Option<&Expr>, + maps: &ExpressionMaps, + stream: Option<&StreamCapture>, + ) -> Result<(Option, Option), StorageError> { + let ddb_table = data_table_name(&key_info.table_id); + + let pk_name = &key_info.key_schema[0].attribute_name; + let pk_value = key + .get(pk_name) + .ok_or_else(|| StorageError::Internal("missing partition key".to_owned()))?; + let pk_text = pk_to_text(pk_value)?; + + // UpdateItem always needs a transaction (read-modify-write) + let mut tx = self + .data_pool + .begin() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let index_keys = fetch_write_index_key_schemas(&key_info.table_id, &self.pool).await?; + + // Fetch existing item + let old_json = if let Some((sk_name, sk_type)) = + sk_info(&key_info.key_schema, &key_info.attribute_definitions) + { + let sk_value = key + .get(sk_name) + .ok_or_else(|| StorageError::Internal("missing sort key".to_owned()))?; + let sk = parse_sk(sk_value, sk_type)?; + let sk_col = sk_column(sk_type); + let select_sql = format!( + "SELECT item_data FROM {ddb_table} WHERE pk = ? AND {sk_col} = ? FOR UPDATE" + ); + let row: Option<(serde_json::Value,)> = + bind_sk_fetch_optional!(&select_sql, pk_text.as_ref(), &sk, &mut *tx)?; + row.map(|(v,)| v) + } else { + let select_sql = format!("SELECT item_data FROM {ddb_table} WHERE pk = ? FOR UPDATE"); + let row: Option<(serde_json::Value,)> = sqlx::query_as(&select_sql) + .bind(pk_text.as_ref()) + .fetch_optional(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + row.map(|(v,)| v) + }; + + // Build the working item: existing or new with key attributes only (upsert) + let mut item = if let Some(json) = old_json.clone() { + json_to_item(json)? + } else { + key.clone() + }; + + // Save pre-mutation item for stream capture. 
+ let pre_mutation_item = if stream.is_some() && old_json.is_some() { + Some(item.clone()) + } else { + None + }; + + let old_item = if return_old { Some(item.clone()) } else { None }; + + // Evaluate condition against the existing item (empty if non-existent). + // DynamoDB treats a non-existent item as having no attributes at all. + let condition_item = if old_json.is_some() { + &item + } else { + &std::collections::BTreeMap::new() + }; + match check_condition(condition, condition_item, maps) { + Ok(()) => {} + Err(StorageError::ConditionFailed(_)) => { + if old_json.is_some() { + return Err(StorageError::ConditionFailed(Some(item))); + } + return Err(StorageError::ConditionFailed(None)); + } + Err(e) => return Err(e), + } + + // Apply update actions + expression::apply_update(actions, &mut item, maps) + .map_err(|e| StorageError::Validation(e.to_string()))?; + + validate_item_index_key_types(&item, &index_keys, &key_info.attribute_definitions)?; + + // Validate post-update item size (400 KB limit) + validation::validate_item_size(&item, self.max_item_size_bytes) + .map_err(|e| StorageError::Validation(e.to_string()))?; + + let new_item = if return_new { Some(item.clone()) } else { None }; + + // Write the updated item back + let item_json = + serde_json::to_value(&item).map_err(|e| StorageError::Internal(e.to_string()))?; + + if let Some((_, sk_type)) = sk_info(&key_info.key_schema, &key_info.attribute_definitions) { + let sk_name_ref = key_info + .key_schema + .iter() + .find(|ks| ks.key_type == KeyType::Range) + .map(|ks| ks.attribute_name.as_str()) + .ok_or_else(|| StorageError::Internal("missing sort key schema".to_owned()))?; + let sk_value = key + .get(sk_name_ref) + .ok_or_else(|| StorageError::Internal("missing sort key".to_owned()))?; + let sk = parse_sk(sk_value, sk_type)?; + let sk_col = sk_column(sk_type); + if old_json.is_some() { + // Row existed — update in place. + let update_sql = + format!("UPDATE {ddb_table} SET item_data = ? WHERE pk = ? 
AND {sk_col} = ?"); + bind_sk_update_execute!(&update_sql, &item_json, pk_text.as_ref(), &sk, &mut *tx)?; + } else { + // Row didn't exist (upsert) — atomic insert, fail if someone beat us. + let insert_sql = format!( + "INSERT INTO {ddb_table} (pk, {sk_col}, item_data) VALUES (?, ?, ?) \ + ON DUPLICATE KEY UPDATE pk = pk" + ); + let result = + bind_sk_execute!(&insert_sql, pk_text.as_ref(), &sk, &item_json, &mut *tx)?; + if result.rows_affected() == 0 { + // Another transaction inserted between our SELECT and INSERT. + // Fetch the winner to return with ConditionFailed. + let winner_sql = + format!("SELECT item_data FROM {ddb_table} WHERE pk = ? AND {sk_col} = ?"); + let winner: Option<(serde_json::Value,)> = + bind_sk_fetch_optional!(&winner_sql, pk_text.as_ref(), &sk, &mut *tx)?; + let winner_item = winner.map(|(v,)| json_to_item(v)).transpose()?; + return Err(StorageError::ConditionFailed(winner_item)); + } + } + } else if old_json.is_some() { + let update_sql = format!("UPDATE {ddb_table} SET item_data = ? WHERE pk = ?"); + sqlx::query(&update_sql) + .bind(&item_json) + .bind(pk_text.as_ref()) + .execute(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } else { + let insert_sql = format!( + "INSERT INTO {ddb_table} (pk, item_data) VALUES (?, ?) \ + ON DUPLICATE KEY UPDATE pk = pk" + ); + let result = sqlx::query(&insert_sql) + .bind(pk_text.as_ref()) + .bind(&item_json) + .execute(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + if result.rows_affected() == 0 { + // Another transaction inserted between our SELECT and INSERT. + // Fetch the winner to return with ConditionFailed. 
+ let winner_sql = format!("SELECT item_data FROM {ddb_table} WHERE pk = ?"); + let winner: Option<(serde_json::Value,)> = sqlx::query_as(&winner_sql) + .bind(pk_text.as_ref()) + .fetch_optional(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + let winner_item = winner.map(|(v,)| json_to_item(v)).transpose()?; + return Err(StorageError::ConditionFailed(winner_item)); + } + } + + // Write stream record atomically within the transaction. + if let Some(capture) = stream { + write_stream_record_in_tx( + &mut tx, + key_info, + capture, + pre_mutation_item.as_ref(), + Some(&item), + ) + .await?; + } + tx.commit() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + Ok((old_item, new_item)) + } +} diff --git a/crates/storage-tidb/src/delete_table.rs b/crates/storage-tidb/src/delete_table.rs new file mode 100755 index 0000000..1227d9d --- /dev/null +++ b/crates/storage-tidb/src/delete_table.rs @@ -0,0 +1,101 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! `delete_table` implementation for `TidbEngine`. + +use extenddb_core::types::{DeleteTableInput, TableDescription, TableStatus}; +use extenddb_storage::error::StorageError; + +use crate::TidbEngine; +use crate::table_helpers::{IndexRow, TableRow}; + +impl TidbEngine { + /// Core implementation of `delete_table`. + pub(crate) async fn delete_table_impl( + &self, + account_id: &str, + input: DeleteTableInput, + ) -> Result { + Self::validate_account_id(account_id)?; + let mut tx = self + .pool + .begin() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + // Lock and fetch the row atomically with SELECT ... 
FOR UPDATE + let row: Option = sqlx::query_as( + r"SELECT table_name, key_schema, attribute_definitions, billing_mode, + provisioned_throughput, stream_specification, table_status, + CAST(UNIX_TIMESTAMP(creation_date_time) AS DOUBLE) as creation_epoch, + table_size_bytes, item_count, table_arn, table_id, + deletion_protection_enabled, stream_label + FROM tables WHERE account_id = ? AND table_name = ? AND table_status IN ('ACTIVE', 'CREATING') + FOR UPDATE", + ) + .bind(account_id) + .bind(&input.table_name) + .fetch_optional(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let row = row.ok_or_else(|| StorageError::TableNotFound(input.table_name.clone()))?; + + // REQ: DeletionProtectionEnabled check — real DynamoDB returns ValidationException + if row.deletion_protection_enabled { + return Err(StorageError::DeletionProtected(row.table_arn.clone())); + } + + // Fetch indexes for the response description. + let index_rows: Vec = sqlx::query_as( + r"SELECT index_name, index_type, key_schema, projection, + index_status, provisioned_throughput + FROM indexes WHERE table_id = ?", + ) + .bind(&row.table_id) + .fetch_all(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + // Schedule physical cleanup through the control-plane reconciler so TiDB + // data artifacts are dropped before catalog metadata. + let delay_row: (f64,) = sqlx::query_as( + "SELECT COALESCE((SELECT CAST(value AS DOUBLE) FROM settings WHERE `key` = 'control_plane_delay_seconds'), 0.25)", + ) + .fetch_one(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + let delay_secs = delay_row.0; + + // Set DELETING status with a scheduled removal time. The control-plane + // reconciler drops TiDB data artifacts first, then removes catalog + // metadata, so a failed cleanup remains retryable. + sqlx::query( + r"UPDATE tables SET table_status = 'DELETING', + status_transition_at = DATE_ADD(CURRENT_TIMESTAMP(6), INTERVAL ? 
SECOND) + WHERE account_id = ? AND table_name = ?", + ) + .bind(delay_secs.max(0.0)) + .bind(account_id) + .bind(&input.table_name) + .execute(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + tx.commit() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + // Wake the control plane poller so it processes the DELETING → removed + // transition without waiting for the idle timeout. + self.control_plane_notify.notify_one(); + + // Build description from the fetched row data + let desc = self.build_table_description_from_row(account_id, row, index_rows)?; + + Ok(TableDescription { + table_status: TableStatus::Deleting, + ..desc + }) + } +} diff --git a/crates/storage-tidb/src/lib.rs b/crates/storage-tidb/src/lib.rs new file mode 100755 index 0000000..cb568f1 --- /dev/null +++ b/crates/storage-tidb/src/lib.rs @@ -0,0 +1,492 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! `TiDB` storage backend for extenddb. +//! +//! Implements the `TableEngine` and `DataEngine` traits from `extenddb-storage` +//! using `TiDB` via `sqlx`. All SQL uses parameterized queries exclusively +//! — no dynamic SQL, except for per-DynamoDB-table DDL where table names are +//! validated at the engine layer. + +mod admin_store; +mod authorization_store; +mod backup_engine; +mod bootstrapper; +mod catalog_store; +pub mod config; +mod create_table; +mod credential_store; +mod data; +mod delete_table; +mod management_store; +mod metadata_engine; +mod migrations; +mod operations; +mod stream_engine; +mod table_engine; +mod table_helpers; +mod throughput; +mod tidb_util; +mod ttl_worker; +mod update_table; +mod worker_store; +mod workers; + +pub use bootstrapper::TidbBootstrapper; +pub use catalog_store::TidbCatalogStore; +pub use config::TidbStorageConfig; +pub use config::parse_connection_string; +pub use credential_store::DbCredentialStore; + +// Auto-register the Tidb backend at compile time +inventory::submit! 
{ + extenddb_storage::bootstrapper::BackendRegistration { + name: "tidb", + factory: |config_path, options| { + Box::pin(async move { + let store = TidbBootstrapper::from_config(&config_path, options).await?; + Ok(Box::new(store) as Box) + }) + } + } +} + +// Auto-register TiDB operations engine +inventory::submit! { + extenddb_storage::operations::OperationsEngineRegistration { + name: "tidb", + operations: &operations::TidbOperationsEngine, + } +} + +// Auto-register TiDB config deserializer +inventory::submit! { + extenddb_storage::config::StorageConfigRegistration { + backend: "tidb", + deserializer: |table| { + let config: TidbStorageConfig = table.clone().try_into() + .map_err(|e: toml::de::Error| format!("Failed to parse tidb config: {}", e))?; + Ok(Box::new(config) as Box) + }, + default_config: || { + Box::new(TidbStorageConfig::default()) as Box + }, + default_priority: Some(50), + } +} + +// Auto-register TiDB settings store factory +inventory::submit! { + extenddb_storage::settings_store::SettingsStoreRegistration { + backend: "tidb", + factory: |connection_string| { + let connection_string = config::sqlx_connection_string(connection_string); + Box::pin(async move { + let pool = sqlx::MySqlPool::connect(&connection_string) + .await + .map_err(|e| extenddb_storage::settings_store::SettingsStoreError::ConnectionFailed(e.to_string()))?; + Ok(Box::new(TidbCatalogStore::new(pool)) as Box) + }) + }, + } +} + +// Auto-register TiDB diagnostics store factory +inventory::submit! 
{ + extenddb_storage::diagnostics_store::DiagnosticsStoreRegistration { + backend: "tidb", + factory: |connection_string| { + let connection_string = config::sqlx_connection_string(connection_string); + Box::pin(async move { + let pool = sqlx::MySqlPool::connect(&connection_string) + .await + .map_err(|e| extenddb_storage::diagnostics_store::DiagnosticsStoreError::ConnectionFailed(e.to_string()))?; + Ok(Box::new(TidbCatalogStore::new(pool)) as Box) + }) + }, + } +} + +use std::sync::Arc; + +use extenddb_core::version::CatalogVersion; +use extenddb_storage::error::StorageError; +use sqlx::MySqlPool; +use sqlx::mysql::MySqlPoolOptions; + +/// Expected catalog version — compiled into the binary (REQ-CAT-006, D-9). +/// +/// The tuple is the single source of truth. Use `CATALOG_VERSION.to_string()` +/// wherever a string representation is needed. +pub const CATALOG_VERSION: CatalogVersion = CatalogVersion::new(0, 0, 10); + +/// Minimum number of connections allowed per pool. +/// +/// Each DynamoDB request triggers an auth/authz query fanout against the +/// catalog pool. Pools smaller than this floor starve under concurrent load. +/// Configured values below the floor are clamped at startup with a warning. +const MIN_POOL_SIZE: u32 = 10; + +/// `TiDB` storage backend configuration. +pub struct TidbConfig { + pub connection_string: String, + pub pool_size: u32, + /// Maximum item size in bytes for post-update validation. + pub max_item_size_bytes: usize, + pub native_backup: extenddb_storage::config::NativeBackupConfig, +} + +/// `TiDB` storage backend. +/// +/// The engine no longer stores a single `account_id`. Instead, `account_id` +/// is passed per-request through the storage trait methods, enabling +/// multi-account isolation (Phase 12f). +/// +/// Uses two connection pools: `pool` for catalog metadata (tables, indexes, +/// settings, accounts, IAM) and `data_pool` for per-DynamoDB-table data +/// (`_ddb_*` tables and native generated-column secondary indexes). 
This separation allows the catalog and +/// data to live in different TiDB databases (Bug 1, P54). +pub struct TidbEngine { + pub(crate) pool: MySqlPool, + /// Connection pool for the data database where `_ddb_*` tables live. + pub(crate) data_pool: MySqlPool, + pub(crate) region: String, + pub(crate) max_item_size_bytes: usize, + pub(crate) native_backup: backup_engine::TidbNativeBackupConfig, + /// Wakes the control plane poller when a table enters CREATING, UPDATING, + /// or DELETING state, so transitions are processed without polling delay. + pub(crate) control_plane_notify: Arc, +} + +impl TidbEngine { + pub async fn new(config: &TidbConfig, region: &str) -> Result { + // Enforce a minimum of 10 connections per pool. Smaller values starve + // the auth/authz query fanout under concurrent load. If the configured + // value is below the floor, log a warning and clamp. + let pool_size = if config.pool_size < MIN_POOL_SIZE { + tracing::warn!( + "storage.tidb.pool_size = {} is below the minimum of {}; clamping to {}", + config.pool_size, + MIN_POOL_SIZE, + MIN_POOL_SIZE + ); + MIN_POOL_SIZE + } else { + config.pool_size + }; + + // P79/P6: Set min_connections to avoid cold-start latency on first requests. + let min_conns = pool_size.min(2); + let pool = MySqlPoolOptions::new() + .max_connections(pool_size) + .min_connections(min_conns) + .test_before_acquire(false) + .max_lifetime(std::time::Duration::from_secs(1800)) + .connect(&crate::config::sqlx_connection_string( + &config.connection_string, + )) + .await + .map_err(|e| StorageError::Connection(e.to_string()))?; + + // P54 Bug 1: Read data database connection string from catalog settings. + // Falls back to the catalog pool if no separate data database is configured. 
+ let data_pool = match sqlx::query_as::<_, (String,)>( + "SELECT value FROM settings WHERE `key` = 'data_database_connection_string'", + ) + .fetch_optional(&pool) + .await + { + Ok(Some((data_conn,))) if !data_conn.is_empty() => MySqlPoolOptions::new() + .max_connections(pool_size) + .min_connections(min_conns) + .test_before_acquire(false) + .max_lifetime(std::time::Duration::from_secs(1800)) + .connect(&crate::config::sqlx_connection_string(&data_conn)) + .await + .map_err(|e| { + StorageError::Connection(format!("data database connection failed: {e}")) + })?, + _ => pool.clone(), + }; + + Ok(Self { + pool, + data_pool, + region: region.to_owned(), + max_item_size_bytes: config.max_item_size_bytes, + native_backup: backup_engine::TidbNativeBackupConfig::from_storage_config( + config.native_backup.clone(), + ), + control_plane_notify: Arc::new(tokio::sync::Notify::new()), + }) + } + + /// Returns a handle to the control plane notify for the background poller. + pub fn control_plane_notify(&self) -> Arc { + Arc::clone(&self.control_plane_notify) + } + + /// Defense-in-depth: validate `account_id` before use in SQL identifiers. + /// + /// `account_id` is interpolated into SQL identifiers via `data_table_name()`. + /// Called by all methods that use `data_table_name()` or `format!`-based DDL. + /// Reject values that could break quoted identifiers. + /// See `docs/adr/sql-injection-defense.md`. + pub(crate) fn validate_account_id(account_id: &str) -> Result<(), StorageError> { + if account_id.contains('`') || account_id.contains('\0') || !account_id.is_ascii() { + return Err(StorageError::Internal( + "account_id contains invalid characters for use in SQL identifiers".to_owned(), + )); + } + Ok(()) + } + + /// Validate catalog version matches the compiled-in expectation (REQ-CAT-007, D-10). + /// + /// Reads the version string from the `settings` table and parses it + /// strictly into a `CatalogVersion`. Rejects malformed strings. 
+ /// + /// # Errors + /// + /// Returns `StorageError::CatalogNotInitialized` if the catalog tables don't exist. + /// Returns `StorageError::CatalogVersionMismatch` if the version doesn't match. + /// Returns `StorageError::Internal` if the stored version string is malformed. + pub async fn check_catalog_version(&self) -> Result<(), StorageError> { + // Check table existence via information_schema (robust, not string-matching). + let exists: (bool,) = sqlx::query_as( + "SELECT EXISTS(SELECT 1 FROM information_schema.tables WHERE table_name = 'settings' AND table_schema = DATABASE())", + ) + .fetch_one(&self.pool) + .await + .map_err(|e| StorageError::Connection(e.to_string()))?; + + if !exists.0 { + return Err(StorageError::CatalogNotInitialized); + } + + let row: Option<(String,)> = + sqlx::query_as("SELECT value FROM settings WHERE `key` = 'catalog_version'") + .fetch_optional(&self.pool) + .await + .map_err(|e| StorageError::Connection(e.to_string()))?; + + let found_str = row.ok_or(StorageError::CatalogNotInitialized)?.0; + + let found = found_str + .parse::() + .map_err(|e| StorageError::Internal(e.to_string()))?; + + if found != CATALOG_VERSION { + return Err(StorageError::CatalogVersionMismatch { + expected: CATALOG_VERSION.to_string(), + found: found_str, + }); + } + + Ok(()) + } + + /// Query the data database name from the catalog for the startup banner (REQ-LOG-001). + /// + /// Returns `"(not configured)"` if no data database has been registered. + /// + /// # Errors + /// + /// Returns `StorageError::Connection` if the query fails. 
+ pub async fn get_data_database_info(&self) -> Result { + let row: Option<(String,)> = + sqlx::query_as("SELECT value FROM settings WHERE `key` = 'data_database_name'") + .fetch_optional(&self.pool) + .await + .map_err(|e| StorageError::Connection(e.to_string()))?; + + Ok(row.map_or_else(|| "(not configured)".to_owned(), |(name,)| name)) + } + + /// Returns a reference to the data pool for use by background workers + /// that operate on `_ddb_*` tables (e.g., TTL cleanup, table size refresh). + pub fn data_pool(&self) -> &MySqlPool { + &self.data_pool + } +} + +// ============================================================================ +// ServerComponents Factory Registration +// ============================================================================ + +use extenddb_auth::BuiltinAuthProvider; +use extenddb_storage::hooks::{ServerRuntimeHooks, WorkerContext}; +use extenddb_storage::server_components::{ + BackendError, ServerComponents, ServerComponentsRegistration, +}; + +/// Backend-specific runtime hooks for TiDB. +struct TidbRuntimeHooks { + engine: Arc, + control_plane_notify: Arc, + data_db_name: String, +} + +#[async_trait::async_trait] +impl ServerRuntimeHooks for TidbRuntimeHooks { + async fn spawn_workers(&self, ctx: &WorkerContext) { + // Backend-specific workers that need TiDB internals + + // 1. Control plane transitions poller + let storage_for_poller = self.engine.clone(); + let cp_notify = self.control_plane_notify.clone(); + let catalog_store = ctx.catalog_store.clone(); + tokio::spawn(async move { + workers::poll_control_plane_transitions(storage_for_poller, cp_notify, catalog_store) + .await + }); + + // 2. Table size refresh worker + let storage_for_size = self.engine.clone(); + tokio::spawn(async move { workers::table_size_refresh_worker(storage_for_size).await }); + + // 3. TTL cleanup worker for stream-enabled user tables. TiDB native TTL + // handles internal retention tables and user tables without Streams. 
+ let storage_for_ttl = self.engine.clone(); + let metrics = ctx.metrics.clone(); + tokio::spawn(async move { ttl_worker::ttl_cleanup_worker(storage_for_ttl, metrics).await }); + + // 4. Pool metrics worker - needs both catalog and data pools + let catalog_pool = self.engine.pool.clone(); + let data_pool = self.engine.data_pool().clone(); + let metrics = ctx.metrics.clone(); + tokio::spawn(async move { + workers::pool_metrics_worker(catalog_pool, data_pool, metrics).await + }); + } + + fn backend_info(&self) -> Option { + Some(format!("data_db={}", self.data_db_name)) + } +} + +// Register the TiDB backend factory +inventory::submit! { + ServerComponentsRegistration { + backend: "tidb", + factory: |config, region| { + let connection_string = config.connection_config().to_string(); + let max_connections = config.max_connections(); + let max_catalog_connections = config.max_catalog_connections(); + let max_item_size_bytes = config + .runtime_limits() + .map_or_else( + || extenddb_core::limits::LimitsConfig::default().max_item_size_bytes, + |limits| limits.max_item_size_bytes, + ); + let native_backup = config.native_backup_config().unwrap_or_default(); + let region = region.to_string(); + Box::pin(async move { + // Build TidbConfig from extracted values + let tidb_config = TidbConfig { + connection_string: connection_string.clone(), + pool_size: max_connections, + max_item_size_bytes, + native_backup, + }; + + // Create TidbEngine + let engine = TidbEngine::new(&tidb_config, ®ion) + .await + .map_err(|e| BackendError::ConnectionFailed { + backend: "tidb".to_string(), + details: e.to_string(), + })?; + + // Check catalog version + engine.check_catalog_version().await.map_err(|e| match e { + StorageError::CatalogVersionMismatch { expected, found } => { + BackendError::CatalogVersionMismatch { expected, found } + } + _ => BackendError::InitializationFailed(e.to_string()), + })?; + + // Recover control plane transitions (ignore errors) + match 
engine.process_control_plane_transitions().await { + Ok(ref t) if t.is_empty() => {} + Ok(transitions) => { + for (name, transition) in &transitions { + tracing::info!("Recovered table '{name}': {transition}"); + } + } + Err(e) => tracing::error!("Failed to recover control plane transitions: {e}"), + } + + // Get data database name for logging (before wrapping in Arc) + let data_db_name = engine + .get_data_database_info() + .await + .unwrap_or_else(|_| "(query failed)".to_owned()); + + // Get references to fields we need before wrapping + let control_plane_notify = engine.control_plane_notify.clone(); + + // Wrap engine in Arc + let engine = Arc::new(engine); + + // Create catalog store. Honors storage.tidb.catalog_pool_size, + // defaulting to pool_size when unset. Clamped to the same minimum + // as the engine pool. + let catalog_pool_size = if max_catalog_connections < MIN_POOL_SIZE { + tracing::warn!( + "storage.tidb.catalog_pool_size = {} is below the minimum of {}; clamping to {}", + max_catalog_connections, + MIN_POOL_SIZE, + MIN_POOL_SIZE + ); + MIN_POOL_SIZE + } else { + max_catalog_connections + }; + let catalog_pool = MySqlPoolOptions::new() + .max_connections(catalog_pool_size) + .min_connections(catalog_pool_size.min(2)) + .test_before_acquire(false) + .max_lifetime(std::time::Duration::from_secs(1800)) + .connect(&crate::config::sqlx_connection_string(&connection_string)) + .await + .map_err(|e| BackendError::ConnectionFailed { + backend: "tidb".to_string(), + details: format!("Failed to create catalog pool: {e}"), + })?; + + // Load encryption key + let enc_key: Option = + sqlx::query_scalar("SELECT value FROM settings WHERE `key` = 'encryption_key'") + .fetch_optional(&catalog_pool) + .await + .map_err(|e| BackendError::InitializationFailed(format!("Failed to fetch encryption key: {e}")))?; + + let catalog_store = Arc::new(match enc_key { + Some(k) => TidbCatalogStore::with_encryption_key(catalog_pool.clone(), k), + None => return 
Err(BackendError::MissingEncryptionKey), + }) as Arc; + + // Create auth provider + let enc_key = extenddb_storage::CatalogStore::cached_encryption_key(&*catalog_store) + .ok_or(BackendError::MissingEncryptionKey)?; + let cred_store = DbCredentialStore::new(catalog_pool.clone(), enc_key); + let auth_provider = Arc::new(BuiltinAuthProvider::new(cred_store)); + + // Create runtime hooks + let runtime_hooks = Box::new(TidbRuntimeHooks { + engine: engine.clone(), + control_plane_notify, + data_db_name, + }); + + Ok(ServerComponents { + engine, + catalog_store, + auth_provider, + runtime_hooks: Some(runtime_hooks), + }) + }) + }, + } +} diff --git a/crates/storage-tidb/src/management_store/access_keys.rs b/crates/storage-tidb/src/management_store/access_keys.rs new file mode 100755 index 0000000..0b3847b --- /dev/null +++ b/crates/storage-tidb/src/management_store/access_keys.rs @@ -0,0 +1,308 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! Access key, session, and caller-tag operations for `TidbCatalogStore`. + +use extenddb_storage::management_store::{AccessKeyCreated, OpError, OpResult}; + +use crate::catalog_store::TidbCatalogStore; +use crate::tidb_util::{is_fk_violation, is_unique_violation}; + +impl TidbCatalogStore { + // ── Access keys ──────────────────────────────────────────────── + + pub(crate) async fn create_access_key_impl( + &self, + account_id: &str, + user_name: &str, + ) -> OpResult { + // P119: Use cached encryption key if available, fall back to DB query. 
+ let enc_key: String = if let Some(cached) = self.encryption_key() { + cached.to_string() + } else { + let row: Option = + sqlx::query_scalar("SELECT value FROM settings WHERE `key` = 'encryption_key'") + .fetch_optional(self.pool()) + .await + .map_err(|e| { + tracing::error!("create_access_key fetch encryption key: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + row.ok_or_else(|| OpError::Internal("Encryption key not configured".to_owned()))? + }; + + let access_key_id = generate_access_key_id(); + let secret_key = generate_secret_key(); + let encrypted = encrypt_secret(&secret_key, &enc_key, &access_key_id).map_err(|e| { + tracing::error!("create_access_key encryption: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + sqlx::query( + "INSERT INTO access_keys (access_key_id, account_id, user_name, secret_key_encrypted) \ + VALUES (?, ?, ?, ?)", + ) + .bind(&access_key_id) + .bind(account_id) + .bind(user_name) + .bind(&encrypted) + .execute(self.pool()) + .await + .map_err(|e| { + if is_fk_violation(&e) { + OpError::NotFound("User not found".to_owned()) + } else { + tracing::error!("create_access_key failed: {e}"); + OpError::Internal("Database error".to_owned()) + } + })?; + + Ok(AccessKeyCreated { + access_key_id, + secret_access_key: secret_key, + }) + } + + pub(crate) async fn delete_access_key_impl( + &self, + account_id: &str, + user_name: &str, + key_id: &str, + ) -> OpResult<()> { + let result = sqlx::query( + "DELETE FROM access_keys WHERE access_key_id = ? AND account_id = ? 
AND user_name = ?", + ) + .bind(key_id) + .bind(account_id) + .bind(user_name) + .execute(self.pool()) + .await; + match result { + Ok(r) if r.rows_affected() == 0 => { + Err(OpError::NotFound("Access key not found".to_owned())) + } + Ok(_) => Ok(()), + Err(e) => { + tracing::error!("delete_access_key failed: {e}"); + Err(OpError::Internal("Database error".to_owned())) + } + } + } + + pub(crate) async fn list_access_keys_impl( + &self, + account_id: &str, + user_name: &str, + ) -> OpResult> { + sqlx::query_as( + "SELECT access_key_id, is_active, created_at FROM access_keys \ + WHERE account_id = ? AND user_name = ? ORDER BY created_at", + ) + .bind(account_id) + .bind(user_name) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("list_access_keys: {e}"); + OpError::Internal("Database error".to_owned()) + }) + } + + pub(crate) async fn import_access_key_impl( + &self, + account_id: &str, + user_name: &str, + access_key_id: &str, + secret_access_key: &str, + ) -> OpResult<()> { + // P119: Use cached encryption key if available, fall back to DB query. + let enc_key: String = if let Some(cached) = self.encryption_key() { + cached.to_string() + } else { + let row: Option = + sqlx::query_scalar("SELECT value FROM settings WHERE `key` = 'encryption_key'") + .fetch_optional(self.pool()) + .await + .map_err(|e| { + tracing::error!("import_access_key fetch encryption key: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + row.ok_or_else(|| OpError::Internal("Encryption key not configured".to_owned()))? 
+ }; + + let encrypted = + encrypt_secret(secret_access_key, &enc_key, access_key_id).map_err(|e| { + tracing::error!("import_access_key encryption: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + let result = sqlx::query( + "INSERT INTO access_keys (access_key_id, secret_key_encrypted, account_id, user_name) \ + VALUES (?, ?, ?, ?)", + ) + .bind(access_key_id) + .bind(&encrypted) + .bind(account_id) + .bind(user_name) + .execute(self.pool()) + .await; + match result { + Ok(_) => Ok(()), + Err(e) if is_fk_violation(&e) => { + Err(OpError::NotFound("IAM user not found".to_owned())) + } + Err(e) if is_unique_violation(&e) => Err(OpError::AlreadyExists( + "Access key ID already exists".to_owned(), + )), + Err(e) => { + tracing::error!("import_access_key failed: {e}"); + Err(OpError::Internal("Database error".to_owned())) + } + } + } + + // ── Sessions ─────────────────────────────────────────────────── + + #[allow(clippy::too_many_arguments)] + pub(crate) async fn store_session_impl( + &self, + session_token: &str, + access_key_id: &str, + secret_key_encrypted: &[u8], + account_id: &str, + role_name: &str, + session_name: &str, + session_tags: &Option, + session_policy: &Option, + expires_at: time::OffsetDateTime, + ) -> OpResult<()> { + sqlx::query( + "INSERT INTO iam_sessions \ + (session_token, access_key_id, secret_key_encrypted, account_id, role_name, \ + session_name, session_tags, session_policy, expires_at) \ + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", + ) + .bind(session_token) + .bind(access_key_id) + .bind(secret_key_encrypted) + .bind(account_id) + .bind(role_name) + .bind(session_name) + .bind(session_tags) + .bind(session_policy) + .bind(expires_at) + .execute(self.pool()) + .await + .map_err(|e| { + tracing::error!("store_session failed: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + Ok(()) + } + + // ── Caller tags ──────────────────────────────────────────────── + + pub(crate) async fn fetch_caller_tags_impl( + &self, + 
account_id: &str, + resource: &str, + ) -> OpResult> { + if let Some(user_name) = resource.strip_prefix("user/") { + sqlx::query_as( + "SELECT tag_key, tag_value FROM iam_user_tags \ + WHERE account_id = ? AND user_name = ?", + ) + .bind(account_id) + .bind(user_name) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("fetch_caller_tags user: {e}"); + OpError::Internal("Database error".to_owned()) + }) + } else if let Some(role_name) = resource.strip_prefix("role/") { + sqlx::query_as( + "SELECT tag_key, tag_value FROM iam_role_tags \ + WHERE account_id = ? AND role_name = ?", + ) + .bind(account_id) + .bind(role_name) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("fetch_caller_tags role: {e}"); + OpError::Internal("Database error".to_owned()) + }) + } else if let Some(rest) = resource.strip_prefix("assumed-role/") { + let role_name = rest.split('/').next().unwrap_or(""); + if role_name.is_empty() { + return Ok(Vec::new()); + } + sqlx::query_as( + "SELECT tag_key, tag_value FROM iam_role_tags \ + WHERE account_id = ? 
AND role_name = ?", + ) + .bind(account_id) + .bind(role_name) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("fetch_caller_tags assumed-role: {e}"); + OpError::Internal("Database error".to_owned()) + }) + } else { + Ok(Vec::new()) + } + } +} + +// ── Crypto helpers (duplicated from server::crypto to avoid circular dep) ── + +fn generate_access_key_id() -> String { + const CHARSET: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + let mut rng = rand::rng(); + let suffix: String = (0..8) + .map(|_| CHARSET[rand::Rng::random_range(&mut rng, 0..CHARSET.len())] as char) + .collect(); + format!("AKIAEXTENDDB{suffix}") +} + +fn generate_secret_key() -> String { + const CHARSET: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + let mut rng = rand::rng(); + let suffix: String = (0..32) + .map(|_| CHARSET[rand::Rng::random_range(&mut rng, 0..CHARSET.len())] as char) + .collect(); + format!("extenddb{suffix}") +} + +fn encrypt_secret(plaintext: &str, key_b64: &str, aad: &str) -> Result, String> { + use aes_gcm::Aes256Gcm; + use aes_gcm::KeyInit; + use aes_gcm::aead::Aead; + use aes_gcm::aead::Payload; + use base64::Engine; + + let key_bytes = base64::engine::general_purpose::STANDARD + .decode(key_b64) + .map_err(|e| format!("decode encryption key: {e}"))?; + + let key = aes_gcm::Key::::from_slice(&key_bytes); + let cipher = Aes256Gcm::new(key); + + let nonce_bytes: [u8; 12] = rand::random(); + let nonce = aes_gcm::Nonce::from_slice(&nonce_bytes); + + let payload = Payload { + msg: plaintext.as_bytes(), + aad: aad.as_bytes(), + }; + let ciphertext = cipher + .encrypt(nonce, payload) + .map_err(|e| format!("encrypt: {e}"))?; + + let mut result = Vec::with_capacity(12 + ciphertext.len()); + result.extend_from_slice(&nonce_bytes); + result.extend_from_slice(&ciphertext); + Ok(result) +} diff --git a/crates/storage-tidb/src/management_store/accounts.rs b/crates/storage-tidb/src/management_store/accounts.rs new file mode 100755 
index 0000000..1078f84 --- /dev/null +++ b/crates/storage-tidb/src/management_store/accounts.rs @@ -0,0 +1,208 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! Account management operations for `TidbCatalogStore`. + +use extenddb_storage::management_store::{AccountDetail, OpError, OpResult}; + +use crate::catalog_store::TidbCatalogStore; +use crate::tidb_util::is_unique_violation; + +impl TidbCatalogStore { + pub(crate) async fn create_account_impl( + &self, + account_id: &str, + account_name: &str, + ) -> OpResult<()> { + let result = sqlx::query("INSERT INTO accounts (account_id, account_name) VALUES (?, ?)") + .bind(account_id) + .bind(account_name) + .execute(self.pool()) + .await; + match result { + Ok(_) => Ok(()), + Err(e) if is_unique_violation(&e) => { + Err(OpError::AlreadyExists("Account already exists".to_owned())) + } + Err(e) => { + tracing::error!("create_account failed: {e}"); + Err(OpError::Internal("Database error".to_owned())) + } + } + } + + pub(crate) async fn delete_account_impl(&self, account_id: &str) -> OpResult<()> { + let mut tx = self.pool().begin().await.map_err(|e| { + tracing::error!("delete_account begin transaction: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + let locked = sqlx::query_as::<_, (String,)>( + "SELECT account_id FROM accounts WHERE account_id = ? 
FOR UPDATE", + ) + .bind(account_id) + .fetch_optional(&mut *tx) + .await + .map_err(|e| { + tracing::error!("delete_account lock account: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + if locked.is_none() { + return Err(OpError::NotFound("Account not found".to_owned())); + } + + let has_tables: bool = + sqlx::query_scalar("SELECT EXISTS(SELECT 1 FROM tables WHERE account_id = ?)") + .bind(account_id) + .fetch_one(&mut *tx) + .await + .map_err(|e| { + tracing::error!("delete_account check tables: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + if has_tables { + return Err(OpError::HasDependents( + "Cannot delete account with existing tables. Delete all tables first.".to_owned(), + )); + } + + let r = sqlx::query("DELETE FROM accounts WHERE account_id = ?") + .bind(account_id) + .execute(&mut *tx) + .await + .map_err(|e| { + tracing::error!("delete_account delete: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + if r.rows_affected() == 0 { + return Err(OpError::NotFound("Account not found".to_owned())); + } + + tx.commit().await.map_err(|e| { + tracing::error!("delete_account commit: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + Ok(()) + } + + pub(crate) async fn list_all_accounts_impl(&self) -> OpResult> { + sqlx::query_as("SELECT account_id, account_name FROM accounts ORDER BY account_id") + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("list_all_accounts: {e}"); + OpError::Internal("Database error".to_owned()) + }) + } + + pub(crate) async fn list_all_accounts_full_impl( + &self, + ) -> OpResult> { + sqlx::query_as( + "SELECT account_id, account_name, created_at FROM accounts ORDER BY account_id", + ) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("list_all_accounts_full: {e}"); + OpError::Internal("Database error".to_owned()) + }) + } + + pub(crate) async fn list_accounts_for_impl( + &self, + account_id: &str, + ) -> OpResult> { + 
sqlx::query_as("SELECT account_id, account_name FROM accounts WHERE account_id = ?") + .bind(account_id) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("list_accounts_for: {e}"); + OpError::Internal("Database error".to_owned()) + }) + } + + pub(crate) async fn get_account_detail_impl( + &self, + account_id: &str, + ) -> OpResult> { + let acct: Option<(String,)> = + sqlx::query_as("SELECT account_name FROM accounts WHERE account_id = ?") + .bind(account_id) + .fetch_optional(self.pool()) + .await + .map_err(|e| { + tracing::error!("get_account_detail name: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + let Some((account_name,)) = acct else { + return Ok(None); + }; + + let users: Vec<(String,)> = sqlx::query_as( + "SELECT user_name FROM iam_users WHERE account_id = ? ORDER BY user_name", + ) + .bind(account_id) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("get_account_detail users: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + let groups: Vec<(String,)> = sqlx::query_as( + "SELECT group_name FROM iam_groups WHERE account_id = ? ORDER BY group_name", + ) + .bind(account_id) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("get_account_detail groups: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + let roles: Vec<(String,)> = sqlx::query_as( + "SELECT role_name FROM iam_roles WHERE account_id = ? 
ORDER BY role_name", + ) + .bind(account_id) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("get_account_detail roles: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + Ok(Some(AccountDetail { + account_name, + users: users.into_iter().map(|(n,)| n).collect(), + groups: groups.into_iter().map(|(n,)| n).collect(), + roles: roles.into_iter().map(|(n,)| n).collect(), + })) + } + + pub(crate) async fn dashboard_counts_impl(&self) -> OpResult<(i64, i64)> { + let account_count: i64 = sqlx::query_scalar("SELECT COUNT(*) FROM accounts") + .fetch_one(self.pool()) + .await + .map_err(|e| { + tracing::error!("dashboard_counts accounts: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + let admin_count: i64 = sqlx::query_scalar("SELECT COUNT(*) FROM admin_users") + .fetch_one(self.pool()) + .await + .map_err(|e| { + tracing::error!("dashboard_counts admins: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + Ok((account_count, admin_count)) + } +} diff --git a/crates/storage-tidb/src/management_store/groups.rs b/crates/storage-tidb/src/management_store/groups.rs new file mode 100755 index 0000000..afc7263 --- /dev/null +++ b/crates/storage-tidb/src/management_store/groups.rs @@ -0,0 +1,198 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! Group management operations for `TidbCatalogStore`. 
+ +use extenddb_storage::management_store::{GroupDetail, OpError, OpResult}; + +use crate::catalog_store::TidbCatalogStore; +use crate::tidb_util::{is_fk_violation, is_unique_violation}; + +impl TidbCatalogStore { + pub(crate) async fn create_group_impl( + &self, + account_id: &str, + group_name: &str, + ) -> OpResult<()> { + let group_arn = format!("arn:aws:iam::{account_id}:group/{group_name}"); + let result = sqlx::query( + "INSERT INTO iam_groups (account_id, group_name, group_arn) VALUES (?, ?, ?)", + ) + .bind(account_id) + .bind(group_name) + .bind(&group_arn) + .execute(self.pool()) + .await; + match result { + Ok(_) => Ok(()), + Err(e) if is_unique_violation(&e) => Err(OpError::AlreadyExists( + "IAM group already exists".to_owned(), + )), + Err(e) if is_fk_violation(&e) => Err(OpError::NotFound("Account not found".to_owned())), + Err(e) => { + tracing::error!("create_group failed: {e}"); + Err(OpError::Internal("Database error".to_owned())) + } + } + } + + pub(crate) async fn delete_group_impl( + &self, + account_id: &str, + group_name: &str, + ) -> OpResult<()> { + let result = sqlx::query("DELETE FROM iam_groups WHERE account_id = ? AND group_name = ?") + .bind(account_id) + .bind(group_name) + .execute(self.pool()) + .await; + match result { + Ok(r) if r.rows_affected() == 0 => { + Err(OpError::NotFound("IAM group not found".to_owned())) + } + Ok(_) => Ok(()), + Err(e) => { + tracing::error!("delete_group failed: {e}"); + Err(OpError::Internal("Database error".to_owned())) + } + } + } + + pub(crate) async fn list_groups_impl( + &self, + account_id: &str, + ) -> OpResult> { + sqlx::query_as( + "SELECT account_id, group_name, group_arn, created_at \ + FROM iam_groups WHERE account_id = ? 
ORDER BY group_name", + ) + .bind(account_id) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("list_groups: {e}"); + OpError::Internal("Database error".to_owned()) + }) + } + + pub(crate) async fn get_group_detail_impl( + &self, + account_id: &str, + group_name: &str, + ) -> OpResult> { + let exists: Option<(String,)> = sqlx::query_as( + "SELECT group_name FROM iam_groups WHERE account_id = ? AND group_name = ?", + ) + .bind(account_id) + .bind(group_name) + .fetch_optional(self.pool()) + .await + .map_err(|e| { + tracing::error!("get_group_detail exists: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + if exists.is_none() { + return Ok(None); + } + + let members: Vec<(String,)> = sqlx::query_as( + "SELECT user_name FROM iam_group_members \ + WHERE account_id = ? AND group_name = ? ORDER BY user_name", + ) + .bind(account_id) + .bind(group_name) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("get_group_detail members: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + let policies: Vec<(String,)> = sqlx::query_as( + "SELECT policy_name FROM iam_policies \ + WHERE account_id = ? AND principal_type = 'group' AND principal_name = ? \ + ORDER BY policy_name", + ) + .bind(account_id) + .bind(group_name) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("get_group_detail policies: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + let all_users: Vec<(String,)> = sqlx::query_as( + "SELECT user_name FROM iam_users WHERE account_id = ? 
ORDER BY user_name", + ) + .bind(account_id) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("get_group_detail all_users: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + Ok(Some(GroupDetail { + members: members.into_iter().map(|(n,)| n).collect(), + policies: policies.into_iter().map(|(n,)| n).collect(), + all_users: all_users.into_iter().map(|(n,)| n).collect(), + })) + } + + pub(crate) async fn add_group_member_impl( + &self, + account_id: &str, + group_name: &str, + user_name: &str, + ) -> OpResult<()> { + let result = sqlx::query( + "INSERT INTO iam_group_members (account_id, group_name, user_name) VALUES (?, ?, ?)", + ) + .bind(account_id) + .bind(group_name) + .bind(user_name) + .execute(self.pool()) + .await; + match result { + Ok(_) => Ok(()), + Err(e) if is_unique_violation(&e) => Err(OpError::AlreadyExists( + "User is already a member of this group".to_owned(), + )), + Err(e) if is_fk_violation(&e) => { + Err(OpError::NotFound("Group or user not found".to_owned())) + } + Err(e) => { + tracing::error!("add_group_member failed: {e}"); + Err(OpError::Internal("Database error".to_owned())) + } + } + } + + pub(crate) async fn remove_group_member_impl( + &self, + account_id: &str, + group_name: &str, + user_name: &str, + ) -> OpResult<()> { + let result = sqlx::query( + "DELETE FROM iam_group_members WHERE account_id = ? AND group_name = ? 
AND user_name = ?", + ) + .bind(account_id) + .bind(group_name) + .bind(user_name) + .execute(self.pool()) + .await; + match result { + Ok(r) if r.rows_affected() == 0 => { + Err(OpError::NotFound("Membership not found".to_owned())) + } + Ok(_) => Ok(()), + Err(e) => { + tracing::error!("remove_group_member failed: {e}"); + Err(OpError::Internal("Database error".to_owned())) + } + } + } +} diff --git a/crates/storage-tidb/src/management_store/mod.rs b/crates/storage-tidb/src/management_store/mod.rs new file mode 100755 index 0000000..0dfb0c7 --- /dev/null +++ b/crates/storage-tidb/src/management_store/mod.rs @@ -0,0 +1,586 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! `ManagementStore` implementation for `TidbCatalogStore`. +//! +//! The trait impl delegates to `_impl` methods in submodules, keeping each +//! file under the 500-line limit. + +use extenddb_storage::management_store::{ + AccessKeyCreated, AccountDetail, GroupDetail, OpResult, RoleDetail, UserDetail, +}; +use futures::future::BoxFuture; + +use super::catalog_store::TidbCatalogStore; + +mod access_keys; +mod accounts; +mod groups; +mod policies; +mod roles; +mod users; + +impl extenddb_storage::management_store::ManagementStore for TidbCatalogStore { + // ── Accounts ─────────────────────────────────────────────────── + + fn create_account(&self, account_id: &str, account_name: &str) -> BoxFuture<'_, OpResult<()>> { + let account_id = account_id.to_string(); + let account_name = account_name.to_string(); + Box::pin(async move { self.create_account_impl(&account_id, &account_name).await }) + } + + fn delete_account(&self, account_id: &str) -> BoxFuture<'_, OpResult<()>> { + let account_id = account_id.to_string(); + Box::pin(async move { self.delete_account_impl(&account_id).await }) + } + + fn list_all_accounts(&self) -> BoxFuture<'_, OpResult>> { + Box::pin(async move { self.list_all_accounts_impl().await }) + } + + fn list_all_accounts_full( + &self, + ) 
-> BoxFuture<'_, OpResult>> { + Box::pin(async move { self.list_all_accounts_full_impl().await }) + } + + fn list_accounts_for( + &self, + account_id: &str, + ) -> BoxFuture<'_, OpResult>> { + let account_id = account_id.to_string(); + Box::pin(async move { self.list_accounts_for_impl(&account_id).await }) + } + + fn get_account_detail( + &self, + account_id: &str, + ) -> BoxFuture<'_, OpResult>> { + let account_id = account_id.to_string(); + Box::pin(async move { self.get_account_detail_impl(&account_id).await }) + } + + fn dashboard_counts(&self) -> BoxFuture<'_, OpResult<(i64, i64)>> { + Box::pin(async move { self.dashboard_counts_impl().await }) + } + + // ── Users ────────────────────────────────────────────────────── + + fn create_user( + &self, + account_id: &str, + user_name: &str, + password_hash: Option<&str>, + ) -> BoxFuture<'_, OpResult<()>> { + let account_id = account_id.to_string(); + let user_name = user_name.to_string(); + let password_hash = password_hash.map(|s| s.to_string()); + Box::pin(async move { + self.create_user_impl(&account_id, &user_name, password_hash.as_deref()) + .await + }) + } + + fn delete_user(&self, account_id: &str, user_name: &str) -> BoxFuture<'_, OpResult<()>> { + let account_id = account_id.to_string(); + let user_name = user_name.to_string(); + Box::pin(async move { self.delete_user_impl(&account_id, &user_name).await }) + } + + fn list_users( + &self, + account_id: &str, + ) -> BoxFuture<'_, OpResult>> { + let account_id = account_id.to_string(); + Box::pin(async move { self.list_users_impl(&account_id).await }) + } + + fn get_user_detail( + &self, + account_id: &str, + user_name: &str, + ) -> BoxFuture<'_, OpResult>> { + let account_id = account_id.to_string(); + let user_name = user_name.to_string(); + Box::pin(async move { self.get_user_detail_impl(&account_id, &user_name).await }) + } + + fn verify_iam_user_password( + &self, + account_id: &str, + user_name: &str, + password: &str, + ) -> BoxFuture<'_, OpResult> { + 
let account_id = account_id.to_string(); + let user_name = user_name.to_string(); + let password = password.to_string(); + Box::pin(async move { + self.verify_iam_user_password_impl(&account_id, &user_name, &password) + .await + }) + } + + fn change_user_password( + &self, + account_id: &str, + user_name: &str, + password_hash: &str, + ) -> BoxFuture<'_, OpResult<()>> { + let account_id = account_id.to_string(); + let user_name = user_name.to_string(); + let password_hash = password_hash.to_string(); + Box::pin(async move { + self.change_user_password_impl(&account_id, &user_name, &password_hash) + .await + }) + } + + fn tag_user( + &self, + account_id: &str, + user_name: &str, + tags: &[(String, String)], + ) -> BoxFuture<'_, OpResult<()>> { + let account_id = account_id.to_string(); + let user_name = user_name.to_string(); + let tags = tags.to_vec(); + Box::pin(async move { self.tag_user_impl(&account_id, &user_name, &tags).await }) + } + + fn untag_user( + &self, + account_id: &str, + user_name: &str, + tag_keys: &[String], + ) -> BoxFuture<'_, OpResult<()>> { + let account_id = account_id.to_string(); + let user_name = user_name.to_string(); + let tag_keys = tag_keys.to_vec(); + Box::pin(async move { + self.untag_user_impl(&account_id, &user_name, &tag_keys) + .await + }) + } + + fn list_user_tags( + &self, + account_id: &str, + user_name: &str, + ) -> BoxFuture<'_, OpResult>> { + let account_id = account_id.to_string(); + let user_name = user_name.to_string(); + Box::pin(async move { self.list_user_tags_impl(&account_id, &user_name).await }) + } + + // ── Groups ───────────────────────────────────────────────────── + + fn create_group(&self, account_id: &str, group_name: &str) -> BoxFuture<'_, OpResult<()>> { + let account_id = account_id.to_string(); + let group_name = group_name.to_string(); + Box::pin(async move { self.create_group_impl(&account_id, &group_name).await }) + } + + fn delete_group(&self, account_id: &str, group_name: &str) -> BoxFuture<'_, 
OpResult<()>> { + let account_id = account_id.to_string(); + let group_name = group_name.to_string(); + Box::pin(async move { self.delete_group_impl(&account_id, &group_name).await }) + } + + fn list_groups( + &self, + account_id: &str, + ) -> BoxFuture<'_, OpResult>> { + let account_id = account_id.to_string(); + Box::pin(async move { self.list_groups_impl(&account_id).await }) + } + + fn get_group_detail( + &self, + account_id: &str, + group_name: &str, + ) -> BoxFuture<'_, OpResult>> { + let account_id = account_id.to_string(); + let group_name = group_name.to_string(); + Box::pin(async move { self.get_group_detail_impl(&account_id, &group_name).await }) + } + + fn add_group_member( + &self, + account_id: &str, + group_name: &str, + user_name: &str, + ) -> BoxFuture<'_, OpResult<()>> { + let account_id = account_id.to_string(); + let group_name = group_name.to_string(); + let user_name = user_name.to_string(); + Box::pin(async move { + self.add_group_member_impl(&account_id, &group_name, &user_name) + .await + }) + } + + fn remove_group_member( + &self, + account_id: &str, + group_name: &str, + user_name: &str, + ) -> BoxFuture<'_, OpResult<()>> { + let account_id = account_id.to_string(); + let group_name = group_name.to_string(); + let user_name = user_name.to_string(); + Box::pin(async move { + self.remove_group_member_impl(&account_id, &group_name, &user_name) + .await + }) + } + + // ── Roles ────────────────────────────────────────────────────── + + fn create_role( + &self, + account_id: &str, + role_name: &str, + trust_policy: &serde_json::Value, + ) -> BoxFuture<'_, OpResult<()>> { + let account_id = account_id.to_string(); + let role_name = role_name.to_string(); + let trust_policy = trust_policy.clone(); + Box::pin(async move { + self.create_role_impl(&account_id, &role_name, &trust_policy) + .await + }) + } + + fn delete_role(&self, account_id: &str, role_name: &str) -> BoxFuture<'_, OpResult<()>> { + let account_id = account_id.to_string(); + let 
role_name = role_name.to_string(); + Box::pin(async move { self.delete_role_impl(&account_id, &role_name).await }) + } + + fn list_roles( + &self, + account_id: &str, + ) -> BoxFuture< + '_, + OpResult< + Vec<( + String, + String, + String, + serde_json::Value, + time::OffsetDateTime, + )>, + >, + > { + let account_id = account_id.to_string(); + Box::pin(async move { self.list_roles_impl(&account_id).await }) + } + + fn get_role_detail( + &self, + account_id: &str, + role_name: &str, + ) -> BoxFuture<'_, OpResult>> { + let account_id = account_id.to_string(); + let role_name = role_name.to_string(); + Box::pin(async move { self.get_role_detail_impl(&account_id, &role_name).await }) + } + + fn get_role_trust_policy( + &self, + account_id: &str, + role_name: &str, + ) -> BoxFuture<'_, OpResult>> { + let account_id = account_id.to_string(); + let role_name = role_name.to_string(); + Box::pin(async move { + self.get_role_trust_policy_impl(&account_id, &role_name) + .await + }) + } + + fn tag_role( + &self, + account_id: &str, + role_name: &str, + tags: &[(String, String)], + ) -> BoxFuture<'_, OpResult<()>> { + let account_id = account_id.to_string(); + let role_name = role_name.to_string(); + let tags = tags.to_vec(); + Box::pin(async move { self.tag_role_impl(&account_id, &role_name, &tags).await }) + } + + fn untag_role( + &self, + account_id: &str, + role_name: &str, + tag_keys: &[String], + ) -> BoxFuture<'_, OpResult<()>> { + let account_id = account_id.to_string(); + let role_name = role_name.to_string(); + let tag_keys = tag_keys.to_vec(); + Box::pin(async move { + self.untag_role_impl(&account_id, &role_name, &tag_keys) + .await + }) + } + + fn list_role_tags( + &self, + account_id: &str, + role_name: &str, + ) -> BoxFuture<'_, OpResult>> { + let account_id = account_id.to_string(); + let role_name = role_name.to_string(); + Box::pin(async move { self.list_role_tags_impl(&account_id, &role_name).await }) + } + + // ── Policies 
─────────────────────────────────────────────────── + + fn put_policy( + &self, + account_id: &str, + principal_type: &str, + principal_name: &str, + policy_name: &str, + document: &serde_json::Value, + ) -> BoxFuture<'_, OpResult<()>> { + let account_id = account_id.to_string(); + let principal_type = principal_type.to_string(); + let principal_name = principal_name.to_string(); + let policy_name = policy_name.to_string(); + let document = document.clone(); + Box::pin(async move { + self.put_policy_impl( + &account_id, + &principal_type, + &principal_name, + &policy_name, + &document, + ) + .await + }) + } + + fn delete_policy( + &self, + account_id: &str, + principal_type: &str, + principal_name: &str, + policy_name: &str, + ) -> BoxFuture<'_, OpResult<()>> { + let account_id = account_id.to_string(); + let principal_type = principal_type.to_string(); + let principal_name = principal_name.to_string(); + let policy_name = policy_name.to_string(); + Box::pin(async move { + self.delete_policy_impl(&account_id, &principal_type, &principal_name, &policy_name) + .await + }) + } + + fn list_policies( + &self, + account_id: &str, + principal_type: &str, + principal_name: &str, + ) -> BoxFuture<'_, OpResult>> { + let account_id = account_id.to_string(); + let principal_type = principal_type.to_string(); + let principal_name = principal_name.to_string(); + Box::pin(async move { + self.list_policies_impl(&account_id, &principal_type, &principal_name) + .await + }) + } + + // ── Permissions boundaries ───────────────────────────────────── + + fn set_user_boundary( + &self, + account_id: &str, + user_name: &str, + document: &serde_json::Value, + ) -> BoxFuture<'_, OpResult<()>> { + let account_id = account_id.to_string(); + let user_name = user_name.to_string(); + let document = document.clone(); + Box::pin(async move { + self.set_boundary_impl(&account_id, "user", &user_name, &document) + .await + }) + } + + fn get_user_boundary( + &self, + account_id: &str, + user_name: 
&str, + ) -> BoxFuture<'_, OpResult>> { + let account_id = account_id.to_string(); + let user_name = user_name.to_string(); + Box::pin(async move { + self.get_boundary_impl(&account_id, "user", &user_name) + .await + }) + } + + fn delete_user_boundary( + &self, + account_id: &str, + user_name: &str, + ) -> BoxFuture<'_, OpResult<()>> { + let account_id = account_id.to_string(); + let user_name = user_name.to_string(); + Box::pin(async move { + self.delete_boundary_impl(&account_id, "user", &user_name) + .await + }) + } + + fn set_role_boundary( + &self, + account_id: &str, + role_name: &str, + document: &serde_json::Value, + ) -> BoxFuture<'_, OpResult<()>> { + let account_id = account_id.to_string(); + let role_name = role_name.to_string(); + let document = document.clone(); + Box::pin(async move { + self.set_boundary_impl(&account_id, "role", &role_name, &document) + .await + }) + } + + fn get_role_boundary( + &self, + account_id: &str, + role_name: &str, + ) -> BoxFuture<'_, OpResult>> { + let account_id = account_id.to_string(); + let role_name = role_name.to_string(); + Box::pin(async move { + self.get_boundary_impl(&account_id, "role", &role_name) + .await + }) + } + + fn delete_role_boundary( + &self, + account_id: &str, + role_name: &str, + ) -> BoxFuture<'_, OpResult<()>> { + let account_id = account_id.to_string(); + let role_name = role_name.to_string(); + Box::pin(async move { + self.delete_boundary_impl(&account_id, "role", &role_name) + .await + }) + } + + // ── Access keys ──────────────────────────────────────────────── + + fn create_access_key( + &self, + account_id: &str, + user_name: &str, + ) -> BoxFuture<'_, OpResult> { + let account_id = account_id.to_string(); + let user_name = user_name.to_string(); + Box::pin(async move { self.create_access_key_impl(&account_id, &user_name).await }) + } + + fn delete_access_key( + &self, + account_id: &str, + user_name: &str, + key_id: &str, + ) -> BoxFuture<'_, OpResult<()>> { + let account_id = 
account_id.to_string(); + let user_name = user_name.to_string(); + let key_id = key_id.to_string(); + Box::pin(async move { + self.delete_access_key_impl(&account_id, &user_name, &key_id) + .await + }) + } + + fn list_access_keys( + &self, + account_id: &str, + user_name: &str, + ) -> BoxFuture<'_, OpResult>> { + let account_id = account_id.to_string(); + let user_name = user_name.to_string(); + Box::pin(async move { self.list_access_keys_impl(&account_id, &user_name).await }) + } + + fn import_access_key( + &self, + account_id: &str, + user_name: &str, + access_key_id: &str, + secret_access_key: &str, + ) -> BoxFuture<'_, OpResult<()>> { + let account_id = account_id.to_string(); + let user_name = user_name.to_string(); + let access_key_id = access_key_id.to_string(); + let secret_access_key = secret_access_key.to_string(); + Box::pin(async move { + self.import_access_key_impl(&account_id, &user_name, &access_key_id, &secret_access_key) + .await + }) + } + + // ── Sessions ─────────────────────────────────────────────────── + + #[allow(clippy::too_many_arguments)] + fn store_session( + &self, + session_token: &str, + access_key_id: &str, + secret_key_encrypted: &[u8], + account_id: &str, + role_name: &str, + session_name: &str, + session_tags: &Option, + session_policy: &Option, + expires_at: time::OffsetDateTime, + ) -> BoxFuture<'_, OpResult<()>> { + let session_token = session_token.to_string(); + let access_key_id = access_key_id.to_string(); + let secret_key_encrypted = secret_key_encrypted.to_vec(); + let account_id = account_id.to_string(); + let role_name = role_name.to_string(); + let session_name = session_name.to_string(); + let session_tags = session_tags.clone(); + let session_policy = session_policy.clone(); + Box::pin(async move { + self.store_session_impl( + &session_token, + &access_key_id, + &secret_key_encrypted, + &account_id, + &role_name, + &session_name, + &session_tags, + &session_policy, + expires_at, + ) + .await + }) + } + + // ── Caller 
tags ──────────────────────────────────────────────── + + fn fetch_caller_tags( + &self, + account_id: &str, + resource: &str, + ) -> BoxFuture<'_, OpResult>> { + let account_id = account_id.to_string(); + let resource = resource.to_string(); + Box::pin(async move { self.fetch_caller_tags_impl(&account_id, &resource).await }) + } +} diff --git a/crates/storage-tidb/src/management_store/policies.rs b/crates/storage-tidb/src/management_store/policies.rs new file mode 100755 index 0000000..cde0ed7 --- /dev/null +++ b/crates/storage-tidb/src/management_store/policies.rs @@ -0,0 +1,173 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! Policy and permissions-boundary operations for `TidbCatalogStore`. + +use extenddb_storage::management_store::{OpError, OpResult}; + +use crate::catalog_store::TidbCatalogStore; +use crate::tidb_util::is_fk_violation; + +impl TidbCatalogStore { + // ── Policies ─────────────────────────────────────────────────── + + pub(crate) async fn put_policy_impl( + &self, + account_id: &str, + principal_type: &str, + principal_name: &str, + policy_name: &str, + document: &serde_json::Value, + ) -> OpResult<()> { + let result = sqlx::query( + "INSERT INTO iam_policies (account_id, principal_type, principal_name, policy_name, policy_document) \ + VALUES (?, ?, ?, ?, ?) 
\ + ON DUPLICATE KEY UPDATE policy_document = VALUES(policy_document)", + ) + .bind(account_id) + .bind(principal_type) + .bind(principal_name) + .bind(policy_name) + .bind(document) + .execute(self.pool()) + .await; + match result { + Ok(_) => Ok(()), + Err(e) if is_fk_violation(&e) => Err(OpError::NotFound("Account not found".to_owned())), + Err(e) => { + tracing::error!("put_policy failed: {e}"); + Err(OpError::Internal("Database error".to_owned())) + } + } + } + + pub(crate) async fn delete_policy_impl( + &self, + account_id: &str, + principal_type: &str, + principal_name: &str, + policy_name: &str, + ) -> OpResult<()> { + let result = sqlx::query( + "DELETE FROM iam_policies \ + WHERE account_id = ? AND principal_type = ? AND principal_name = ? AND policy_name = ?", + ) + .bind(account_id) + .bind(principal_type) + .bind(principal_name) + .bind(policy_name) + .execute(self.pool()) + .await; + match result { + Ok(r) if r.rows_affected() == 0 => { + Err(OpError::NotFound("Policy not found".to_owned())) + } + Ok(_) => Ok(()), + Err(e) => { + tracing::error!("delete_policy failed: {e}"); + Err(OpError::Internal("Database error".to_owned())) + } + } + } + + pub(crate) async fn list_policies_impl( + &self, + account_id: &str, + principal_type: &str, + principal_name: &str, + ) -> OpResult> { + sqlx::query_as( + "SELECT policy_name, policy_document, created_at FROM iam_policies \ + WHERE account_id = ? AND principal_type = ? AND principal_name = ? 
\ + ORDER BY policy_name", + ) + .bind(account_id) + .bind(principal_type) + .bind(principal_name) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("list_policies: {e}"); + OpError::Internal("Database error".to_owned()) + }) + } + + // ── Permissions boundaries ───────────────────────────────────── + + pub(crate) async fn set_boundary_impl( + &self, + account_id: &str, + principal_type: &str, + principal_name: &str, + document: &serde_json::Value, + ) -> OpResult<()> { + let result = sqlx::query( + "INSERT INTO iam_permissions_boundaries (account_id, principal_type, principal_name, policy_document) \ + VALUES (?, ?, ?, ?) \ + ON DUPLICATE KEY UPDATE policy_document = VALUES(policy_document)", + ) + .bind(account_id) + .bind(principal_type) + .bind(principal_name) + .bind(document) + .execute(self.pool()) + .await; + match result { + Ok(_) => Ok(()), + Err(e) if is_fk_violation(&e) => Err(OpError::NotFound("Account not found".to_owned())), + Err(e) => { + tracing::error!("set_boundary failed: {e}"); + Err(OpError::Internal("Database error".to_owned())) + } + } + } + + pub(crate) async fn get_boundary_impl( + &self, + account_id: &str, + principal_type: &str, + principal_name: &str, + ) -> OpResult> { + let row: Option<(serde_json::Value,)> = sqlx::query_as( + "SELECT policy_document FROM iam_permissions_boundaries \ + WHERE account_id = ? AND principal_type = ? AND principal_name = ?", + ) + .bind(account_id) + .bind(principal_type) + .bind(principal_name) + .fetch_optional(self.pool()) + .await + .map_err(|e| { + tracing::error!("get_boundary: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + Ok(row.map(|(doc,)| doc)) + } + + pub(crate) async fn delete_boundary_impl( + &self, + account_id: &str, + principal_type: &str, + principal_name: &str, + ) -> OpResult<()> { + let result = sqlx::query( + "DELETE FROM iam_permissions_boundaries \ + WHERE account_id = ? AND principal_type = ? 
AND principal_name = ?", + ) + .bind(account_id) + .bind(principal_type) + .bind(principal_name) + .execute(self.pool()) + .await; + match result { + Ok(r) if r.rows_affected() == 0 => { + Err(OpError::NotFound("Permissions boundary not set".to_owned())) + } + Ok(_) => Ok(()), + Err(e) => { + tracing::error!("delete_boundary failed: {e}"); + Err(OpError::Internal("Database error".to_owned())) + } + } + } +} diff --git a/crates/storage-tidb/src/management_store/roles.rs b/crates/storage-tidb/src/management_store/roles.rs new file mode 100755 index 0000000..6ff2b3a --- /dev/null +++ b/crates/storage-tidb/src/management_store/roles.rs @@ -0,0 +1,249 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! Role management operations for `TidbCatalogStore`. + +use extenddb_storage::management_store::{OpError, OpResult, RoleDetail}; + +use crate::catalog_store::TidbCatalogStore; +use crate::tidb_util::{is_fk_violation, is_unique_violation}; + +impl TidbCatalogStore { + pub(crate) async fn create_role_impl( + &self, + account_id: &str, + role_name: &str, + trust_policy: &serde_json::Value, + ) -> OpResult<()> { + let role_arn = format!("arn:aws:iam::{account_id}:role/{role_name}"); + let result = sqlx::query( + "INSERT INTO iam_roles (account_id, role_name, role_arn, trust_policy) \ + VALUES (?, ?, ?, ?)", + ) + .bind(account_id) + .bind(role_name) + .bind(&role_arn) + .bind(trust_policy) + .execute(self.pool()) + .await; + match result { + Ok(_) => Ok(()), + Err(e) if is_unique_violation(&e) => { + Err(OpError::AlreadyExists("IAM role already exists".to_owned())) + } + Err(e) if is_fk_violation(&e) => Err(OpError::NotFound("Account not found".to_owned())), + Err(e) => { + tracing::error!("create_role failed: {e}"); + Err(OpError::Internal("Database error".to_owned())) + } + } + } + + pub(crate) async fn delete_role_impl(&self, account_id: &str, role_name: &str) -> OpResult<()> { + let result = sqlx::query("DELETE FROM iam_roles WHERE 
account_id = ? AND role_name = ?") + .bind(account_id) + .bind(role_name) + .execute(self.pool()) + .await; + match result { + Ok(r) if r.rows_affected() == 0 => { + Err(OpError::NotFound("IAM role not found".to_owned())) + } + Ok(_) => Ok(()), + Err(e) => { + tracing::error!("delete_role failed: {e}"); + Err(OpError::Internal("Database error".to_owned())) + } + } + } + + pub(crate) async fn list_roles_impl( + &self, + account_id: &str, + ) -> OpResult< + Vec<( + String, + String, + String, + serde_json::Value, + time::OffsetDateTime, + )>, + > { + sqlx::query_as( + "SELECT account_id, role_name, role_arn, trust_policy, created_at \ + FROM iam_roles WHERE account_id = ? ORDER BY role_name", + ) + .bind(account_id) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("list_roles: {e}"); + OpError::Internal("Database error".to_owned()) + }) + } + + pub(crate) async fn get_role_detail_impl( + &self, + account_id: &str, + role_name: &str, + ) -> OpResult> { + let role: Option<(String, serde_json::Value)> = sqlx::query_as( + "SELECT role_name, trust_policy FROM iam_roles \ + WHERE account_id = ? AND role_name = ?", + ) + .bind(account_id) + .bind(role_name) + .fetch_optional(self.pool()) + .await + .map_err(|e| { + tracing::error!("get_role_detail role: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + let Some((_, trust_policy)) = role else { + return Ok(None); + }; + + let policies: Vec<(String,)> = sqlx::query_as( + "SELECT policy_name FROM iam_policies \ + WHERE account_id = ? AND principal_type = 'role' AND principal_name = ? \ + ORDER BY policy_name", + ) + .bind(account_id) + .bind(role_name) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("get_role_detail policies: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + let tags: Vec<(String, String)> = sqlx::query_as( + "SELECT tag_key, tag_value FROM iam_role_tags \ + WHERE account_id = ? AND role_name = ? 
ORDER BY tag_key", + ) + .bind(account_id) + .bind(role_name) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("get_role_detail tags: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + Ok(Some(RoleDetail { + trust_policy, + policies: policies.into_iter().map(|(n,)| n).collect(), + tags, + })) + } + + pub(crate) async fn get_role_trust_policy_impl( + &self, + account_id: &str, + role_name: &str, + ) -> OpResult> { + let row: Option<(serde_json::Value,)> = sqlx::query_as( + "SELECT trust_policy FROM iam_roles WHERE account_id = ? AND role_name = ?", + ) + .bind(account_id) + .bind(role_name) + .fetch_optional(self.pool()) + .await + .map_err(|e| { + tracing::error!("get_role_trust_policy: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + Ok(row.map(|(tp,)| tp)) + } + + // ── Role tags ────────────────────────────────────────────────── + + pub(crate) async fn tag_role_impl( + &self, + account_id: &str, + role_name: &str, + tags: &[(String, String)], + ) -> OpResult<()> { + let mut tx = self.pool().begin().await.map_err(|e| { + tracing::error!("tag_role begin: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + for (key, value) in tags { + let result = sqlx::query( + "INSERT INTO iam_role_tags (account_id, role_name, tag_key, tag_value) \ + VALUES (?, ?, ?, ?) 
\ + ON DUPLICATE KEY UPDATE tag_value = VALUES(tag_value)", + ) + .bind(account_id) + .bind(role_name) + .bind(key) + .bind(value) + .execute(&mut *tx) + .await; + match result { + Ok(_) => {} + Err(e) if is_fk_violation(&e) => { + return Err(OpError::NotFound("IAM role not found".to_owned())); + } + Err(e) => { + tracing::error!("tag_role failed: {e}"); + return Err(OpError::Internal("Database error".to_owned())); + } + } + } + tx.commit().await.map_err(|e| { + tracing::error!("tag_role commit: {e}"); + OpError::Internal("Database error".to_owned()) + }) + } + + pub(crate) async fn untag_role_impl( + &self, + account_id: &str, + role_name: &str, + tag_keys: &[String], + ) -> OpResult<()> { + let mut tx = self.pool().begin().await.map_err(|e| { + tracing::error!("untag_role begin: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + for key in tag_keys { + sqlx::query( + "DELETE FROM iam_role_tags WHERE account_id = ? AND role_name = ? AND tag_key = ?", + ) + .bind(account_id) + .bind(role_name) + .bind(key) + .execute(&mut *tx) + .await + .map_err(|e| { + tracing::error!("untag_role failed: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + } + tx.commit().await.map_err(|e| { + tracing::error!("untag_role commit: {e}"); + OpError::Internal("Database error".to_owned()) + }) + } + + pub(crate) async fn list_role_tags_impl( + &self, + account_id: &str, + role_name: &str, + ) -> OpResult> { + sqlx::query_as( + "SELECT tag_key, tag_value FROM iam_role_tags \ + WHERE account_id = ? AND role_name = ? 
ORDER BY tag_key", + ) + .bind(account_id) + .bind(role_name) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("list_role_tags: {e}"); + OpError::Internal("Database error".to_owned()) + }) + } +} diff --git a/crates/storage-tidb/src/management_store/users.rs b/crates/storage-tidb/src/management_store/users.rs new file mode 100755 index 0000000..f17e30b --- /dev/null +++ b/crates/storage-tidb/src/management_store/users.rs @@ -0,0 +1,355 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! User management operations for `TidbCatalogStore`. + +use extenddb_storage::management_store::{OpError, OpResult, UserDetail}; + +use crate::catalog_store::TidbCatalogStore; +use crate::tidb_util::{is_fk_violation, is_unique_violation}; + +impl TidbCatalogStore { + pub(crate) async fn create_user_impl( + &self, + account_id: &str, + user_name: &str, + password_hash: Option<&str>, + ) -> OpResult<()> { + let user_arn = format!("arn:aws:iam::{account_id}:user/{user_name}"); + + let mut tx = self.pool().begin().await.map_err(|e| { + tracing::error!("create_user begin transaction: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + let result = sqlx::query( + "INSERT INTO iam_users (account_id, user_name, user_arn, password_hash) \ + VALUES (?, ?, ?, ?)", + ) + .bind(account_id) + .bind(user_name) + .bind(&user_arn) + .bind(password_hash) + .execute(&mut *tx) + .await; + + match result { + Ok(_) => {} + Err(e) if is_unique_violation(&e) => { + return Err(OpError::AlreadyExists("IAM user already exists".to_owned())); + } + Err(e) if is_fk_violation(&e) => { + return Err(OpError::NotFound("Account not found".to_owned())); + } + Err(e) => { + tracing::error!("create_user failed: {e}"); + return Err(OpError::Internal("Database error".to_owned())); + } + } + + // Seed default self-service policy. 
+ let self_service_policy = serde_json::json!({ + "Version": "2012-10-17", + "Statement": [{ + "Effect": "Allow", + "Action": [ + "iam:CreateAccessKey", + "iam:DeleteAccessKey", + "iam:ListAccessKeys", + "iam:ChangePassword" + ], + "Resource": format!("arn:aws:iam::{account_id}:user/{user_name}") + }] + }); + + if let Err(e) = sqlx::query( + "INSERT INTO iam_policies (account_id, principal_type, principal_name, policy_name, policy_document) \ + VALUES (?, 'user', ?, 'SelfServicePolicy', ?) ON DUPLICATE KEY UPDATE policy_name = policy_name", + ) + .bind(account_id) + .bind(user_name) + .bind(&self_service_policy) + .execute(&mut *tx) + .await + { + tracing::error!("seed self-service policy failed: {e}"); + return Err(OpError::Internal("Database error".to_owned())); + } + + tx.commit().await.map_err(|e| { + tracing::error!("create_user commit: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + Ok(()) + } + + pub(crate) async fn delete_user_impl(&self, account_id: &str, user_name: &str) -> OpResult<()> { + let result = sqlx::query("DELETE FROM iam_users WHERE account_id = ? AND user_name = ?") + .bind(account_id) + .bind(user_name) + .execute(self.pool()) + .await; + match result { + Ok(r) if r.rows_affected() == 0 => { + Err(OpError::NotFound("IAM user not found".to_owned())) + } + Ok(_) => Ok(()), + Err(e) => { + tracing::error!("delete_user failed: {e}"); + Err(OpError::Internal("Database error".to_owned())) + } + } + } + + pub(crate) async fn list_users_impl( + &self, + account_id: &str, + ) -> OpResult> { + let rows: Vec<(String, String, String, Option, time::OffsetDateTime)> = + sqlx::query_as( + "SELECT account_id, user_name, user_arn, password_hash, created_at \ + FROM iam_users WHERE account_id = ? 
ORDER BY user_name", + ) + .bind(account_id) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("list_users: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + Ok(rows + .into_iter() + .map(|(aid, un, arn, pw, ca)| (aid, un, arn, pw.is_some(), ca)) + .collect()) + } + + pub(crate) async fn get_user_detail_impl( + &self, + account_id: &str, + user_name: &str, + ) -> OpResult> { + let exists: Option<(String,)> = sqlx::query_as( + "SELECT user_name FROM iam_users WHERE account_id = ? AND user_name = ?", + ) + .bind(account_id) + .bind(user_name) + .fetch_optional(self.pool()) + .await + .map_err(|e| { + tracing::error!("get_user_detail exists: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + if exists.is_none() { + return Ok(None); + } + + let keys: Vec<(String, bool)> = sqlx::query_as( + "SELECT access_key_id, is_active FROM access_keys \ + WHERE account_id = ? AND user_name = ? ORDER BY access_key_id", + ) + .bind(account_id) + .bind(user_name) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("get_user_detail keys: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + let policies: Vec<(String,)> = sqlx::query_as( + "SELECT policy_name FROM iam_policies \ + WHERE account_id = ? AND principal_type = 'user' AND principal_name = ? \ + ORDER BY policy_name", + ) + .bind(account_id) + .bind(user_name) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("get_user_detail policies: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + let tags: Vec<(String, String)> = sqlx::query_as( + "SELECT tag_key, tag_value FROM iam_user_tags \ + WHERE account_id = ? AND user_name = ? 
ORDER BY tag_key", + ) + .bind(account_id) + .bind(user_name) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("get_user_detail tags: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + let groups: Vec<(String,)> = sqlx::query_as( + "SELECT group_name FROM iam_group_members \ + WHERE account_id = ? AND user_name = ? ORDER BY group_name", + ) + .bind(account_id) + .bind(user_name) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("get_user_detail groups: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + Ok(Some(UserDetail { + keys, + policies: policies.into_iter().map(|(n,)| n).collect(), + tags, + groups: groups.into_iter().map(|(n,)| n).collect(), + })) + } + + pub(crate) async fn verify_iam_user_password_impl( + &self, + account_id: &str, + user_name: &str, + password: &str, + ) -> OpResult { + let row: Option<(String,)> = sqlx::query_as( + "SELECT password_hash FROM iam_users \ + WHERE account_id = ? AND user_name = ? AND password_hash IS NOT NULL", + ) + .bind(account_id) + .bind(user_name) + .fetch_optional(self.pool()) + .await + .map_err(|e| { + tracing::error!("verify_iam_user_password: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + + let Some((hash,)) = row else { + return Ok(false); + }; + + let pw = password.to_owned(); + Ok( + tokio::task::spawn_blocking(move || bcrypt::verify(pw, &hash).unwrap_or(false)) + .await + .unwrap_or(false), + ) + } + + pub(crate) async fn change_user_password_impl( + &self, + account_id: &str, + user_name: &str, + password_hash: &str, + ) -> OpResult<()> { + let result = sqlx::query( + "UPDATE iam_users SET password_hash = ? WHERE account_id = ? 
AND user_name = ?", + ) + .bind(password_hash) + .bind(account_id) + .bind(user_name) + .execute(self.pool()) + .await; + match result { + Ok(r) if r.rows_affected() == 0 => { + Err(OpError::NotFound("IAM user not found".to_owned())) + } + Ok(_) => Ok(()), + Err(e) => { + tracing::error!("change_user_password failed: {e}"); + Err(OpError::Internal("Database error".to_owned())) + } + } + } + + // ── User tags ────────────────────────────────────────────────── + + pub(crate) async fn tag_user_impl( + &self, + account_id: &str, + user_name: &str, + tags: &[(String, String)], + ) -> OpResult<()> { + let mut tx = self.pool().begin().await.map_err(|e| { + tracing::error!("tag_user begin: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + for (key, value) in tags { + let result = sqlx::query( + "INSERT INTO iam_user_tags (account_id, user_name, tag_key, tag_value) \ + VALUES (?, ?, ?, ?) \ + ON DUPLICATE KEY UPDATE tag_value = VALUES(tag_value)", + ) + .bind(account_id) + .bind(user_name) + .bind(key) + .bind(value) + .execute(&mut *tx) + .await; + match result { + Ok(_) => {} + Err(e) if is_fk_violation(&e) => { + return Err(OpError::NotFound("IAM user not found".to_owned())); + } + Err(e) => { + tracing::error!("tag_user failed: {e}"); + return Err(OpError::Internal("Database error".to_owned())); + } + } + } + tx.commit().await.map_err(|e| { + tracing::error!("tag_user commit: {e}"); + OpError::Internal("Database error".to_owned()) + }) + } + + pub(crate) async fn untag_user_impl( + &self, + account_id: &str, + user_name: &str, + tag_keys: &[String], + ) -> OpResult<()> { + let mut tx = self.pool().begin().await.map_err(|e| { + tracing::error!("untag_user begin: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + for key in tag_keys { + sqlx::query( + "DELETE FROM iam_user_tags WHERE account_id = ? AND user_name = ? 
AND tag_key = ?", + ) + .bind(account_id) + .bind(user_name) + .bind(key) + .execute(&mut *tx) + .await + .map_err(|e| { + tracing::error!("untag_user failed: {e}"); + OpError::Internal("Database error".to_owned()) + })?; + } + tx.commit().await.map_err(|e| { + tracing::error!("untag_user commit: {e}"); + OpError::Internal("Database error".to_owned()) + }) + } + + pub(crate) async fn list_user_tags_impl( + &self, + account_id: &str, + user_name: &str, + ) -> OpResult> { + sqlx::query_as( + "SELECT tag_key, tag_value FROM iam_user_tags \ + WHERE account_id = ? AND user_name = ? ORDER BY tag_key", + ) + .bind(account_id) + .bind(user_name) + .fetch_all(self.pool()) + .await + .map_err(|e| { + tracing::error!("list_user_tags: {e}"); + OpError::Internal("Database error".to_owned()) + }) + } +} diff --git a/crates/storage-tidb/src/metadata_engine.rs b/crates/storage-tidb/src/metadata_engine.rs new file mode 100755 index 0000000..c39bf0d --- /dev/null +++ b/crates/storage-tidb/src/metadata_engine.rs @@ -0,0 +1,644 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! `MetadataEngine` trait implementation for `TidbEngine`. 
+ +use extenddb_core::types::{ + Item, StreamSpecification, Tag, TimeToLiveDescription, TimeToLiveStatus, +}; +use extenddb_storage::MetadataEngine; +use extenddb_storage::error::StorageError; +use futures::future::BoxFuture; + +use crate::TidbEngine; +use crate::data; + +const TTL_EXPIRES_AT_COLUMN: &str = "_edb_ttl_expires_at"; +const TTL_EXPIRES_AT_INDEX: &str = "_edb_ttl_expires_at_idx"; +const LEGACY_TTL_EPOCH_COLUMN: &str = "_edb_ttl_epoch"; +const LEGACY_TTL_EPOCH_INDEX: &str = "_edb_ttl_epoch_idx"; + +type TtlArtifactRow = (String, Option, Option, bool); + +fn ttl_json_path(ttl_attribute: &str) -> String { + format!( + "$.\"{}\".N", + ttl_attribute.replace('\\', "\\\\").replace('"', "\\\"") + ) +} + +fn sql_string_literal(value: &str) -> String { + format!("'{}'", value.replace('\\', "\\\\").replace('\'', "''")) +} + +fn ttl_json_value_expr(ttl_attribute: &str) -> String { + let ttl_path = sql_string_literal(&ttl_json_path(ttl_attribute)); + format!("JSON_UNQUOTE(JSON_EXTRACT(item_data, {ttl_path}))") +} + +fn ttl_expires_at_expr(ttl_attribute: &str) -> String { + let ttl_value = ttl_json_value_expr(ttl_attribute); + format!( + "CASE \ + WHEN {ttl_value} REGEXP '^[0-9]+$' \ + AND CAST({ttl_value} AS UNSIGNED) > 0 \ + THEN FROM_UNIXTIME(CAST({ttl_value} AS UNSIGNED)) \ + ELSE NULL \ + END" + ) +} + +fn stream_enabled(stream_spec_json: Option) -> Result { + stream_spec_json + .map(serde_json::from_value::) + .transpose() + .map_err(|e| StorageError::Internal(e.to_string())) + .map(|spec| spec.is_some_and(|s| s.stream_enabled)) +} + +async fn data_table_has_native_ttl( + pool: &sqlx::MySqlPool, + data_table: &str, +) -> Result { + let (_table_name, create_table): (String, String) = + sqlx::query_as(&format!("SHOW CREATE TABLE {data_table}")) + .fetch_one(pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + let create_table = create_table.to_ascii_uppercase(); + Ok(create_table.contains(" TTL =") || create_table.contains("/*T![TTL] TTL =")) 
+} + +pub(crate) async fn drop_ttl_artifacts( + pool: &sqlx::MySqlPool, + table_id: &str, +) -> Result<(), StorageError> { + let data_table = data::data_table_name(table_id); + + if data_table_has_native_ttl(pool, &data_table).await? { + let sql = format!("ALTER TABLE {data_table} REMOVE TTL"); + sqlx::query(&sql) + .execute(pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } + + for index_name in [TTL_EXPIRES_AT_INDEX, LEGACY_TTL_EPOCH_INDEX] { + let sql = format!("DROP INDEX IF EXISTS `{index_name}` ON {data_table}"); + sqlx::query(&sql) + .execute(pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } + + for column_name in [TTL_EXPIRES_AT_COLUMN, LEGACY_TTL_EPOCH_COLUMN] { + let sql = format!("ALTER TABLE {data_table} DROP COLUMN IF EXISTS `{column_name}`"); + sqlx::query(&sql) + .execute(pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } + + Ok(()) +} + +async fn add_ttl_generated_column( + pool: &sqlx::MySqlPool, + table_id: &str, + ttl_attribute: &str, +) -> Result<(), StorageError> { + let data_table = data::data_table_name(table_id); + let ttl_expr = ttl_expires_at_expr(ttl_attribute); + let add_column = format!( + "ALTER TABLE {data_table} ADD COLUMN `{TTL_EXPIRES_AT_COLUMN}` DATETIME \ + AS ({ttl_expr}) VIRTUAL" + ); + sqlx::query(&add_column) + .execute(pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + Ok(()) +} + +async fn configure_native_ttl( + pool: &sqlx::MySqlPool, + table_id: &str, + ttl_attribute: &str, +) -> Result<(), StorageError> { + drop_ttl_artifacts(pool, table_id).await?; + add_ttl_generated_column(pool, table_id, ttl_attribute).await?; + + let data_table = data::data_table_name(table_id); + let enable_ttl = format!( + "ALTER TABLE {data_table} \ + TTL = `{TTL_EXPIRES_AT_COLUMN}` + INTERVAL 0 SECOND \ + TTL_JOB_INTERVAL = '1h'" + ); + sqlx::query(&enable_ttl) + .execute(pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + 
Ok(()) +} + +async fn configure_stream_ttl_index( + pool: &sqlx::MySqlPool, + table_id: &str, + ttl_attribute: &str, +) -> Result<(), StorageError> { + drop_ttl_artifacts(pool, table_id).await?; + add_ttl_generated_column(pool, table_id, ttl_attribute).await?; + + let data_table = data::data_table_name(table_id); + let add_index = format!( + "CREATE INDEX IF NOT EXISTS `{TTL_EXPIRES_AT_INDEX}` ON {data_table} (`{TTL_EXPIRES_AT_COLUMN}`)" + ); + sqlx::query(&add_index) + .execute(pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + Ok(()) +} + +async fn configure_ttl_artifacts( + pool: &sqlx::MySqlPool, + table_id: &str, + ttl_attribute: &str, + stream_spec_json: Option, +) -> Result { + let use_native_ttl = !stream_enabled(stream_spec_json)?; + + if use_native_ttl { + configure_native_ttl(pool, table_id, ttl_attribute).await?; + } else { + configure_stream_ttl_index(pool, table_id, ttl_attribute).await?; + } + + Ok(use_native_ttl) +} + +impl TidbEngine { + pub(crate) async fn disable_native_ttl_for_table_id( + &self, + table_id: &str, + ) -> Result<(), StorageError> { + let data_table = data::data_table_name(table_id); + if data_table_has_native_ttl(&self.data_pool, &data_table).await? 
{ + let sql = format!("ALTER TABLE {data_table} REMOVE TTL"); + sqlx::query(&sql) + .execute(&self.data_pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } + Ok(()) + } + + pub(crate) async fn streaming_ttl_tables_ready( + &self, + ) -> Result, StorageError> { + let rows: Vec<(String, String, String, Option)> = sqlx::query_as( + "SELECT account_id, table_name, ttl_attribute, stream_specification FROM tables \ + WHERE ttl_attribute IS NOT NULL AND ttl_index_ready = TRUE AND table_status = 'ACTIVE'", + ) + .fetch_all(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + rows.into_iter() + .filter_map( + |(account_id, table_name, ttl_attribute, stream_spec_json)| match stream_enabled( + stream_spec_json, + ) { + Ok(true) => Some(Ok((account_id, table_name, ttl_attribute))), + Ok(false) => None, + Err(e) => Some(Err(e)), + }, + ) + .collect() + } +} + +impl MetadataEngine for TidbEngine { + fn describe_ttl( + &self, + account_id: &str, + table_name: &str, + ) -> BoxFuture<'_, Result> { + let account_id = account_id.to_string(); + let table_name = table_name.to_string(); + Box::pin(async move { + let row: Option<(Option,)> = sqlx::query_as( + "SELECT ttl_attribute FROM tables WHERE account_id = ? 
AND table_name = ?", + ) + .bind(&account_id) + .bind(&table_name) + .fetch_optional(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let (ttl_attr,) = row.ok_or_else(|| StorageError::TableNotFound(table_name.clone()))?; + + Ok(match ttl_attr { + Some(attr) => TimeToLiveDescription { + time_to_live_status: TimeToLiveStatus::Enabled, + attribute_name: Some(attr), + }, + None => TimeToLiveDescription { + time_to_live_status: TimeToLiveStatus::Disabled, + attribute_name: None, + }, + }) + }) + } + + fn update_ttl( + &self, + account_id: &str, + table_name: &str, + attribute_name: &str, + enabled: bool, + ) -> BoxFuture<'_, Result<(), StorageError>> { + let account_id = account_id.to_string(); + let table_name = table_name.to_string(); + let attribute_name = attribute_name.to_string(); + Box::pin(async move { + Self::validate_account_id(&account_id)?; + let row: Option<(String, Option, String)> = sqlx::query_as( + "SELECT table_id, stream_specification, table_status \ + FROM tables WHERE account_id = ? AND table_name = ?", + ) + .bind(&account_id) + .bind(&table_name) + .fetch_optional(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let Some((table_id, stream_spec_json, status)) = row else { + return Err(StorageError::TableNotFound(table_name)); + }; + if status != "ACTIVE" { + return Err(StorageError::TableNotActive(table_name)); + } + + if enabled { + let use_native_ttl = configure_ttl_artifacts( + &self.data_pool, + &table_id, + &attribute_name, + stream_spec_json, + ) + .await?; + + sqlx::query( + "UPDATE tables SET ttl_attribute = ?, ttl_index_ready = TRUE, \ + ttl_native_enabled = ? \ + WHERE account_id = ? AND table_name = ? 
AND table_status = 'ACTIVE'", + ) + .bind(&attribute_name) + .bind(use_native_ttl) + .bind(&account_id) + .bind(&table_name) + .execute(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } else { + drop_ttl_artifacts(&self.data_pool, &table_id).await?; + + sqlx::query( + "UPDATE tables SET ttl_attribute = NULL, ttl_index_ready = FALSE, \ + ttl_native_enabled = FALSE \ + WHERE account_id = ? AND table_name = ? AND table_status = 'ACTIVE'", + ) + .bind(&account_id) + .bind(&table_name) + .execute(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } + + Ok(()) + }) + } + + fn tag_resource(&self, arn: &str, tags: &[Tag]) -> BoxFuture<'_, Result<(), StorageError>> { + let arn = arn.to_string(); + let tags = tags.to_vec(); + Box::pin(async move { + for tag in &tags { + sqlx::query( + "INSERT INTO tags (resource_arn, tag_key, tag_value) VALUES (?, ?, ?) \ + ON DUPLICATE KEY UPDATE tag_value = VALUES(tag_value)", + ) + .bind(&arn) + .bind(&tag.key) + .bind(&tag.value) + .execute(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } + Ok(()) + }) + } + + fn untag_resource( + &self, + arn: &str, + tag_keys: &[String], + ) -> BoxFuture<'_, Result<(), StorageError>> { + let arn = arn.to_string(); + let tag_keys = tag_keys.to_vec(); + Box::pin(async move { + for key in &tag_keys { + sqlx::query("DELETE FROM tags WHERE resource_arn = ? AND tag_key = ?") + .bind(&arn) + .bind(key) + .execute(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } + Ok(()) + }) + } + + fn list_tags(&self, arn: &str) -> BoxFuture<'_, Result, StorageError>> { + let arn = arn.to_string(); + Box::pin(async move { + let rows: Vec<(String, String)> = sqlx::query_as( + "SELECT tag_key, tag_value FROM tags WHERE resource_arn = ? 
ORDER BY tag_key", + ) + .bind(&arn) + .fetch_all(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + Ok(rows + .into_iter() + .map(|(key, value)| Tag { key, value }) + .collect()) + }) + } + + fn tables_with_ttl( + &self, + account_id: &str, + ) -> BoxFuture<'_, Result, StorageError>> { + let account_id = account_id.to_string(); + Box::pin(async move { + let rows: Vec<(String, String)> = sqlx::query_as( + "SELECT table_name, ttl_attribute FROM tables \ + WHERE account_id = ? AND ttl_attribute IS NOT NULL AND table_status = 'ACTIVE'", + ) + .bind(&account_id) + .fetch_all(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + Ok(rows) + }) + } + + fn refresh_table_size( + &self, + account_id: &str, + table_name: &str, + ) -> BoxFuture<'_, Result<(), StorageError>> { + let account_id = account_id.to_string(); + let table_name = table_name.to_string(); + Box::pin(async move { + Self::validate_account_id(&account_id)?; + let (table_id,): (String,) = sqlx::query_as( + "SELECT table_id FROM tables WHERE account_id = ? AND table_name = ?", + ) + .bind(&account_id) + .bind(&table_name) + .fetch_one(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let data_table = data::data_table_name(&table_id); + let raw_table = data_table.trim_matches('`'); + let (item_count, table_size): (i64, i64) = sqlx::query_as( + "SELECT COALESCE(TABLE_ROWS, 0), COALESCE(DATA_LENGTH, 0) \ + FROM information_schema.tables \ + WHERE table_schema = DATABASE() AND table_name = ?", + ) + .bind(raw_table) + .fetch_optional(&self.data_pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))? + .unwrap_or((0, 0)); + + sqlx::query( + "UPDATE tables SET item_count = ?, table_size_bytes = ? \ + WHERE account_id = ? AND table_name = ? 
AND table_status = 'ACTIVE'", + ) + .bind(item_count) + .bind(table_size) + .bind(&account_id) + .bind(&table_name) + .execute(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + Ok(()) + }) + } + + fn list_active_table_names( + &self, + account_id: &str, + ) -> BoxFuture<'_, Result, StorageError>> { + let account_id = account_id.to_string(); + Box::pin(async move { + let rows: Vec<(String,)> = sqlx::query_as( + "SELECT table_name FROM tables WHERE account_id = ? AND table_status = 'ACTIVE' ORDER BY table_name", + ) + .bind(&account_id) + .fetch_all(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + Ok(rows.into_iter().map(|(n,)| n).collect()) + }) + } + + fn all_tables_with_ttl( + &self, + ) -> BoxFuture<'_, Result, StorageError>> { + Box::pin(async move { + let rows: Vec<(String, String, String)> = sqlx::query_as( + "SELECT account_id, table_name, ttl_attribute FROM tables \ + WHERE ttl_attribute IS NOT NULL AND table_status = 'ACTIVE'", + ) + .fetch_all(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + Ok(rows) + }) + } + + fn all_tables_with_ttl_index_ready( + &self, + ) -> BoxFuture<'_, Result, StorageError>> { + Box::pin(async move { + let rows: Vec<(String, String, String)> = sqlx::query_as( + "SELECT account_id, table_name, ttl_attribute FROM tables \ + WHERE ttl_attribute IS NOT NULL AND ttl_index_ready = TRUE AND table_status = 'ACTIVE'", + ) + .fetch_all(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + Ok(rows) + }) + } + + fn create_ttl_index( + &self, + account_id: &str, + table_name: &str, + ttl_attribute: &str, + ) -> BoxFuture<'_, Result<(), StorageError>> { + let account_id = account_id.to_string(); + let table_name = table_name.to_string(); + let ttl_attribute = ttl_attribute.to_string(); + Box::pin(async move { + Self::validate_account_id(&account_id)?; + let row: Option = sqlx::query_as( + "SELECT table_id, ttl_attribute, 
stream_specification, ttl_index_ready \ + FROM tables WHERE account_id = ? AND table_name = ?", + ) + .bind(&account_id) + .bind(&table_name) + .fetch_optional(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let (table_id, catalog_ttl_attribute, stream_spec_json, index_ready) = + row.ok_or_else(|| StorageError::TableNotFound(table_name.clone()))?; + if index_ready && catalog_ttl_attribute.as_deref() == Some(ttl_attribute.as_str()) { + return Ok(()); + } + + let use_native_ttl = configure_ttl_artifacts( + &self.data_pool, + &table_id, + &ttl_attribute, + stream_spec_json, + ) + .await?; + + sqlx::query( + "UPDATE tables SET ttl_index_ready = TRUE, ttl_native_enabled = ? \ + WHERE account_id = ? AND table_name = ?", + ) + .bind(use_native_ttl) + .bind(&account_id) + .bind(&table_name) + .execute(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + Ok(()) + }) + } + + fn drop_ttl_index( + &self, + account_id: &str, + table_name: &str, + ) -> BoxFuture<'_, Result<(), StorageError>> { + let account_id = account_id.to_string(); + let table_name = table_name.to_string(); + Box::pin(async move { + Self::validate_account_id(&account_id)?; + let (table_id,): (String,) = sqlx::query_as( + "SELECT table_id FROM tables WHERE account_id = ? AND table_name = ?", + ) + .bind(&account_id) + .bind(&table_name) + .fetch_one(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + drop_ttl_artifacts(&self.data_pool, &table_id).await?; + + sqlx::query( + "UPDATE tables SET ttl_index_ready = FALSE, ttl_native_enabled = FALSE \ + WHERE account_id = ? 
AND table_name = ?", + ) + .bind(&account_id) + .bind(&table_name) + .execute(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + Ok(()) + }) + } + + fn find_expired_items_indexed( + &self, + account_id: &str, + table_name: &str, + _ttl_attribute: &str, + limit: usize, + ) -> BoxFuture<'_, Result, StorageError>> { + let account_id = account_id.to_string(); + let table_name = table_name.to_string(); + Box::pin(async move { + Self::validate_account_id(&account_id)?; + let (table_id,): (String,) = sqlx::query_as( + "SELECT table_id FROM tables WHERE account_id = ? AND table_name = ?", + ) + .bind(account_id) + .bind(table_name) + .fetch_one(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let data_table = data::data_table_name(&table_id); + + let now_epoch = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + + let limit_i64 = i64::try_from(limit).unwrap_or(i64::MAX); + let now_i64 = i64::try_from(now_epoch).unwrap_or(i64::MAX); + let sql = format!( + "SELECT item_data FROM {data_table} \ + WHERE `{TTL_EXPIRES_AT_COLUMN}` IS NOT NULL \ + AND `{TTL_EXPIRES_AT_COLUMN}` <= FROM_UNIXTIME(?) \ + ORDER BY `{TTL_EXPIRES_AT_COLUMN}` \ + LIMIT ?" 
+ ); + let rows: Vec<(serde_json::Value,)> = sqlx::query_as(&sql) + .bind(now_i64) + .bind(limit_i64) + .fetch_all(&self.data_pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + rows.into_iter().map(|(v,)| data::json_to_item(v)).collect() + }) + } + + fn all_active_tables(&self) -> BoxFuture<'_, Result, StorageError>> { + Box::pin(async move { + let rows: Vec<(String, String)> = sqlx::query_as( + "SELECT account_id, table_name FROM tables \ + WHERE table_status = 'ACTIVE' ORDER BY account_id, table_name", + ) + .fetch_all(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + Ok(rows) + }) + } +} diff --git a/crates/storage-tidb/src/migrations.rs b/crates/storage-tidb/src/migrations.rs new file mode 100755 index 0000000..1d410d3 --- /dev/null +++ b/crates/storage-tidb/src/migrations.rs @@ -0,0 +1,186 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! TiDB schema migration helpers for catalog and data databases. + +use extenddb_storage::management_store::{OpError, OpResult}; +use sqlx::MySqlPool; + +/// Embedded catalog migration files, applied in order. 
+pub(crate) const CATALOG_MIGRATIONS: &[(&str, &str)] = {
+    // Builds one `(file name, embedded SQL)` entry from the file name alone, so
+    // the name handed to `record_migration` and the embedded path cannot drift
+    // apart.
+    macro_rules! migration {
+        ($file:literal) => {
+            (
+                $file,
+                include_str!(concat!("../../storage-tidb/migrations/", $file)),
+            )
+        };
+    }
+
+    // Order matters: `run_catalog_migrations` walks this slice front to back.
+    // NOTE(review): there is no 005 entry — confirm the gap is intentional.
+    &[
+        migration!("001_schema.sql"),
+        migration!("002_backup_metadata_fidelity.sql"),
+        migration!("003_drop_catalog_stream_data.sql"),
+        migration!("004_control_plane_leases.sql"),
+        migration!("006_native_ttl_mode.sql"),
+        migration!("007_native_br_backups.sql"),
+        migration!("008_native_index_backup_ids.sql"),
+        migration!("009_catalog_native_ttl.sql"),
+        migration!("010_session_native_ttl.sql"),
+    ]
+};
+
+/// Apply every entry of [`CATALOG_MIGRATIONS`] that has not been applied yet.
+///
+/// Entries are visited in slice order. For each one, `is_migration_applied`
+/// decides whether it is skipped; otherwise its script is executed with
+/// `run_sql_script` and then registered with `record_migration`. Progress is
+/// reported on stdout.
+///
+/// # Errors
+///
+/// Propagates the first error returned by any of the three helpers; entries
+/// after the failing one are not attempted.
+pub(crate) async fn run_catalog_migrations(pool: &MySqlPool) -> OpResult<()> {
+    println!("--- Running catalog migrations...");
+
+    for &(filename, script) in CATALOG_MIGRATIONS.iter() {
+        if is_migration_applied(pool, filename).await? {
+            println!(" {filename} — already applied, skipping.");
+        } else {
+            println!(" Applying {filename}...");
+            run_sql_script(pool, script, filename).await?;
+            // Registered only after the script ran without error.
+            record_migration(pool, filename).await?;
+        }
+    }
+
+    println!(" Migrations applied.");
+    Ok(())
+}
+
+/// Run data database migrations.
+pub(crate) async fn run_data_migrations(pool: &MySqlPool) -> OpResult<()> { + let sql = include_str!("../../storage-tidb/data_migrations/001_data_schema.sql"); + + println!("--- Initializing data database schema..."); + let initialized: bool = sqlx::query_scalar( + "SELECT EXISTS(SELECT 1 FROM information_schema.tables \ + WHERE table_name = 'stream_shards' AND table_schema = DATABASE())", + ) + .fetch_one(pool) + .await + .map_err(|e| OpError::Internal(format!("Check data schema: {e}")))?; + + if initialized { + println!(" Data schema already initialized."); + } else { + run_sql_script(pool, sql, "data migration").await?; + println!(" Data schema initialized."); + } + ensure_stream_shard_sequence(pool).await?; + ensure_data_table_ttl(pool).await?; + Ok(()) +} + +async fn ensure_stream_shard_sequence(pool: &MySqlPool) -> OpResult<()> { + let exists: bool = sqlx::query_scalar( + "SELECT EXISTS(SELECT 1 FROM information_schema.columns \ + WHERE table_name = 'stream_shards' AND table_schema = DATABASE() \ + AND column_name = 'next_sequence_number')", + ) + .fetch_one(pool) + .await + .map_err(|e| OpError::Internal(format!("Check stream shard sequence column: {e}")))?; + + if !exists { + sqlx::query( + "ALTER TABLE stream_shards \ + ADD COLUMN next_sequence_number BIGINT NOT NULL DEFAULT 0", + ) + .execute(pool) + .await + .map_err(|e| OpError::Internal(format!("Add stream shard sequence column: {e}")))?; + } + + sqlx::query( + "UPDATE stream_shards AS ss \ + JOIN ( \ + SELECT shard_id, COALESCE(MAX(CAST(sequence_number AS UNSIGNED)), 0) AS max_seq \ + FROM stream_records \ + GROUP BY shard_id \ + ) AS sr ON sr.shard_id = ss.shard_id \ + SET ss.next_sequence_number = GREATEST(ss.next_sequence_number, sr.max_seq)", + ) + .execute(pool) + .await + .map_err(|e| OpError::Internal(format!("Backfill stream shard sequence counters: {e}")))?; + + Ok(()) +} + +async fn ensure_data_table_ttl(pool: &MySqlPool) -> OpResult<()> { + for statement in [ + "ALTER TABLE stream_records 
TTL = `created_at` + INTERVAL 24 HOUR TTL_JOB_INTERVAL = '1h'", + "ALTER TABLE idempotency_tokens TTL = `created_at` + INTERVAL 600 SECOND TTL_JOB_INTERVAL = '10m'", + ] { + sqlx::query(statement) + .execute(pool) + .await + .map_err(|e| OpError::Internal(format!("Configure TiDB TTL: {e}")))?; + } + Ok(()) +} + +async fn run_sql_script(pool: &MySqlPool, sql: &str, label: &str) -> OpResult<()> { + for statement in sql.split(';').map(str::trim).filter(|s| !s.is_empty()) { + sqlx::query(statement) + .execute(pool) + .await + .map_err(|e| OpError::Internal(format!("Migration {label} failed: {e}")))?; + } + Ok(()) +} + +/// Check if a table exists in the current database. +pub(crate) async fn table_exists(pool: &MySqlPool, name: &str) -> OpResult { + let exists: bool = sqlx::query_scalar( + "SELECT EXISTS(SELECT 1 FROM information_schema.tables \ + WHERE table_name = ? AND table_schema = DATABASE())", + ) + .bind(name) + .fetch_one(pool) + .await + .map_err(|e| OpError::Internal(format!("Check table exists: {e}")))?; + Ok(exists) +} + +/// Check if a migration has already been applied. +async fn is_migration_applied(pool: &MySqlPool, filename: &str) -> OpResult { + if table_exists(pool, "schema_history").await? { + let applied: (bool,) = + sqlx::query_as("SELECT EXISTS(SELECT 1 FROM schema_history WHERE filename = ?)") + .bind(filename) + .fetch_one(pool) + .await + .map_err(|e| OpError::Internal(format!("Check migration: {e}")))?; + return Ok(applied.0); + } + Ok(false) +} + +/// Record a migration in the `schema_history` table. +async fn record_migration(pool: &MySqlPool, filename: &str) -> OpResult<()> { + if !table_exists(pool, "schema_history").await? 
{ + return Ok(()); + } + sqlx::query("INSERT IGNORE INTO schema_history (filename) VALUES (?)") + .bind(filename) + .execute(pool) + .await + .map_err(|e| OpError::Internal(format!("Record migration: {e}")))?; + Ok(()) +} diff --git a/crates/storage-tidb/src/operations.rs b/crates/storage-tidb/src/operations.rs new file mode 100644 index 0000000..4ff9b63 --- /dev/null +++ b/crates/storage-tidb/src/operations.rs @@ -0,0 +1,68 @@ +// Copyright 2026 DynamoDB Open contributors +// SPDX-License-Identifier: Apache-2.0 + +//! TiDB implementation of `OperationsEngine`. + +use extenddb_storage::error::StorageError; +use extenddb_storage::operations::{ConnectionParts, OperationsEngine}; + +/// TiDB operations engine for ddbo CLI commands. +pub struct TidbOperationsEngine; + +impl OperationsEngine for TidbOperationsEngine { + fn parse_connection_string(&self, s: &str) -> Result { + let parts = crate::config::parse_connection_string(s) + .map_err(|e| StorageError::Internal(e.to_string()))?; + + // Convert ConnParts to ConnectionParts + Ok(ConnectionParts { + host: parts.host, + port: parts.port, + user: parts.user, + password: parts.password, + database: parts.database, + }) + } + + fn redact_connection_string(&self, s: &str) -> String { + crate::config::redact_connection_string(s) + } + + fn validate_identifier(&self, name: &str, label: &str) -> Result<(), StorageError> { + // TiDB identifier validation for format!-based DDL. + // Rejects backticks, null bytes, and non-ASCII characters. 
+ if name.contains('`') { + return Err(StorageError::Internal(format!( + "{label} must not contain backticks" + ))); + } + if name.contains('\0') { + return Err(StorageError::Internal(format!( + "{label} must not contain null bytes" + ))); + } + if !name.is_ascii() { + return Err(StorageError::Internal(format!( + "{label} must contain only ASCII characters" + ))); + } + Ok(()) + } + + fn catalog_version(&self) -> String { + crate::CATALOG_VERSION.to_string() + } + + fn is_sensitive_key(&self, key: &str) -> bool { + let lower = key.to_lowercase(); + [ + "connection_string", + "password", + "secret", + "token", + "encryption_key", + ] + .iter() + .any(|pattern| lower.contains(pattern)) + } +} diff --git a/crates/storage-tidb/src/stream_engine.rs b/crates/storage-tidb/src/stream_engine.rs new file mode 100755 index 0000000..3885b8c --- /dev/null +++ b/crates/storage-tidb/src/stream_engine.rs @@ -0,0 +1,509 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! `StreamEngine` trait implementation for `TidbEngine`. + +use extenddb_core::types::{ + SequenceNumberRange, Shard, StreamDescription, StreamRecord, StreamStatus, StreamSummary, + StreamViewType, +}; +use extenddb_storage::StreamEngine; +use extenddb_storage::error::StorageError; +use extenddb_storage::util::{parse_stream_arn, stream_arn}; +use futures::future::BoxFuture; +use sqlx::MySqlPool; + +use crate::TidbEngine; + +/// Number of fixed shards per stream (hash-based assignment). +const SHARDS_PER_STREAM: u32 = 4; + +impl TidbEngine { + pub(crate) fn new_stream_label() -> String { + time::OffsetDateTime::now_utc() + .format(&time::format_description::well_known::Rfc3339) + .unwrap_or_else(|_| time::OffsetDateTime::now_utc().unix_timestamp().to_string()) + } + + /// Ensure the catalog stream label exists while the caller holds the table row. 
+ pub(crate) async fn ensure_stream_label( + tx: &mut sqlx::Transaction<'_, sqlx::MySql>, + account_id: &str, + table_name: &str, + stream_label: Option, + ) -> Result { + let label = stream_label.unwrap_or_else(Self::new_stream_label); + sqlx::query( + "UPDATE tables SET stream_label = COALESCE(stream_label, ?) \ + WHERE account_id = ? AND table_name = ?", + ) + .bind(&label) + .bind(account_id) + .bind(table_name) + .execute(&mut **tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + Ok(label) + } + + /// Idempotently create stream shards for a stream-enabled table. + pub(crate) async fn ensure_stream_shard_rows( + data_pool: &MySqlPool, + table_id: &str, + ) -> Result<(), StorageError> { + let mut data_tx = data_pool + .begin() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + for i in 0..SHARDS_PER_STREAM { + let shard_id = format!("shardId-{table_id}-{i:012}"); + let start_seq = format!("{:021}", 0); + sqlx::query( + "INSERT IGNORE INTO stream_shards \ + (shard_id, table_id, starting_sequence_number) \ + VALUES (?, ?, ?)", + ) + .bind(&shard_id) + .bind(table_id) + .bind(&start_seq) + .execute(&mut *data_tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } + + data_tx + .commit() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + Ok(()) + } +} + +impl StreamEngine for TidbEngine { + fn write_stream_record( + &self, + account_id: &str, + record: &StreamRecord, + shard_id: &str, + table_name: &str, + ) -> BoxFuture<'_, Result<(), StorageError>> { + let account_id = account_id.to_string(); + let record = record.clone(); + let shard_id = shard_id.to_string(); + let table_name = table_name.to_string(); + Box::pin(async move { + let record_json = + serde_json::to_value(&record).map_err(|e| StorageError::Internal(e.to_string()))?; + + let table_id: String = sqlx::query_scalar( + "SELECT table_id FROM tables WHERE account_id = ? 
AND table_name = ?", + ) + .bind(&account_id) + .bind(&table_name) + .fetch_one(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + sqlx::query( + "INSERT INTO stream_records (sequence_number, shard_id, table_id, event_name, record_data) \ + VALUES (?, ?, ?, ?, ?)", + ) + .bind(&record.dynamodb.sequence_number) + .bind(&shard_id) + .bind(&table_id) + .bind(format!("{:?}", record.event_name)) + .bind(&record_json) + .execute(&self.data_pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + Ok(()) + }) + } + + fn get_stream_records( + &self, + shard_id: &str, + after_sequence: Option<&str>, + limit: i64, + ) -> BoxFuture<'_, Result<(Vec, Option), StorageError>> { + let shard_id = shard_id.to_string(); + let after_sequence = after_sequence.map(|s| s.to_string()); + Box::pin(async move { + let rows: Vec<(serde_json::Value,)> = if let Some(after) = after_sequence { + sqlx::query_as( + "SELECT record_data FROM stream_records \ + WHERE shard_id = ? AND sequence_number > ? \ + ORDER BY sequence_number LIMIT ?", + ) + .bind(&shard_id) + .bind(&after) + .bind(limit) + .fetch_all(&self.data_pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))? + } else { + sqlx::query_as( + "SELECT record_data FROM stream_records \ + WHERE shard_id = ? \ + ORDER BY sequence_number LIMIT ?", + ) + .bind(&shard_id) + .bind(limit) + .fetch_all(&self.data_pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))? 
+ }; + + let records: Vec = rows + .into_iter() + .map(|(data,)| { + serde_json::from_value(data).map_err(|e| StorageError::Internal(e.to_string())) + }) + .collect::, _>>()?; + + let last_seq = records.last().map(|r| r.dynamodb.sequence_number.clone()); + Ok((records, last_seq)) + }) + } + + fn describe_stream( + &self, + account_id: &str, + input: &extenddb_core::types::DescribeStreamInput, + ) -> BoxFuture<'_, Result> { + let account_id = account_id.to_string(); + let stream_arn = input.stream_arn.clone(); + let limit = input.limit; + let exclusive_start_shard_id = input.exclusive_start_shard_id.clone(); + Box::pin(async move { + let (table_name, stream_label) = parse_stream_arn(&stream_arn)?; + + let row: Option<(serde_json::Value, serde_json::Value, Option, String, String)> = + sqlx::query_as( + "SELECT key_schema, attribute_definitions, stream_specification, table_status, table_id \ + FROM tables WHERE account_id = ? AND table_name = ? AND stream_label = ?", + ) + .bind(&account_id) + .bind(&table_name) + .bind(&stream_label) + .fetch_optional(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let (ks_json, _ad_json, stream_spec_json, table_status, table_id) = + row.ok_or_else(|| { + StorageError::TableNotFound(format!( + "Requested resource not found: Stream: {arn} not found.", + arn = stream_arn + )) + })?; + + let key_schema = serde_json::from_value(ks_json) + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let stream_view_type = stream_spec_json + .and_then(|v| { + v.get("StreamViewType") + .and_then(|sv| serde_json::from_value::(sv.clone()).ok()) + }) + .unwrap_or(StreamViewType::KeysOnly); + + let limit = limit.unwrap_or(100); + let shard_rows: Vec<(String, Option, String, Option)> = if let Some( + ref start, + ) = + exclusive_start_shard_id + { + sqlx::query_as( + "SELECT shard_id, parent_shard_id, starting_sequence_number, ending_sequence_number \ + FROM stream_shards WHERE table_id = ? AND shard_id > ? 
\ + ORDER BY shard_id LIMIT ?", + ) + .bind(&table_id) + .bind(start) + .bind(limit + 1) + .fetch_all(&self.data_pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))? + } else { + sqlx::query_as( + "SELECT shard_id, parent_shard_id, starting_sequence_number, ending_sequence_number \ + FROM stream_shards WHERE table_id = ? \ + ORDER BY shard_id LIMIT ?", + ) + .bind(&table_id) + .bind(limit + 1) + .fetch_all(&self.data_pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))? + }; + + #[allow(clippy::cast_sign_loss)] + let limit_usize = limit as usize; + let last_shard = if shard_rows.len() > limit_usize { + Some(shard_rows[limit_usize - 1].0.clone()) + } else { + None + }; + + let shards: Vec = shard_rows + .into_iter() + .take(limit_usize) + .map(|(id, parent, start, end)| Shard { + shard_id: id, + parent_shard_id: parent, + sequence_number_range: SequenceNumberRange { + starting_sequence_number: start, + ending_sequence_number: end, + }, + }) + .collect(); + + let stream_status = if table_status == "DELETING" { + StreamStatus::Disabling + } else { + StreamStatus::Enabled + }; + + Ok(StreamDescription { + stream_arn, + stream_label, + stream_status, + stream_view_type, + table_name, + key_schema, + shards, + last_evaluated_shard_id: last_shard, + }) + }) + } + + fn list_streams( + &self, + account_id: &str, + table_name: Option<&str>, + limit: i64, + exclusive_start_stream_arn: Option<&str>, + ) -> BoxFuture<'_, Result<(Vec, Option), StorageError>> { + let account_id = account_id.to_string(); + let table_name = table_name.map(|s| s.to_string()); + let exclusive_start_stream_arn = exclusive_start_stream_arn.map(|s| s.to_string()); + Box::pin(async move { + let rows: Vec<(String, String, String)> = match ( + table_name.as_deref(), + exclusive_start_stream_arn.as_deref(), + ) { + (Some(tn), Some(start_arn)) => { + let (_, start_label) = parse_stream_arn(start_arn)?; + sqlx::query_as( + "SELECT table_name, table_arn, stream_label FROM 
tables \ + WHERE account_id = ? AND stream_label IS NOT NULL AND table_name = ? AND stream_label > ? \ + ORDER BY stream_label LIMIT ?", + ) + .bind(&account_id) + .bind(tn) + .bind(&start_label) + .bind(limit + 1) + .fetch_all(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))? + } + (Some(tn), None) => sqlx::query_as( + "SELECT table_name, table_arn, stream_label FROM tables \ + WHERE account_id = ? AND stream_label IS NOT NULL AND table_name = ? \ + ORDER BY stream_label LIMIT ?", + ) + .bind(&account_id) + .bind(tn) + .bind(limit + 1) + .fetch_all(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?, + (None, Some(start_arn)) => { + let (start_table, start_label) = parse_stream_arn(start_arn)?; + sqlx::query_as( + "SELECT table_name, table_arn, stream_label FROM tables \ + WHERE account_id = ? AND stream_label IS NOT NULL \ + AND (table_name, stream_label) > (?, ?) \ + ORDER BY table_name, stream_label LIMIT ?", + ) + .bind(&account_id) + .bind(&start_table) + .bind(&start_label) + .bind(limit + 1) + .fetch_all(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))? + } + (None, None) => sqlx::query_as( + "SELECT table_name, table_arn, stream_label FROM tables \ + WHERE account_id = ? 
AND stream_label IS NOT NULL \ + ORDER BY table_name, stream_label LIMIT ?", + ) + .bind(&account_id) + .bind(limit + 1) + .fetch_all(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?, + }; + + #[allow(clippy::cast_sign_loss)] + let limit_usize = limit as usize; + + let summaries: Vec = rows + .iter() + .take(limit_usize) + .map(|(tn, _table_arn, label)| StreamSummary { + stream_arn: stream_arn(&self.region, &account_id, tn, label), + stream_label: label.clone(), + table_name: tn.clone(), + }) + .collect(); + + let last_arn = if rows.len() > limit_usize { + summaries.last().map(|s| s.stream_arn.clone()) + } else { + None + }; + + Ok((summaries, last_arn)) + }) + } + + fn cleanup_expired_stream_records( + &self, + _retention_hours: i64, + ) -> BoxFuture<'_, Result> { + // TiDB native TTL owns stream retention; no duplicate manual delete path. + Box::pin(async move { Ok(0) }) + } + + fn assign_shard( + &self, + account_id: &str, + table_name: &str, + partition_key: &str, + ) -> BoxFuture<'_, Result> { + let account_id = account_id.to_string(); + let table_name = table_name.to_string(); + let partition_key = partition_key.to_string(); + Box::pin(async move { + let table_id: String = sqlx::query_scalar( + "SELECT table_id FROM tables WHERE account_id = ? AND table_name = ?", + ) + .bind(&account_id) + .bind(&table_name) + .fetch_one(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let shards: Vec<(String,)> = sqlx::query_as( + "SELECT shard_id FROM stream_shards \ + WHERE table_id = ? 
\ + ORDER BY shard_id", + ) + .bind(&table_id) + .fetch_all(&self.data_pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + if shards.is_empty() { + return Err(StorageError::Internal(format!( + "No stream shards for table {table_name}" + ))); + } + + let hash = crc32fast::hash(partition_key.as_bytes()); + #[allow(clippy::cast_possible_truncation)] + let idx = (hash as usize) % shards.len(); + Ok(shards[idx].0.clone()) + }) + } + + fn next_sequence_number(&self, shard_id: &str) -> BoxFuture<'_, Result> { + let shard_id = shard_id.to_string(); + Box::pin(async move { + let mut tx = self + .data_pool + .begin() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let sequence_number = + crate::data::next_shard_sequence_in_tx(&mut tx, &shard_id).await?; + + tx.commit() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + Ok(sequence_number) + }) + } + + fn validate_shard( + &self, + account_id: &str, + stream_arn: &str, + shard_id: &str, + ) -> BoxFuture<'_, Result<(), StorageError>> { + let account_id = account_id.to_string(); + let stream_arn = stream_arn.to_string(); + let shard_id = shard_id.to_string(); + Box::pin(async move { + let (table_name, stream_label) = parse_stream_arn(&stream_arn)?; + + let table_id: Option = sqlx::query_scalar( + "SELECT table_id FROM tables \ + WHERE account_id = ? AND table_name = ? AND stream_label = ?", + ) + .bind(&account_id) + .bind(&table_name) + .bind(&stream_label) + .fetch_optional(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let Some(table_id) = table_id else { + return Err(StorageError::TableNotFound(format!( + "Requested resource not found: Stream: {stream_arn} not found." + ))); + }; + + let exists: Option<(i32,)> = + sqlx::query_as("SELECT 1 FROM stream_shards WHERE shard_id = ? 
AND table_id = ?") + .bind(&shard_id) + .bind(&table_id) + .fetch_optional(&self.data_pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + if exists.is_none() { + return Err(StorageError::TableNotFound(format!( + "Requested resource not found: Stream: {stream_arn} not found." + ))); + } + Ok(()) + }) + } + + fn latest_sequence_number( + &self, + shard_id: &str, + ) -> BoxFuture<'_, Result, StorageError>> { + let shard_id = shard_id.to_string(); + Box::pin(async move { + let row: Option<(String,)> = sqlx::query_as( + "SELECT sequence_number FROM stream_records \ + WHERE shard_id = ? ORDER BY sequence_number DESC LIMIT 1", + ) + .bind(&shard_id) + .fetch_optional(&self.data_pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + Ok(row.map(|(s,)| s)) + }) + } +} diff --git a/crates/storage-tidb/src/table_engine.rs b/crates/storage-tidb/src/table_engine.rs new file mode 100755 index 0000000..8c8a97e --- /dev/null +++ b/crates/storage-tidb/src/table_engine.rs @@ -0,0 +1,145 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! `TableEngine` trait implementation for `TidbEngine`. 
+ +use futures::future::BoxFuture; + +use extenddb_core::types::{ + CreateTableInput, DeleteTableInput, DescribeTableInput, IndexInfo, ListTablesInput, + ListTablesOutput, TableDescription, TableKeyInfo, +}; +use extenddb_storage::TableEngine; +use extenddb_storage::error::StorageError; + +use crate::TidbEngine; + +impl TableEngine for TidbEngine { + fn create_table( + &self, + account_id: &str, + input: CreateTableInput, + ) -> BoxFuture<'_, Result> { + let account_id = account_id.to_string(); + Box::pin(async move { self.create_table_impl(&account_id, input).await }) + } + + fn delete_table( + &self, + account_id: &str, + input: DeleteTableInput, + ) -> BoxFuture<'_, Result> { + let account_id = account_id.to_string(); + Box::pin(async move { self.delete_table_impl(&account_id, input).await }) + } + + fn describe_table( + &self, + account_id: &str, + input: DescribeTableInput, + ) -> BoxFuture<'_, Result> { + let account_id = account_id.to_string(); + Box::pin(async move { + self.build_table_description(&account_id, &input.table_name) + .await + }) + } + + // Note: Real DynamoDB includes tables in CREATING and DELETING states in + // ListTables results. No status filter is applied here intentionally. + fn list_tables( + &self, + account_id: &str, + input: ListTablesInput, + ) -> BoxFuture<'_, Result> { + let account_id = account_id.to_string(); + Box::pin(async move { + let limit = i64::from(input.limit.unwrap_or(100)); + + let rows: Vec<(String,)> = if let Some(ref start) = input.exclusive_start_table_name { + sqlx::query_as( + "SELECT table_name FROM tables WHERE account_id = ? AND table_name > ? ORDER BY table_name COLLATE utf8mb4_bin LIMIT ?", + ) + .bind(&account_id) + .bind(start) + .bind(limit + 1) + .fetch_all(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))? + } else { + sqlx::query_as( + "SELECT table_name FROM tables WHERE account_id = ? 
ORDER BY table_name COLLATE utf8mb4_bin LIMIT ?", + ) + .bind(&account_id) + .bind(limit + 1) + .fetch_all(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))? + }; + + let names: Vec = rows.into_iter().map(|(n,)| n).collect(); + #[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)] + let limit_usize = limit as usize; // Safe: engine clamps limit to [1, 100] + + if names.len() > limit_usize { + Ok(ListTablesOutput { + last_evaluated_table_name: Some(names[limit_usize - 1].clone()), + table_names: names[..limit_usize].to_vec(), + }) + } else { + Ok(ListTablesOutput { + table_names: names, + last_evaluated_table_name: None, + }) + } + }) + } + + fn table_key_info( + &self, + account_id: &str, + table_name: &str, + ) -> BoxFuture<'_, Result> { + let account_id = account_id.to_string(); + let table_name = table_name.to_string(); + Box::pin(async move { self.fetch_table_key_info(&account_id, &table_name).await }) + } + + fn index_info( + &self, + account_id: &str, + table_name: &str, + index_name: &str, + ) -> BoxFuture<'_, Result> { + let account_id = account_id.to_string(); + let table_name = table_name.to_string(); + let index_name = index_name.to_string(); + Box::pin(async move { + self.fetch_index_info(&account_id, &table_name, &index_name) + .await + }) + } + + fn index_info_by_table_id( + &self, + table_id: &str, + index_name: &str, + ) -> BoxFuture<'_, Result> { + let table_id = table_id.to_string(); + let index_name = index_name.to_string(); + Box::pin(async move { + self.fetch_index_info_by_table_id(&table_id, &index_name) + .await + }) + } + + // REQ-CTRL-003: UpdateTable — billing mode, throughput, deletion protection, GSI create/delete. 
+ fn update_table( + &self, + account_id: &str, + input: extenddb_core::types::UpdateTableInput, + ) -> BoxFuture<'_, Result> { + let account_id = account_id.to_string(); + Box::pin(async move { self.update_table_impl(&account_id, input).await }) + } +} diff --git a/crates/storage-tidb/src/table_helpers.rs b/crates/storage-tidb/src/table_helpers.rs new file mode 100755 index 0000000..61985f5 --- /dev/null +++ b/crates/storage-tidb/src/table_helpers.rs @@ -0,0 +1,230 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! Helper types and methods for `TableEngine` operations. + +use extenddb_core::types::{ + BillingMode, BillingModeSummary, GsiDescription, LsiDescription, + ProvisionedThroughputDescription, TableDescription, TableStatus, +}; +use extenddb_storage::error::StorageError; +use extenddb_storage::util::{index_arn, stream_arn}; + +use crate::TidbEngine; +use crate::throughput::zero_provisioned_throughput_description; + +/// Row type for table metadata queries. +#[derive(sqlx::FromRow)] +pub(crate) struct TableRow { + pub table_name: String, + pub key_schema: serde_json::Value, + pub attribute_definitions: serde_json::Value, + pub billing_mode: String, + pub provisioned_throughput: Option, + pub stream_specification: Option, + pub table_status: String, + pub creation_epoch: Option, + pub table_size_bytes: i64, + pub item_count: i64, + pub table_arn: String, + pub table_id: String, + pub deletion_protection_enabled: bool, + pub stream_label: Option, +} + +/// Row type for index metadata queries. 
+#[derive(sqlx::FromRow)] +pub(crate) struct IndexRow { + pub index_name: String, + pub index_type: String, + pub key_schema: serde_json::Value, + pub projection: serde_json::Value, + pub index_status: String, + pub provisioned_throughput: Option, +} + +impl TidbEngine { + pub(crate) async fn build_table_description( + &self, + account_id: &str, + table_name: &str, + ) -> Result { + let mut tx = self + .pool + .begin() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let row: Option = sqlx::query_as( + r"SELECT table_name, key_schema, attribute_definitions, billing_mode, + provisioned_throughput, stream_specification, table_status, + CAST(UNIX_TIMESTAMP(creation_date_time) AS DOUBLE) as creation_epoch, + table_size_bytes, item_count, table_arn, table_id, + deletion_protection_enabled, stream_label + FROM tables WHERE account_id = ? AND table_name = ?", + ) + .bind(account_id) + .bind(table_name) + .fetch_optional(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let row = row.ok_or_else(|| StorageError::TableNotFound(table_name.to_owned()))?; + + let index_rows: Vec = sqlx::query_as( + r"SELECT index_name, index_type, key_schema, projection, + index_status, provisioned_throughput + FROM indexes WHERE table_id = ?", + ) + .bind(&row.table_id) + .fetch_all(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + tx.commit() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + self.build_table_description_from_row(account_id, row, index_rows) + } + + pub(crate) fn build_table_description_from_row( + &self, + account_id: &str, + row: TableRow, + index_rows: Vec, + ) -> Result { + let mut gsis: Vec = Vec::new(); + let mut lsis: Vec = Vec::new(); + + for idx in index_rows { + let ks = serde_json::from_value(idx.key_schema) + .map_err(|e| StorageError::Internal(e.to_string()))?; + let proj = serde_json::from_value(idx.projection) + .map_err(|e| StorageError::Internal(e.to_string()))?; + 
+ match idx.index_type.as_str() { + "GSI" => { + let provisioned_throughput = idx + .provisioned_throughput + .map(serde_json::from_value) + .transpose() + .map_err(|e| StorageError::Internal(e.to_string()))? + .unwrap_or_else(zero_provisioned_throughput_description); + + gsis.push(GsiDescription { + index_name: idx.index_name.clone(), + key_schema: ks, + projection: proj, + index_status: idx.index_status, + provisioned_throughput: Some(provisioned_throughput), + index_size_bytes: 0, + item_count: 0, + index_arn: index_arn( + &self.region, + account_id, + &row.table_name, + &idx.index_name, + ), + }); + } + "LSI" => { + lsis.push(LsiDescription { + index_name: idx.index_name.clone(), + key_schema: ks, + projection: proj, + index_size_bytes: 0, + item_count: 0, + index_arn: index_arn( + &self.region, + account_id, + &row.table_name, + &idx.index_name, + ), + }); + } + other => { + return Err(StorageError::Internal(format!( + "unknown index type in database: {other}" + ))); + } + } + } + + let key_schema = serde_json::from_value(row.key_schema) + .map_err(|e| StorageError::Internal(e.to_string()))?; + let attr_defs = serde_json::from_value(row.attribute_definitions) + .map_err(|e| StorageError::Internal(e.to_string()))?; + let stream_spec = row + .stream_specification + .map(serde_json::from_value) + .transpose() + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let (rcu, wcu) = match &row.provisioned_throughput { + Some(v) => { + let pt: extenddb_core::types::ProvisionedThroughput = + serde_json::from_value(v.clone()) + .map_err(|e| StorageError::Internal(e.to_string()))?; + (pt.read_capacity_units, pt.write_capacity_units) + } + None => (0, 0), + }; + + let table_status = match row.table_status.as_str() { + "ACTIVE" => TableStatus::Active, + "CREATING" => TableStatus::Creating, + "DELETING" => TableStatus::Deleting, + "UPDATING" => TableStatus::Updating, + other => { + return Err(StorageError::Internal(format!( + "unknown table status in database: 
{other}" + ))); + } + }; + + let creation_epoch = row.creation_epoch.unwrap_or(0.0); + + let billing_mode_summary = if row.billing_mode == "PAY_PER_REQUEST" { + Some(BillingModeSummary { + billing_mode: BillingMode::PayPerRequest, + last_update_to_pay_per_request_date_time: Some(creation_epoch), + }) + } else { + None + }; + + let latest_stream_arn = row + .stream_label + .as_ref() + .map(|label| stream_arn(&self.region, account_id, &row.table_name, label)); + + Ok(TableDescription { + table_name: row.table_name, + key_schema, + attribute_definitions: attr_defs, + table_status, + creation_date_time: creation_epoch, + table_size_bytes: row.table_size_bytes, + item_count: row.item_count, + table_arn: row.table_arn, + table_id: row.table_id, + provisioned_throughput: ProvisionedThroughputDescription { + read_capacity_units: rcu, + write_capacity_units: wcu, + number_of_decreases_today: 0, + last_increase_date_time: None, + last_decrease_date_time: None, + }, + billing_mode_summary, + global_secondary_indexes: if gsis.is_empty() { None } else { Some(gsis) }, + local_secondary_indexes: if lsis.is_empty() { None } else { Some(lsis) }, + stream_specification: stream_spec, + latest_stream_arn, + latest_stream_label: row.stream_label, + deletion_protection_enabled: row.deletion_protection_enabled, + sse_description: None, + table_class_summary: None, + }) + } +} diff --git a/crates/storage-tidb/src/throughput.rs b/crates/storage-tidb/src/throughput.rs new file mode 100644 index 0000000..19f07b2 --- /dev/null +++ b/crates/storage-tidb/src/throughput.rs @@ -0,0 +1,35 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +use extenddb_core::types::{ProvisionedThroughput, ProvisionedThroughputDescription}; + +pub(crate) fn provisioned_throughput_description( + throughput: &ProvisionedThroughput, +) -> ProvisionedThroughputDescription { + ProvisionedThroughputDescription { + read_capacity_units: throughput.read_capacity_units, + 
write_capacity_units: throughput.write_capacity_units, + number_of_decreases_today: 0, + last_increase_date_time: None, + last_decrease_date_time: None, + } +} + +pub(crate) fn provisioned_throughput_from_description( + description: &ProvisionedThroughputDescription, +) -> ProvisionedThroughput { + ProvisionedThroughput { + read_capacity_units: description.read_capacity_units, + write_capacity_units: description.write_capacity_units, + } +} + +pub(crate) fn zero_provisioned_throughput_description() -> ProvisionedThroughputDescription { + ProvisionedThroughputDescription { + read_capacity_units: 0, + write_capacity_units: 0, + number_of_decreases_today: 0, + last_increase_date_time: None, + last_decrease_date_time: None, + } +} diff --git a/crates/storage-tidb/src/tidb_util.rs b/crates/storage-tidb/src/tidb_util.rs new file mode 100755 index 0000000..6e82525 --- /dev/null +++ b/crates/storage-tidb/src/tidb_util.rs @@ -0,0 +1,20 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! Shared TiDB error classification helpers. + +/// Check if a sqlx error is a unique constraint violation (MySQL/TiDB code 1062). +pub(crate) fn is_unique_violation(e: &sqlx::Error) -> bool { + if let sqlx::Error::Database(db_err) = e { + return db_err.kind() == sqlx::error::ErrorKind::UniqueViolation; + } + false +} + +/// Check if a sqlx error is a foreign key violation (MySQL/TiDB code 1451/1452). +pub(crate) fn is_fk_violation(e: &sqlx::Error) -> bool { + if let sqlx::Error::Database(db_err) = e { + return db_err.kind() == sqlx::error::ErrorKind::ForeignKeyViolation; + } + false +} diff --git a/crates/storage-tidb/src/ttl_worker.rs b/crates/storage-tidb/src/ttl_worker.rs new file mode 100644 index 0000000..95ef524 --- /dev/null +++ b/crates/storage-tidb/src/ttl_worker.rs @@ -0,0 +1,184 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! TTL cleanup background worker for TiDB. 
+ +use std::sync::Arc; +use std::time::Duration; + +use extenddb_core::metrics::MetricsCollector; +use extenddb_core::types::UserIdentity; +use extenddb_storage::error::StorageError; +use extenddb_storage::{DataEngine, MetadataEngine, TableEngine}; + +use crate::TidbEngine; + +const SCAN_INTERVAL: Duration = Duration::from_secs(60); +const BATCH_SIZE: usize = 100; + +/// TTL cleanup worker that periodically scans for and deletes expired items. +pub(crate) async fn ttl_cleanup_worker(storage: Arc, metrics: Arc) { + let region_arc: Arc = Arc::from(storage.region.as_str()); + + loop { + tokio::time::sleep(SCAN_INTERVAL).await; + sweep_expired_items(&storage, &metrics, ®ion_arc).await; + } +} + +async fn sweep_expired_items(storage: &TidbEngine, metrics: &MetricsCollector, region: &Arc) { + let ttl_identity = UserIdentity { + identity_type: "Service".to_owned(), + principal_id: "dynamodb.amazonaws.com".to_owned(), + }; + + let tables = match storage.streaming_ttl_tables_ready().await { + Ok(t) => t, + Err(e) => { + tracing::warn!("TTL worker: failed to list tables: {e}"); + return; + } + }; + + let now_epoch = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + + for (account_id, table_name, ttl_attribute) in &tables { + let items = match MetadataEngine::find_expired_items_indexed( + storage, + account_id, + table_name, + ttl_attribute, + BATCH_SIZE, + ) + .await + { + Ok(items) => items, + Err(e) => { + tracing::warn!("TTL worker: find expired failed for {table_name}: {e}"); + continue; + } + }; + + if items.is_empty() { + continue; + } + + let key_info = match TableEngine::table_key_info(storage, account_id, table_name).await { + Ok(ki) => ki, + Err(e) => { + tracing::warn!("TTL worker: key info failed for {table_name}: {e}"); + continue; + } + }; + + let view_type = stream_view_type(&key_info); + let (condition_expr, maps) = build_ttl_condition(ttl_attribute, now_epoch); + + let mut deleted = 0usize; + for 
item in &items { + let staleness = item + .get(ttl_attribute.as_str()) + .and_then(|av| { + if let extenddb_core::types::AttributeValue::N(n) = av { + n.parse::().ok() + } else { + None + } + }) + .map(|ttl_val| now_epoch.saturating_sub(ttl_val)); + + let key: extenddb_core::types::Item = key_info + .key_schema + .iter() + .filter_map(|ks| { + item.get(&ks.attribute_name) + .map(|v| (ks.attribute_name.clone(), v.clone())) + }) + .collect(); + + let return_old = view_type.is_some(); + let stream = view_type.map(|vt| extenddb_storage::StreamCapture { + view_type: vt, + user_identity: Some(ttl_identity.clone()), + region: region.clone(), + }); + match DataEngine::delete_item( + storage, + &key_info, + &key, + return_old, + Some(&condition_expr), + &maps, + stream.as_ref(), + ) + .await + { + Err(StorageError::ConditionFailed(_)) => {} + Err(e) => { + tracing::warn!("TTL worker: delete failed for {table_name}: {e}"); + } + Ok(_old_item) => { + deleted += 1; + metrics.record_ttl_deletion(table_name); + if let Some(s) = staleness { + #[allow(clippy::cast_precision_loss)] + metrics.record_ttl_staleness(table_name, s as f64); + } + } + } + } + + if deleted > 0 { + tracing::info!("TTL worker: deleted {deleted} expired items from {table_name}"); + } + } +} + +fn stream_view_type( + key_info: &extenddb_core::types::TableKeyInfo, +) -> Option { + key_info.stream_specification.as_ref().and_then(|spec| { + if spec.stream_enabled { + spec.stream_view_type + } else { + None + } + }) +} + +fn build_ttl_condition( + ttl_attribute: &str, + now_epoch: u64, +) -> ( + extenddb_core::expression::Expr, + extenddb_core::expression::ExpressionMaps, +) { + use extenddb_core::expression::{CompareOp, Expr, ExpressionMaps, PathElement}; + use std::collections::HashMap; + + let ttl_path = vec![PathElement::Attribute("#ttl".to_owned())]; + let condition_expr = Expr::And( + Box::new(Expr::Function { + name: "attribute_exists".to_owned(), + args: vec![Expr::Path(ttl_path.clone())], + }), + 
Box::new(Expr::Compare { + left: Box::new(Expr::Path(ttl_path)), + op: CompareOp::Le, + right: Box::new(Expr::Placeholder("now".to_owned())), + }), + ); + + let mut names = HashMap::new(); + names.insert("ttl".to_owned(), ttl_attribute.to_owned()); + let mut values = HashMap::new(); + values.insert( + "now".to_owned(), + extenddb_core::types::AttributeValue::N(now_epoch.to_string()), + ); + + (condition_expr, ExpressionMaps::new(names, values)) +} diff --git a/crates/storage-tidb/src/update_table.rs b/crates/storage-tidb/src/update_table.rs new file mode 100755 index 0000000..755e73b --- /dev/null +++ b/crates/storage-tidb/src/update_table.rs @@ -0,0 +1,308 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! `update_table` implementation for `TidbEngine`. + +use extenddb_core::types::{ + BillingMode, ProvisionedThroughput, TableDescription, UpdateTableInput, +}; +use extenddb_storage::error::StorageError; + +use crate::TidbEngine; +use crate::throughput::provisioned_throughput_description; + +impl TidbEngine { + /// Core implementation of `update_table` (REQ-CTRL-003). + pub(crate) async fn update_table_impl( + &self, + account_id: &str, + input: UpdateTableInput, + ) -> Result { + Self::validate_account_id(account_id)?; + let mut tx = self + .pool + .begin() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + // Lock the row and fetch the durable table id used by data artifacts. + let row: Option<(String, String, Option, bool)> = sqlx::query_as( + "SELECT table_status, table_id, ttl_attribute, ttl_native_enabled FROM tables \ + WHERE account_id = ? AND table_name = ? 
FOR UPDATE", + ) + .bind(account_id) + .bind(&input.table_name) + .fetch_optional(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let (status, table_id, ttl_attribute, ttl_native_enabled) = + row.ok_or_else(|| StorageError::TableNotFound(input.table_name.clone()))?; + if status != "ACTIVE" { + return Err(StorageError::TableNotActive(input.table_name.clone())); + } + let has_gsi_updates = input + .global_secondary_index_updates + .as_ref() + .is_some_and(|updates| !updates.is_empty()); + let enables_stream = input + .stream_specification + .as_ref() + .is_some_and(|spec| spec.stream_enabled); + let changes_stream = input.stream_specification.is_some(); + let reconfigures_ttl = changes_stream && ttl_attribute.is_some(); + let has_control_plane_updates = has_gsi_updates || enables_stream || reconfigures_ttl; + let must_disable_native_ttl_before_stream_visible = enables_stream && ttl_native_enabled; + + // No-op rejection: setting same billing mode to PROVISIONED with same + // throughput values is rejected by DynamoDB. This check runs under the + // FOR UPDATE lock to eliminate the TOCTOU race that existed when the + // check was in the engine layer. + if matches!(input.billing_mode, Some(BillingMode::Provisioned)) { + if let Some(ref pt) = input.provisioned_throughput { + let current_row: Option<(Option, Option)> = + sqlx::query_as( + "SELECT billing_mode, provisioned_throughput FROM tables \ + WHERE account_id = ? 
AND table_name = ?", + ) + .bind(account_id) + .bind(&input.table_name) + .fetch_optional(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + if let Some((current_bm, current_pt_opt)) = current_row { + let is_provisioned = + current_bm.as_deref() == Some("PROVISIONED") || current_bm.is_none(); + let current_pt: Option = current_pt_opt + .map(serde_json::from_value) + .transpose() + .map_err(|e| StorageError::Internal(e.to_string()))?; + let (current_rcu, current_wcu) = current_pt.as_ref().map_or((0, 0), |pt| { + (pt.read_capacity_units, pt.write_capacity_units) + }); + + if is_provisioned + && current_rcu == pt.read_capacity_units + && current_wcu == pt.write_capacity_units + { + return Err(StorageError::NoOpUpdate(format!( + "The provisioned throughput for the table will not change. \ + The requested value equals the current value. \ + Current ReadCapacityUnits provisioned for the table: {}. \ + Requested ReadCapacityUnits: {}. \ + Current WriteCapacityUnits provisioned for the table: {}. \ + Requested WriteCapacityUnits: {}.", + current_rcu, + pt.read_capacity_units, + current_wcu, + pt.write_capacity_units + ))); + } + } + } + } + + if has_control_plane_updates { + sqlx::query( + "UPDATE tables SET table_status = 'UPDATING', \ + status_transition_at = CURRENT_TIMESTAMP(6) \ + WHERE account_id = ? AND table_name = ?", + ) + .bind(account_id) + .bind(&input.table_name) + .execute(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } + + // Apply billing mode change. + if let Some(bm) = &input.billing_mode { + let bm_str = match bm { + BillingMode::Provisioned => "PROVISIONED", + BillingMode::PayPerRequest => "PAY_PER_REQUEST", + }; + sqlx::query( + "UPDATE tables SET billing_mode = ? WHERE account_id = ? 
AND table_name = ?", + ) + .bind(bm_str) + .bind(account_id) + .bind(&input.table_name) + .execute(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } + + // Apply provisioned throughput change. + if let Some(pt) = &input.provisioned_throughput { + let pt_json = + serde_json::to_value(pt).map_err(|e| StorageError::Internal(e.to_string()))?; + sqlx::query("UPDATE tables SET provisioned_throughput = ? WHERE account_id = ? AND table_name = ?") + .bind(&pt_json) + .bind(account_id) + .bind(&input.table_name) + .execute(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } + + // Apply deletion protection change. + if let Some(dp) = input.deletion_protection_enabled { + sqlx::query("UPDATE tables SET deletion_protection_enabled = ? WHERE account_id = ? AND table_name = ?") + .bind(dp) + .bind(account_id) + .bind(&input.table_name) + .execute(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } + + // Apply GSI updates (create/delete). + if let Some(updates) = &input.global_secondary_index_updates { + for update in updates { + if let Some(create) = &update.create { + // Check for duplicate index name. + let existing: Option<(String,)> = sqlx::query_as( + "SELECT index_name FROM indexes WHERE table_id = ? 
AND index_name = ?", + ) + .bind(&table_id) + .bind(&create.index_name) + .fetch_optional(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + if existing.is_some() { + return Err(StorageError::IndexAlreadyExists(create.index_name.clone())); + } + + let gsi_ks = serde_json::to_value(&create.key_schema) + .map_err(|e| StorageError::Internal(e.to_string()))?; + let gsi_proj = serde_json::to_value(&create.projection) + .map_err(|e| StorageError::Internal(e.to_string()))?; + let gsi_pt_description = create + .provisioned_throughput + .as_ref() + .map(provisioned_throughput_description); + let gsi_pt = gsi_pt_description + .as_ref() + .map(serde_json::to_value) + .transpose() + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let index_id = uuid::Uuid::new_v4().to_string(); + sqlx::query( + r"INSERT INTO indexes + (table_id, index_name, index_id, index_type, key_schema, projection, + index_status, provisioned_throughput) + VALUES (?, ?, ?, 'GSI', ?, ?, 'CREATING', ?)", + ) + .bind(&table_id) + .bind(&create.index_name) + .bind(&index_id) + .bind(&gsi_ks) + .bind(&gsi_proj) + .bind(&gsi_pt) + .execute(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } + + if let Some(delete) = &update.delete { + // Verify the index exists and fetch its index_id. + let existing: Option<(String,)> = sqlx::query_as( + "SELECT index_name FROM indexes \ + WHERE table_id = ? AND index_name = ? AND index_type = 'GSI' \ + AND index_status = 'ACTIVE'", + ) + .bind(&table_id) + .bind(&delete.index_name) + .fetch_optional(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let (_existing_name,) = existing + .ok_or_else(|| StorageError::IndexNotFound(delete.index_name.clone()))?; + + // Hide the index from read/write paths immediately. Metadata + // is deleted only after the TiDB data table is dropped. + sqlx::query( + "UPDATE indexes SET index_status = 'DELETING' \ + WHERE table_id = ? AND index_name = ? 
AND index_type = 'GSI'", + ) + .bind(&table_id) + .bind(&delete.index_name) + .execute(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } + } + + // Update attribute_definitions on the table if new ones were provided. + if let Some(new_attr_defs) = &input.attribute_definitions { + let ad_json = serde_json::to_value(new_attr_defs) + .map_err(|e| StorageError::Internal(e.to_string()))?; + sqlx::query("UPDATE tables SET attribute_definitions = ? WHERE account_id = ? AND table_name = ?") + .bind(&ad_json) + .bind(account_id) + .bind(&input.table_name) + .execute(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } + } + + // Apply stream specification after user-visible validation has passed. + // Stream metadata must not become visible until the TiDB artifacts needed + // to capture every write are already present. + if let Some(spec) = &input.stream_specification { + if spec.stream_enabled { + Self::ensure_stream_shard_rows(&self.data_pool, &table_id).await?; + } + if must_disable_native_ttl_before_stream_visible { + self.disable_native_ttl_for_table_id(&table_id).await?; + } + + let spec_json = + serde_json::to_value(spec).map_err(|e| StorageError::Internal(e.to_string()))?; + let new_label = spec.stream_enabled.then(Self::new_stream_label); + sqlx::query( + "UPDATE tables SET stream_specification = ?, \ + stream_label = CASE WHEN ? THEN COALESCE(stream_label, ?) ELSE stream_label END \ + WHERE account_id = ? AND table_name = ?", + ) + .bind(&spec_json) + .bind(spec.stream_enabled) + .bind(&new_label) + .bind(account_id) + .bind(&input.table_name) + .execute(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + if reconfigures_ttl { + sqlx::query( + "UPDATE tables SET ttl_index_ready = FALSE, ttl_native_enabled = FALSE \ + WHERE account_id = ? 
AND table_name = ?", + ) + .bind(account_id) + .bind(&input.table_name) + .execute(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } + } + + tx.commit() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let desc = self + .build_table_description(account_id, &input.table_name) + .await?; + if has_control_plane_updates { + self.control_plane_notify.notify_one(); + } + + Ok(desc) + } +} diff --git a/crates/storage-tidb/src/worker_store.rs b/crates/storage-tidb/src/worker_store.rs new file mode 100755 index 0000000..d52ba81 --- /dev/null +++ b/crates/storage-tidb/src/worker_store.rs @@ -0,0 +1,647 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! `WorkerStore` trait implementation and control plane transition processing. + +use futures::future::BoxFuture; + +use extenddb_core::types::{AttributeDefinition, KeySchemaElement, StreamSpecification}; +use extenddb_storage::error::StorageError; +use extenddb_storage::{MetadataEngine, WorkerStore}; + +use crate::TidbEngine; + +type CreatingTableRow = ( + String, + String, + serde_json::Value, + serde_json::Value, + Option, + Option, +); +type CreateIndexRow = (String, serde_json::Value); +type UpdatingTableRow = ( + String, + String, + serde_json::Value, + serde_json::Value, + Option, + Option, + Option, + bool, +); +type PendingIndexRow = (String, String, String, serde_json::Value); + +const CONTROL_PLANE_LEASE_SECONDS: i64 = 60; + +struct CreateReconcilePlan { + table_name: String, + key_schema: Vec, + attr_defs: Vec, + stream_enabled: bool, + indexes: Vec<(String, Vec)>, + token: String, +} + +struct UpdateReconcilePlan { + account_id: String, + table_name: String, + base_key_schema: Vec, + base_attr_defs: Vec, + stream_enabled: bool, + ttl_attribute: Option, + ttl_index_ready: bool, + pending_indexes: Vec, + token: String, +} + +struct PendingIndexPlan { + index_id: String, + index_name: String, + index_status: String, + key_schema: 
Vec, +} + +struct DeleteReconcilePlan { + table_name: String, + table_arn: String, + table_id: String, + token: String, +} + +fn parse_json( + value: serde_json::Value, + label: &str, +) -> Result { + serde_json::from_value(value) + .map_err(|e| StorageError::Internal(format!("invalid {label}: {e}"))) +} + +impl WorkerStore for TidbEngine { + fn process_control_plane_transitions( + &self, + ) -> BoxFuture<'_, Result, StorageError>> { + Box::pin(async move { + // Delegate to the inherent method. + Self::process_control_plane_transitions(self).await + }) + } +} + +impl TidbEngine { + async fn refresh_control_plane_lease( + &self, + table_id: &str, + token: &str, + ) -> Result<(), StorageError> { + let result = sqlx::query( + "UPDATE tables \ + SET control_plane_lease_until = DATE_ADD(CURRENT_TIMESTAMP(6), INTERVAL ? SECOND) \ + WHERE table_id = ? AND control_plane_token = ?", + ) + .bind(CONTROL_PLANE_LEASE_SECONDS) + .bind(table_id) + .bind(token) + .execute(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + if result.rows_affected() == 0 { + return Err(StorageError::Internal( + "lost TiDB control-plane lease".to_owned(), + )); + } + Ok(()) + } + + async fn drop_table_data_artifacts(&self, table_id: &str) -> Result<(), StorageError> { + sqlx::query("DELETE FROM stream_shards WHERE table_id = ?") + .bind(table_id) + .execute(&self.data_pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + Self::drop_data_table(&self.data_pool, table_id).await?; + Ok(()) + } + + /// Reconcile a CREATING table by creating all TiDB data artifacts from the + /// durable catalog row, then activating the table once its transition time + /// has arrived. 
+ pub(crate) async fn reconcile_table_create( + &self, + table_id: &str, + include_deferred: bool, + ) -> Result, StorageError> { + let token = uuid::Uuid::new_v4().to_string(); + let mut tx = self + .pool + .begin() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let query = if include_deferred { + "SELECT account_id, table_name, key_schema, attribute_definitions, \ + stream_specification, stream_label \ + FROM tables \ + WHERE table_id = ? AND table_status = 'CREATING' \ + AND (control_plane_lease_until IS NULL \ + OR control_plane_lease_until <= CURRENT_TIMESTAMP(6)) \ + FOR UPDATE" + } else { + "SELECT account_id, table_name, key_schema, attribute_definitions, \ + stream_specification, stream_label \ + FROM tables \ + WHERE table_id = ? AND table_status = 'CREATING' \ + AND status_transition_at IS NOT NULL \ + AND (control_plane_lease_until IS NULL \ + OR control_plane_lease_until <= CURRENT_TIMESTAMP(6)) \ + FOR UPDATE" + }; + let row: Option = sqlx::query_as(query) + .bind(table_id) + .fetch_optional(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let Some(( + account_id, + table_name, + key_schema_json, + attr_defs_json, + stream_json, + stream_label, + )) = row + else { + tx.commit() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + return Ok(None); + }; + + let key_schema: Vec = parse_json(key_schema_json, "table key schema")?; + let attr_defs: Vec = + parse_json(attr_defs_json, "table attribute definitions")?; + let stream_spec: Option = stream_json + .map(|v| parse_json(v, "stream specification")) + .transpose()?; + let stream_enabled = stream_spec.as_ref().is_some_and(|spec| spec.stream_enabled); + + if stream_enabled { + Self::ensure_stream_label(&mut tx, &account_id, &table_name, stream_label).await?; + } + + let index_rows: Vec = + sqlx::query_as("SELECT index_id, key_schema FROM indexes WHERE table_id = ?") + .bind(table_id) + .fetch_all(&mut *tx) + .await + .map_err(|e| 
StorageError::Internal(e.to_string()))?; + let indexes = index_rows + .into_iter() + .map(|(index_id, index_key_schema_json)| { + parse_json(index_key_schema_json, "index key schema") + .map(|index_key_schema| (index_id, index_key_schema)) + }) + .collect::, _>>()?; + + sqlx::query( + "UPDATE tables \ + SET control_plane_token = ?, \ + control_plane_lease_until = DATE_ADD(CURRENT_TIMESTAMP(6), INTERVAL ? SECOND) \ + WHERE table_id = ? AND table_status = 'CREATING'", + ) + .bind(&token) + .bind(CONTROL_PLANE_LEASE_SECONDS) + .bind(table_id) + .execute(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + tx.commit() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let plan = CreateReconcilePlan { + table_name, + key_schema, + attr_defs, + stream_enabled, + indexes, + token, + }; + + Self::create_data_table(&self.data_pool, table_id, &plan.key_schema, &plan.attr_defs) + .await?; + self.refresh_control_plane_lease(table_id, &plan.token) + .await?; + + for (index_id, index_key_schema) in &plan.indexes { + Self::create_index_artifacts( + &self.data_pool, + table_id, + index_id, + index_key_schema, + &plan.attr_defs, + &plan.key_schema, + &plan.attr_defs, + ) + .await?; + self.refresh_control_plane_lease(table_id, &plan.token) + .await?; + } + + if plan.stream_enabled { + Self::ensure_stream_shard_rows(&self.data_pool, table_id).await?; + self.refresh_control_plane_lease(table_id, &plan.token) + .await?; + } + + let result = sqlx::query( + "UPDATE tables \ + SET table_status = 'ACTIVE', status_transition_at = NULL, \ + control_plane_token = NULL, control_plane_lease_until = NULL \ + WHERE table_id = ? AND table_status = 'CREATING' \ + AND control_plane_token = ? 
\ + AND status_transition_at <= CURRENT_TIMESTAMP(6)", + ) + .bind(table_id) + .bind(&plan.token) + .execute(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + if result.rows_affected() > 0 { + Ok(Some(plan.table_name)) + } else { + sqlx::query( + "UPDATE tables \ + SET control_plane_token = NULL, control_plane_lease_until = NULL \ + WHERE table_id = ? AND table_status = 'CREATING' \ + AND control_plane_token = ?", + ) + .bind(table_id) + .bind(&plan.token) + .execute(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + Ok(None) + } + } + + /// Reconcile an UPDATING table. Pending GSI creates/deletes and stream + /// shard initialization are retried from catalog metadata until complete. + async fn reconcile_table_update(&self, table_id: &str) -> Result, StorageError> { + let token = uuid::Uuid::new_v4().to_string(); + let mut tx = self + .pool + .begin() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let row: Option = sqlx::query_as( + "SELECT account_id, table_name, key_schema, attribute_definitions, \ + stream_specification, stream_label, ttl_attribute, ttl_index_ready \ + FROM tables \ + WHERE table_id = ? 
AND table_status = 'UPDATING' \ + AND (control_plane_lease_until IS NULL \ + OR control_plane_lease_until <= CURRENT_TIMESTAMP(6)) \ + FOR UPDATE", + ) + .bind(table_id) + .fetch_optional(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let Some(( + account_id, + table_name, + key_schema_json, + attr_defs_json, + stream_json, + stream_label, + ttl_attribute, + ttl_index_ready, + )) = row + else { + tx.commit() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + return Ok(None); + }; + + let base_key_schema: Vec = + parse_json(key_schema_json, "table key schema")?; + let base_attr_defs: Vec = + parse_json(attr_defs_json, "table attribute definitions")?; + let stream_spec: Option = stream_json + .map(|v| parse_json(v, "stream specification")) + .transpose()?; + let stream_enabled = stream_spec.as_ref().is_some_and(|spec| spec.stream_enabled); + + if stream_enabled { + Self::ensure_stream_label(&mut tx, &account_id, &table_name, stream_label).await?; + } + + let pending_indexes: Vec = sqlx::query_as( + "SELECT index_id, index_name, index_status, key_schema \ + FROM indexes \ + WHERE table_id = ? AND index_type = 'GSI' \ + AND index_status IN ('CREATING', 'DELETING') \ + ORDER BY index_name", + ) + .bind(table_id) + .fetch_all(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let pending_indexes = pending_indexes + .into_iter() + .map(|(index_id, index_name, index_status, key_schema_json)| { + let key_schema = parse_json(key_schema_json, "index key schema")?; + Ok(PendingIndexPlan { + index_id, + index_name, + index_status, + key_schema, + }) + }) + .collect::, StorageError>>()?; + + sqlx::query( + "UPDATE tables \ + SET control_plane_token = ?, \ + control_plane_lease_until = DATE_ADD(CURRENT_TIMESTAMP(6), INTERVAL ? SECOND) \ + WHERE table_id = ? 
AND table_status = 'UPDATING'", + ) + .bind(&token) + .bind(CONTROL_PLANE_LEASE_SECONDS) + .bind(table_id) + .execute(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + tx.commit() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let plan = UpdateReconcilePlan { + account_id, + table_name, + base_key_schema, + base_attr_defs, + stream_enabled, + ttl_attribute, + ttl_index_ready, + pending_indexes, + token, + }; + + if plan.stream_enabled { + Self::ensure_stream_shard_rows(&self.data_pool, table_id).await?; + self.refresh_control_plane_lease(table_id, &plan.token) + .await?; + } + + if let Some(ttl_attribute) = &plan.ttl_attribute + && !plan.ttl_index_ready + { + MetadataEngine::create_ttl_index( + self, + &plan.account_id, + &plan.table_name, + ttl_attribute, + ) + .await?; + self.refresh_control_plane_lease(table_id, &plan.token) + .await?; + } + + for pending in &plan.pending_indexes { + match pending.index_status.as_str() { + "CREATING" => { + Self::create_index_artifacts( + &self.data_pool, + table_id, + &pending.index_id, + &pending.key_schema, + &plan.base_attr_defs, + &plan.base_key_schema, + &plan.base_attr_defs, + ) + .await?; + self.refresh_control_plane_lease(table_id, &plan.token) + .await?; + + sqlx::query( + "UPDATE indexes SET index_status = 'ACTIVE' \ + WHERE table_id = ? AND index_id = ? AND index_status = 'CREATING'", + ) + .bind(table_id) + .bind(&pending.index_id) + .execute(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } + "DELETING" => { + Self::drop_index_artifacts( + &self.data_pool, + table_id, + &pending.index_id, + &pending.key_schema, + &plan.base_attr_defs, + ) + .await?; + self.refresh_control_plane_lease(table_id, &plan.token) + .await?; + sqlx::query( + "DELETE FROM indexes \ + WHERE table_id = ? AND index_id = ? 
AND index_status = 'DELETING'", + ) + .bind(table_id) + .bind(&pending.index_id) + .execute(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } + other => { + return Err(StorageError::Internal(format!( + "unknown pending GSI status for {}: {other}", + pending.index_name + ))); + } + } + } + + let result = sqlx::query( + "UPDATE tables \ + SET table_status = 'ACTIVE', status_transition_at = NULL, \ + control_plane_token = NULL, control_plane_lease_until = NULL \ + WHERE table_id = ? AND table_status = 'UPDATING' \ + AND control_plane_token = ?", + ) + .bind(table_id) + .bind(&plan.token) + .execute(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + if result.rows_affected() > 0 { + Ok(Some(plan.table_name)) + } else { + Ok(None) + } + } + + /// Process pending control plane transitions. + /// + /// Tables in CREATING state whose `status_transition_at` has passed have + /// their TiDB data artifacts created and are moved to ACTIVE. Tables in + /// UPDATING state reconcile pending GSI/stream work before returning to + /// ACTIVE. Tables in DELETING state whose transition time has passed are + /// removed (along with their indexes and tags). + /// + /// Called by the background poller in `cmd_serve`. Also called at startup + /// to recover in-flight operations from a previous server instance. + /// + /// Returns a list of `(table_name, transition)` pairs describing what + /// changed, so the caller can log meaningful state-change messages (D-4). + /// + /// # Errors + /// + /// Returns [`StorageError`] if the database is unreachable or a query fails. + pub async fn process_control_plane_transitions( + &self, + ) -> Result, StorageError> { + let mut transitions = Vec::new(); + + // CREATING → ACTIVE, with data artifacts created by the reconciler. 
+ let pending_creates: Vec<(String, String)> = sqlx::query_as( + r"SELECT table_name, table_id FROM tables + WHERE table_status = 'CREATING' + AND status_transition_at <= CURRENT_TIMESTAMP(6) + AND (control_plane_lease_until IS NULL + OR control_plane_lease_until <= CURRENT_TIMESTAMP(6))", + ) + .fetch_all(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + for (name, table_id) in pending_creates { + if self + .reconcile_table_create(&table_id, false) + .await? + .is_some() + { + transitions.push((name, "CREATING → active")); + } + } + + // UPDATING → ACTIVE after pending GSI and stream artifacts are reconciled. + let pending_updates: Vec<(String, String)> = sqlx::query_as( + r"SELECT table_name, table_id FROM tables + WHERE table_status = 'UPDATING' + AND (status_transition_at IS NULL OR status_transition_at <= CURRENT_TIMESTAMP(6)) + AND (control_plane_lease_until IS NULL + OR control_plane_lease_until <= CURRENT_TIMESTAMP(6))", + ) + .fetch_all(&self.pool) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + for (name, table_id) in pending_updates { + if self.reconcile_table_update(&table_id).await?.is_some() { + transitions.push((name, "UPDATING → active")); + } + } + + // DELETING → remove row (with tags and data table cleanup). + // + // Strategy: SELECT ... FOR UPDATE SKIP LOCKED to make short durable + // claims, commit the catalog transaction, drop TiDB data artifacts, then + // delete catalog metadata in a second short transaction. 
+ let mut tx = self + .pool + .begin() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let candidates: Vec<(String, String, String, String)> = sqlx::query_as( + r"SELECT account_id, table_name, table_arn, table_id FROM tables + WHERE table_status = 'DELETING' AND status_transition_at <= CURRENT_TIMESTAMP(6) + AND (control_plane_lease_until IS NULL + OR control_plane_lease_until <= CURRENT_TIMESTAMP(6)) + FOR UPDATE SKIP LOCKED", + ) + .fetch_all(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let mut drop_info = Vec::new(); + + for (_acct_id, name, arn, table_id) in &candidates { + let token = uuid::Uuid::new_v4().to_string(); + sqlx::query( + "UPDATE tables \ + SET control_plane_token = ?, \ + control_plane_lease_until = DATE_ADD(CURRENT_TIMESTAMP(6), INTERVAL ? SECOND) \ + WHERE table_id = ? AND table_status = 'DELETING'", + ) + .bind(&token) + .bind(CONTROL_PLANE_LEASE_SECONDS) + .bind(table_id) + .execute(&mut *tx) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + drop_info.push(DeleteReconcilePlan { + table_name: name.clone(), + table_arn: arn.clone(), + table_id: table_id.clone(), + token, + }); + } + + tx.commit() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + for plan in &drop_info { + self.drop_table_data_artifacts(&plan.table_id).await?; + self.refresh_control_plane_lease(&plan.table_id, &plan.token) + .await?; + + let mut finalize = self + .pool + .begin() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + let result = sqlx::query( + "DELETE FROM tables \ + WHERE table_id = ? 
AND table_status = 'DELETING' \ + AND control_plane_token = ?", + ) + .bind(&plan.table_id) + .bind(&plan.token) + .execute(&mut *finalize) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + if result.rows_affected() > 0 { + sqlx::query("DELETE FROM tags WHERE resource_arn = ?") + .bind(&plan.table_arn) + .execute(&mut *finalize) + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + + transitions.push((plan.table_name.clone(), "DELETING → deleted")); + } + + finalize + .commit() + .await + .map_err(|e| StorageError::Internal(e.to_string()))?; + } + + Ok(transitions) + } +} diff --git a/crates/storage-tidb/src/workers.rs b/crates/storage-tidb/src/workers.rs new file mode 100644 index 0000000..9339fe8 --- /dev/null +++ b/crates/storage-tidb/src/workers.rs @@ -0,0 +1,114 @@ +// Copyright 2026 ExtendDB contributors +// SPDX-License-Identifier: Apache-2.0 + +//! TiDB-specific background workers. + +use std::sync::Arc; +use std::time::Duration; + +use extenddb_core::metrics::MetricsCollector; +use extenddb_storage::MetadataEngine; +use extenddb_storage::management_store::SettingsStore; +use sqlx::MySqlPool; + +use crate::TidbEngine; + +pub(crate) async fn poll_control_plane_transitions( + storage: Arc, + notify: Arc, + settings: Arc, +) { + const ACTIVE_POLL: Duration = Duration::from_secs(1); + const IDLE_TIMEOUT: Duration = Duration::from_secs(60); + const MARGIN_SECS: f64 = 5.0; + + loop { + // Idle: wait for a wake signal or timeout (defensive sweep) + let _ = tokio::time::timeout(IDLE_TIMEOUT, notify.notified()).await; + + // Read control_plane_delay_seconds from settings to compute active window + let delay_secs = read_control_plane_delay(&*settings).await; + let active_window = Duration::from_secs_f64(delay_secs + MARGIN_SECS); + + // Active: poll every second for active_window + let deadline = tokio::time::Instant::now() + active_window; + loop { + match storage.process_control_plane_transitions().await { + Ok(ref t) if t.is_empty() 
=> {} + Ok(transitions) => { + for (name, transition) in &transitions { + tracing::info!("Table '{name}': {transition}"); + } + } + Err(e) => { + tracing::warn!("Control plane transition poll failed: {e}"); + break; + } + } + if tokio::time::Instant::now() >= deadline { + break; + } + tokio::time::sleep(ACTIVE_POLL).await; + } + } +} + +async fn read_control_plane_delay(store: &S) -> f64 { + store + .get_setting("control_plane_delay_seconds") + .await + .ok() + .flatten() + .and_then(|v| v.parse::().ok()) + .filter(|&v| v >= 0.0) + .unwrap_or(0.25) +} + +pub(crate) async fn table_size_refresh_worker(storage: Arc) { + const REFRESH_INTERVAL: Duration = Duration::from_secs(300); + + loop { + tokio::time::sleep(REFRESH_INTERVAL).await; + + let tables = match MetadataEngine::all_active_tables(&*storage).await { + Ok(t) => t, + Err(e) => { + tracing::warn!("Size refresh worker: failed to list tables: {e}"); + continue; + } + }; + + for (account_id, table_name) in &tables { + if let Err(e) = + MetadataEngine::refresh_table_size(&*storage, account_id, table_name).await + { + tracing::warn!("Size refresh worker: failed for {table_name}: {e}"); + } + } + } +} + +pub(crate) async fn pool_metrics_worker( + catalog_pool: MySqlPool, + data_pool: MySqlPool, + metrics: Arc, +) { + const SAMPLE_INTERVAL: Duration = Duration::from_secs(5); + + loop { + tokio::time::sleep(SAMPLE_INTERVAL).await; + + let catalog_size = catalog_pool.size() as usize; + let catalog_idle = catalog_pool.num_idle(); + let data_size = data_pool.size() as usize; + let data_idle = data_pool.num_idle(); + + // Combined pool stats (catalog + data) + let total_active = + (catalog_size.saturating_sub(catalog_idle)) + (data_size.saturating_sub(data_idle)); + let total_idle = catalog_idle + data_idle; + + #[allow(clippy::cast_possible_truncation)] + metrics.record_pool_state(total_active as u32, total_idle as u32); + } +} diff --git a/crates/storage/src/bootstrapper.rs b/crates/storage/src/bootstrapper.rs index 
28052a2..a7abd2d 100755 --- a/crates/storage/src/bootstrapper.rs +++ b/crates/storage/src/bootstrapper.rs @@ -3,9 +3,10 @@ //! Bootstrapper storage trait for init/destroy/migrate operations. //! -//! These operations are inherently backend-specific (e.g., `CREATE DATABASE` -//! is PostgreSQL DDL). The trait abstracts the high-level operations so the -//! CLI commands don't depend on a specific storage backend. +//! These operations are inherently backend-specific because database, schema, +//! and role DDL differs across storage engines. The trait abstracts the +//! high-level operations so the CLI commands don't depend on a specific +//! storage backend. use async_trait::async_trait; @@ -27,6 +28,23 @@ pub struct BootstrapConfig { pub data_db: String, } +/// CLI-provided bootstrap overrides after argument parsing. +/// +/// Storage backends merge these typed values with their config-file defaults. +/// The CLI owns spelling, aliases, and `--flag=value` parsing; backend crates +/// should not inspect raw process arguments. +#[derive(Debug, Clone, Default)] +pub struct BootstrapOptions { + pub storage_host: Option, + pub storage_port: Option, + pub admin_user: Option, + pub admin_password: Option, + pub data_db: Option, + pub catalog_db: Option, + pub app_user: Option, + pub app_password: Option, +} + /// Result of a bootstrap admin user creation. #[derive(Debug)] pub struct AdminBootstrapResult { @@ -44,7 +62,8 @@ pub struct AdminBootstrapResult { /// High-level bootstrap operations for storage backends. /// /// Covers the init, destroy, and migrate command paths. Implementations -/// handle backend-specific DDL (e.g., `CREATE DATABASE` for PostgreSQL). +/// handle backend-specific DDL such as creating databases, schemas, users, +/// and grants. #[async_trait] pub trait Bootstrapper: Send + Sync { /// Ensure the application user exists in the storage backend. 
@@ -117,19 +136,10 @@ use std::pin::Pin; use crate::error::StorageError; /// Factory function type for creating backend-specific bootstrappers. -/// -/// # Parameters -/// -/// * `config_path` - Path and file name of the extenddb configuration file (e.g. "extenddb.toml") -/// * `cli_args` - Raw commandline arguments from `std::env::args().collect` -/// -/// # Returns -/// -/// A pinned future that resolves to either a boxed `Bootstrapper` or a `StorageError`. pub type BootstrapperFactory = fn( String, - Vec, + BootstrapOptions, ) -> Pin, StorageError>> + Send>>; /// Backend bootstrapper registration entry. @@ -149,12 +159,12 @@ inventory::collect!(BackendRegistration); pub async fn create_bootstrapper( backend: &str, config_path: &str, - cli_args: &[String], + options: BootstrapOptions, ) -> Result, StorageError> { for registration in inventory::iter:: { if registration.name == backend { tracing::info!("Found registered backend: {}", backend); - return (registration.factory)(config_path.to_string(), cli_args.to_vec()).await; + return (registration.factory)(config_path.to_string(), options.clone()).await; } } diff --git a/crates/storage/src/config.rs b/crates/storage/src/config.rs index 671f42b..a2e609b 100644 --- a/crates/storage/src/config.rs +++ b/crates/storage/src/config.rs @@ -3,6 +3,31 @@ //! Storage configuration trait and registry for storage backends. +use extenddb_core::limits::LimitsConfig; + +/// Backend-native backup tool configuration. +/// +/// This is intentionally storage-agnostic enough for the server factory layer: +/// concrete backends decide how to interpret the coordinator endpoint and +/// command prefix. Backends that do not expose native physical backups return +/// `None` from [`StorageConfig::native_backup_config`]. +#[derive(Debug, Clone, Default, PartialEq, Eq)] +pub struct NativeBackupConfig { + /// Backup executable. 
+ pub binary: Option, + /// Optional subcommand/component inserted after `binary`; set it to an + /// empty string when `binary` is already the native backup executable. + pub component: Option, + /// Cluster coordinator endpoint. + pub coordinator_endpoint: Option, + /// Base URI for snapshot backups. + pub storage_uri: Option, + /// Base URI for log backups / PITR. + pub log_storage_uri: Option, + /// Whether to send object-store credentials to storage nodes. + pub send_credentials_to_storage_nodes: Option, +} + /// Configuration interface for storage backends. /// /// Each backend implements this trait to expose connection parameters @@ -10,8 +35,6 @@ /// knowing the concrete backend type. pub trait StorageConfig: Send + Sync + std::fmt::Debug { /// Backend-specific connection configuration as a string. - /// - /// For PostgreSQL: connection string (postgresql://...) fn connection_config(&self) -> &str; /// Maximum concurrent connections for data operations. @@ -20,6 +43,17 @@ pub trait StorageConfig: Send + Sync + std::fmt::Debug { /// Maximum concurrent connections for catalog/management operations. fn max_catalog_connections(&self) -> u32; + /// Runtime request limits visible to storage backends that must enforce + /// post-mutation invariants. + fn runtime_limits(&self) -> Option<&LimitsConfig> { + None + } + + /// Optional backend-native physical backup configuration. + fn native_backup_config(&self) -> Option { + None + } + /// Clone this config into a boxed trait object. fn clone_box(&self) -> Box; } @@ -35,10 +69,17 @@ impl Clone for Box { /// Takes a TOML table and returns a boxed `StorageConfig` trait object. pub type StorageConfigDeserializer = fn(&toml::Table) -> Result, String>; +/// Default config factory for a registered storage backend. +pub type StorageConfigDefaultFactory = fn() -> Box; + /// Registration entry for a storage config deserializer. 
pub struct StorageConfigRegistration { pub backend: &'static str, pub deserializer: StorageConfigDeserializer, + pub default_config: StorageConfigDefaultFactory, + /// Default-backend priority. Higher wins; `None` means this backend is + /// never selected as the implicit default. + pub default_priority: Option, } inventory::collect!(StorageConfigRegistration); @@ -58,3 +99,26 @@ pub fn deserialize_storage_config( } Err(format!("Unknown backend: {}", backend)) } + +/// Create the default configuration for a registered storage backend. +pub fn default_storage_config(backend: &str) -> Result, String> { + for reg in inventory::iter:: { + if reg.backend == backend { + return Ok((reg.default_config)()); + } + } + Err(format!("Unknown backend: {}", backend)) +} + +/// Return the registered backend selected as the implicit default. +pub fn default_backend_name() -> Result<&'static str, String> { + let mut defaults: Vec<(u16, &'static str)> = inventory::iter:: + .into_iter() + .filter_map(|reg| reg.default_priority.map(|priority| (priority, reg.backend))) + .collect(); + defaults.sort_by(|left, right| right.0.cmp(&left.0).then_with(|| left.1.cmp(right.1))); + defaults + .first() + .map(|(_, backend)| *backend) + .ok_or_else(|| "No default storage backend registered".to_owned()) +} diff --git a/crates/storage/src/hooks.rs b/crates/storage/src/hooks.rs index e018cca..c63a8f0 100644 --- a/crates/storage/src/hooks.rs +++ b/crates/storage/src/hooks.rs @@ -21,8 +21,8 @@ pub struct WorkerContext { /// Backend-specific runtime hooks for worker spawning and initialization. /// /// Backends implement this trait to spawn workers that are tightly coupled -/// to their implementation details (e.g., PostgreSQL's control plane poller, -/// pool metrics, GSI delay polling). +/// to their implementation details (e.g., control plane pollers, pool metrics, +/// backend-native retention workers). #[async_trait] pub trait ServerRuntimeHooks: Send + Sync { /// Spawn backend-specific workers. 
@@ -34,7 +34,7 @@ pub trait ServerRuntimeHooks: Send + Sync { /// Get backend-specific info for logging (optional). /// - /// Example: "data_db=ddbo_data" for PostgreSQL + /// Example: "data_db=extenddb_data" fn backend_info(&self) -> Option { None } diff --git a/crates/storage/src/lib.rs b/crates/storage/src/lib.rs index 98dfd5a..991adac 100755 --- a/crates/storage/src/lib.rs +++ b/crates/storage/src/lib.rs @@ -482,10 +482,10 @@ pub trait StreamEngine: Send + Sync { /// Background worker operations that require storage access. /// /// Covers control-plane transition processing and other periodic maintenance -/// tasks that were previously methods on the concrete `PostgresEngine`. +/// tasks that belong to backend engines. pub trait WorkerStore: Send + Sync { /// Process pending control-plane transitions (CREATING → ACTIVE, - /// DELETING → deleted). Returns a list of `(table_name, description)` + /// UPDATING → ACTIVE, DELETING → deleted). Returns a list of `(table_name, description)` /// for each transition that fired. fn process_control_plane_transitions( &self, diff --git a/crates/storage/src/server_components.rs b/crates/storage/src/server_components.rs index d91eb1c..6ab7e5b 100644 --- a/crates/storage/src/server_components.rs +++ b/crates/storage/src/server_components.rs @@ -58,7 +58,15 @@ impl std::fmt::Display for BackendError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::UnknownBackend(b) => { - write!(f, "Unknown backend '{b}'. Available backends: postgres") + let available: Vec<&str> = inventory::iter:: + .into_iter() + .map(|r| r.backend) + .collect(); + write!( + f, + "Unknown backend '{b}'. Available backends: {}", + available.join(", ") + ) } Self::ConnectionFailed { backend, details } => { write!(f, "Failed to connect to {backend}: {details}") @@ -92,7 +100,7 @@ pub type ServerComponentsFactory = /// /// Backends submit this via inventory::submit! to register themselves. 
pub struct ServerComponentsRegistration { - /// Backend name (e.g., "postgres") + /// Backend name. pub backend: &'static str, /// Factory function that creates the backend components diff --git a/crates/storage/src/util/key.rs b/crates/storage/src/util/key.rs index f5e50b4..3adca49 100755 --- a/crates/storage/src/util/key.rs +++ b/crates/storage/src/util/key.rs @@ -23,8 +23,7 @@ pub enum SortKeyValue { /// For single-attribute keys, returns the value directly (no encoding). /// For multi-attribute keys, uses netstring encoding: each part is encoded as /// `:,` and concatenated. This is provably collision-free -/// regardless of value content, and compatible with PostgreSQL TEXT columns -/// (no null bytes). +/// regardless of value content, and compatible with SQL text columns. pub fn composite_pk_to_text( item: &Item, key_schema: &[KeySchemaElement], diff --git a/docs/design/01-requirements.md b/docs/design/01-requirements.md index 87401c5..53fb55b 100755 --- a/docs/design/01-requirements.md +++ b/docs/design/01-requirements.md @@ -451,6 +451,14 @@ The catalog database stores extenddb metadata: table definitions, indexes, tags, - REQ-PG-007: Support PostgreSQL 14+ - REQ-PG-008: Support optional read replica connection for eventually consistent reads. When `read_replica_url` is configured, `ConsistentRead=false` reads (GetItem, Query, Scan, BatchGetItem) route to the replica pool. All writes and `ConsistentRead=true` reads always use the primary pool. 
+### 8.3 TiDB Backend + +- REQ-TIDB-001: Use TiDB's MySQL-compatible SQL endpoint through the sqlx MySQL driver +- REQ-TIDB-002: Use TiDB transactions for global consistency across base rows, secondary indexes, streams, and catalog updates +- REQ-TIDB-003: Represent DynamoDB secondary indexes with generated columns and native TiDB secondary indexes; GSI versus LSI is API metadata, not separate physical index classes +- REQ-TIDB-004: Use TiDB native TTL for non-streaming tables and retain an indexed worker path only when DynamoDB Streams REMOVE records must be emitted +- REQ-TIDB-005: Use TiDB BR for native physical backup/restore instead of catalog row-copy backup data + ## 9. Expression Engine Requirements - REQ-EXPR-001: Parse and evaluate `ConditionExpression` (comparisons, functions, logical operators) diff --git a/docs/design/02-high-level-design.md b/docs/design/02-high-level-design.md index ef0dc0e..4ee0ff6 100755 --- a/docs/design/02-high-level-design.md +++ b/docs/design/02-high-level-design.md @@ -296,7 +296,7 @@ sequenceDiagram - If condition: call core's evaluate_condition() against existing item INSIDE the transaction - If condition fails: ROLLBACK, return ConditionFailed { old_item } - Insert/upsert item - - If GSIs exist: update GSI tables (within same transaction) + - If GSIs exist: update backend-specific secondary-index state (PostgreSQL companion tables; TiDB native indexes) - If stream_record provided: INSERT stream record (within same transaction) - COMMIT 9. Calculate consumed capacity (item size → WCU) @@ -305,7 +305,7 @@ sequenceDiagram 12. HTTP layer: add CRC32 header, compress, send response ``` -> **Key invariant:** Condition evaluation, data write, GSI updates, and stream record capture all happen within a single PostgreSQL transaction. This prevents TOCTOU races (another request modifying the item between condition check and write) and ensures atomicity of stream capture. 
+> **Key invariant:** Condition evaluation, data write, secondary-index maintenance, and stream record capture are transactionally consistent. PostgreSQL performs companion-table updates in the same transaction; TiDB delegates secondary-index maintenance to native TiDB indexes on the base table. This prevents TOCTOU races (another request modifying the item between condition check and write) and ensures atomicity of stream capture. ### 5.1b Write Path (UpdateItem — Additional Detail) @@ -321,7 +321,7 @@ UpdateItem is more complex than PutItem because the storage backend must also ap - Call core's apply_update(actions, &mut item, ctx) → modified item - Validate modified item (size limits, key attributes unchanged) - INSERT/UPDATE the modified item - - If GSIs exist: update GSI tables (within same transaction) + - If GSIs exist: update backend-specific secondary-index state (PostgreSQL companion tables; TiDB native indexes) - If stream_capture provided: construct full StreamRecord (with old_image/new_image based on stream_view_type), INSERT stream record (within same transaction) - COMMIT ``` @@ -360,26 +360,26 @@ extenddb handles concurrent requests through three cooperating layers: The server runs on a tokio multi-thread runtime. Each incoming HTTP request is handled by an independent async task — there is no shared in-memory state on the hot path. Tasks are scheduled cooperatively across OS threads by the tokio work-stealing scheduler. -### 6.2 PostgreSQL Connection Pool (sqlx) +### 6.2 Storage Connection Pool -All database access goes through an sqlx `PgPool`. The pool size is configurable via `storage.postgres.pool_size` in `extenddb.toml` (default: 20). When all connections are in use, new requests queue at the pool level until a connection is returned or the acquire timeout expires. If the timeout expires, the request fails with an internal server error (HTTP 500). +All database access goes through the configured backend's sqlx connection pool. 
The pool size is configurable via the active storage section in `extenddb.toml` (default: 20). When all connections are in use, new requests queue at the pool level until a connection is returned or the acquire timeout expires. If the timeout expires, the request fails with an internal server error (HTTP 500). -Total connection footprint on the PostgreSQL server: +Total connection footprint on the storage backend: - `pool_size` connections for DynamoDB data operations (shared by all background workers) -- +2 for the management API (separate pool, `max_connections(2)`) -- +1 for the log-level poller (separate pool, `max_connections(1)`) +- +`catalog_pool_size` connections for management/authz/catalog operations +- +worker-specific connections as needed by the backend -So the total is `pool_size + 3`. The default PostgreSQL `max_connections` is 100, which comfortably supports the default pool_size of 20. +Size backend connection limits for `pool_size + catalog_pool_size + worker overhead`. ### 6.3 Row-Level Locking -Read-modify-write operations (UpdateItem, PutItem with conditions, DeleteItem with conditions, TransactWriteItems) use `SELECT ... FOR UPDATE` to acquire a row-level lock inside a PostgreSQL transaction. This ensures: +Read-modify-write operations (UpdateItem, PutItem with conditions, DeleteItem with conditions, TransactWriteItems) use the backend's transaction and row/record locking semantics. PostgreSQL uses `SELECT ... FOR UPDATE`; TiDB uses transactional row locks through its MySQL-compatible SQL layer. This ensures: - **Atomicity:** The condition check and the write happen against the same snapshot. -- **Serialization:** Concurrent updates to the same item are serialized by PostgreSQL's row lock, not by any in-memory mutex. +- **Serialization:** Concurrent updates to the same item are serialized by backend row/record locks, not by any in-memory mutex. 
- **No TOCTOU races:** Another request cannot modify the item between the condition check and the write. -There is no in-memory locking (no `Mutex`, `RwLock`, or similar) on the data path. All contention is managed by PostgreSQL. +There is no in-memory locking (no `Mutex`, `RwLock`, or similar) on the data path. All contention is managed by the storage backend. ### 6.4 Implications diff --git a/docs/design/03-component-core.md b/docs/design/03-component-core.md index 651ab30..385cb57 100755 --- a/docs/design/03-component-core.md +++ b/docs/design/03-component-core.md @@ -324,7 +324,7 @@ For `UpdateItem`, the storage backend must execute the following steps inside a 5. Call core::expression::apply_update(actions, &mut item, ctx) → modified item 6. Validate modified item (size limits, key attributes unchanged) 7. INSERT/UPDATE the modified item -8. If GSIs exist: update GSI tables (within same transaction) +8. If GSIs exist: update backend-specific secondary-index state 9. If stream_capture provided: construct full StreamRecord (with old_image/new_image based on stream_view_type), INSERT stream record (within same transaction) 10. COMMIT diff --git a/docs/design/04-component-storage.md b/docs/design/04-component-storage.md index ddc8eaa..6cd9f67 100755 --- a/docs/design/04-component-storage.md +++ b/docs/design/04-component-storage.md @@ -3,20 +3,20 @@ **Version:** 3.0 **Date:** 2026-05-19 **Status:** Active -**Crates:** `storage` (traits), `storage-postgres` (PostgreSQL backend) +**Crates:** `storage` (traits), `storage-postgres` (PostgreSQL backend), `storage-tidb` (TiDB backend) ## 1. Purpose The storage layer provides a trait-based abstraction for all persistent data operations. Traits are defined in the `storage` crate with no database-specific dependencies. Backend implementations live in separate crates (e.g., -`storage-postgres`) and register themselves via a factory pattern using the `inventory` crate. 
+`storage-postgres`, `storage-tidb`) and register themselves via a factory pattern using the `inventory` crate. The trait-based design allows new storage backends to be added by implementing the storage traits and registering a factory function, with no changes needed to the `engine` or `server` crates. The factory pattern enables runtime backend selection based on configuration. -**Current status**: PostgreSQL is the only supported backend. The trait architecture and plugin infrastructure provide -the foundation for future backend implementations. +**Current status**: PostgreSQL is the default backend. TiDB is available as an optional in-tree backend selected with +`storage.backend = "tidb"` when compiled with the `tidb` feature. ## 2. Storage Trait Hierarchy @@ -77,6 +77,9 @@ Backends implement the individual traits, then implement the composite traits wi ```rust impl StorageEngine for PostgresEngine {} impl CatalogStore for PostgresEngine {} + +impl StorageEngine for TidbEngine {} +impl CatalogStore for TidbEngine {} ``` The `engine` crate receives `Arc` for data operations. The `server` crate receives @@ -139,12 +142,17 @@ written atomically with data writes when `stream` is `Some`. **WorkerStore** (background worker operations): - `process_control_plane_transitions` — handles table state transitions - (CREATING → ACTIVE, DELETING → deleted) + (CREATING → ACTIVE, UPDATING → ACTIVE, DELETING → deleted) **BackupEngine** (backup and restore): - `create_backup`, `describe_backup`, `list_backups`, `delete_backup` - `restore_table_from_backup` +Backend implementations own the physical backup data plane. PostgreSQL keeps +its existing implementation. TiDB uses native BR for snapshot data and keeps +only ExtendDB metadata in the catalog; unsupported BR restore shapes are +reported explicitly rather than emulated by item replay. 
+ ```rust pub trait WorkerStore: Send + Sync { fn process_control_plane_transitions( @@ -154,8 +162,8 @@ pub trait WorkerStore: Send + Sync { ``` The `WorkerStore` trait provides operations needed by background workers. The `process_control_plane_transitions` -method handles table state transitions (CREATING → ACTIVE, DELETING → deleted) and is called by the control plane -poller worker. +method handles table state transitions (CREATING → ACTIVE, UPDATING → ACTIVE, DELETING → deleted) and is called by the +control plane poller worker. ## 3. Management and Operational Traits @@ -392,16 +400,19 @@ The PostgreSQL backend uses two categories of tables: attributes, matching the DynamoDB model where key attributes are part of the item. -- **GSI tables**: GSI tables include base table primary key columns (`base_pk`, - `base_sk_*`) as actual SQL columns (not just inside `item_data` JSONB). This - is required because: (1) GSI keys are not unique — two base table items can - project to the same GSI key, so the base table PK is needed for uniqueness; - (2) pagination requires a tiebreaker when GSI keys collide; (3) the base - table PK is needed to look up the full item for projections. +- **Secondary indexes**: Backend implementations use the database-native shape + that preserves DynamoDB key ordering and pagination. PostgreSQL stores GSI + companion tables with base table primary key columns (`base_pk`, `base_sk_*`) + as actual SQL columns. TiDB stores each item once in the base table, exposes + index keys as generated columns over `item_data`, and creates native TiDB + secondary indexes over those generated columns plus the base key tie-breaker. + TiDB has no separate local-index physical path; GSI versus LSI remains + DynamoDB API metadata. -- **GSI consistency**: GSI updates are asynchronous by default (10ms delay) to - match DynamoDB behavior. LSI updates are always synchronous. See §6 for - details on the propagation delay model. 
+- **GSI consistency**: GSI write consistency is backend-specific. TiDB relies + on native secondary indexes, which are maintained by TiDB from the base row. + PostgreSQL can simulate asynchronous GSI propagation for compatibility + testing. LSI updates are always synchronous. See §6 for details. ### 5.2 Connection Pooling @@ -562,30 +573,29 @@ migrations/ ## 6. GSI Consistency Model -**Decision:** GSI updates are **asynchronous by default** with a configurable -propagation delay. LSI updates are always synchronous. +**Decision:** GSI consistency is backend-specific. TiDB uses native secondary +indexes maintained from the base table write. PostgreSQL can simulate +DynamoDB-style asynchronous GSI propagation with a configurable delay. LSI +updates are always synchronous. **Implementation:** -- Each GSI has an optional `propagation_delay_ms` column in the `indexes` table -- If `propagation_delay_ms` is `NULL` or negative, the system default is used - (default: 10ms, configurable via `gsi_propagation_delay_ms` setting) -- If `propagation_delay_ms` is `0`, the GSI is updated synchronously in the - same transaction as the base table write -- If `propagation_delay_ms` is positive, the GSI update is enqueued and applied - after a random delay within `[0, propagation_delay_ms]` +- TiDB stores each item once and uses generated columns plus native secondary + indexes, leveraging TiDB's globally consistent transaction model. +- PostgreSQL supports `gsi_propagation_delay_ms` for asynchronous compatibility + testing. - LSIs are always synchronous (delay is ignored) to match DynamoDB behavior **Rationale:** -- **Matches DynamoDB semantics**: Real DynamoDB GSIs are eventually consistent - with propagation delays typically in the range of milliseconds to seconds +- **Leverages backend strengths**: TiDB can keep secondary indexes transactionally + consistent without a worker queue. 
+- **Matches DynamoDB semantics where useful**: PostgreSQL can still simulate + eventually consistent GSI propagation for compatibility testing. - **Surfaces real bugs**: Applications that incorrectly assume immediate GSI - consistency will fail in ExtendDB just as they would in production DynamoDB -- **Configurable**: Can be set to 0ms for synchronous behavior when needed for - testing or specific use cases + consistency can be tested against the PostgreSQL async path. **Trade-off:** Asynchronous GSI updates add complexity (queue, workers, delay -tracking) but provide higher fidelity to DynamoDB behavior. The synchronous -mode (delay=0) is available for applications that need it. +tracking) but provide higher fidelity to DynamoDB behavior. TiDB chooses the +simpler transactional path as the default and only path. ### 6.1 Table Status Enforcement @@ -593,12 +603,13 @@ All data plane operations (PutItem, GetItem, Query, etc.) must check `table_stat not `ACTIVE`, return `StorageError::TableNotActive` (mapped to `ResourceInUseException`). Control plane operations that modify the table (`UpdateTable`, `DeleteTable`) must: -1. Atomically set `table_status` to `UPDATING` or `DELETING` (using `UPDATE ... WHERE table_status = 'ACTIVE'` — if - zero rows affected, the table is already being modified, return `ResourceInUseException`) -2. Perform the operation -3. Set `table_status` back to `ACTIVE` (or remove the row for `DeleteTable`) +1. Atomically move the catalog row out of `ACTIVE` (`UPDATING` or `DELETING`) before publishing any data-side work +2. Persist the pending work in catalog metadata (`indexes.index_status`, stream specification, or table deletion state) +3. Let the backend control-plane reconciler create, backfill, drop, or repair data artifacts, then set the table back to + `ACTIVE` or remove the row -This prevents concurrent DDL operations on the same table. 
+This prevents concurrent DDL operations on the same table and makes crash recovery a normal retry path rather than a +cleanup special case. ### 6.1.1 Async Control Plane Transitions (Phase 1c) @@ -614,8 +625,10 @@ When `NULL`, no transition is pending. (no extra round-trip). - `DeleteTable` sets `table_status = 'DELETING'` with a scheduled transition time. The row, its indexes, and tags are removed when the transition fires. -- A background poller processes pending transitions. `CREATING → ACTIVE` is a single UPDATE; `DELETING → removed` -uses `DELETE ... FOR UPDATE SKIP LOCKED ... RETURNING` for concurrent safety. +- A background poller processes pending transitions. `CREATING → ACTIVE` + creates any missing data artifacts before activation. `UPDATING → ACTIVE` + reconciles pending GSI and stream work. `DELETING → removed` drops data + artifacts before deleting catalog metadata. - On startup, `process_control_plane_transitions()` recovers any in-flight operations from a previous server instance. - A partial index (`idx_tables_pending_transition ON tables @@ -625,8 +638,7 @@ uses `DELETE ... FOR UPDATE SKIP LOCKED ... RETURNING` for concurrent safety. **Design decisions and future direction (from Phase 1c human review):** - The single-column approach works because each table has exactly one pending status transition at a time. Index-level -transitions (e.g., GSI backfill) will need a separate `status_transition_at` -on the `indexes` table when GSI operations are implemented. +transitions are represented by `indexes.index_status` while the parent table is `UPDATING`. - The poller interval will be increased to 10 seconds at idle, with control plane operations poking the poller to wake up immediately and backoff appropriately (Phase 2). @@ -647,9 +659,10 @@ for rows where `status_transition_at IS NOT NULL AND status_transition_at <= NOW()` and completes them immediately. Rows where `status_transition_at` is in the future are left for the background poller. 
-This column-on-tables approach is sufficient while control plane operations map 1:1 to table status changes -(`CREATING → ACTIVE`, `DELETING → removed`). A separate `control_plane_operations` table becomes necessary when: -- Operations span multiple catalog entities (e.g., GSI backfill touches both `indexes` and data tables) +This column-on-tables approach is sufficient while control plane operations are scoped to one table +(`CREATING → ACTIVE`, `UPDATING → ACTIVE`, `DELETING → removed`). A separate `control_plane_operations` table becomes +necessary when: +- Operations span multiple tables or accounts - Operations have intermediate states beyond a single status flip (e.g., multi-step UpdateTable) - Audit or observability requires a history of completed operations, not just pending ones @@ -660,13 +673,14 @@ full crash recovery. When `UpdateTable` adds a new GSI to a table with existing data: -1. Set the new index status to `CREATING` in `indexes` -2. Spawn a background task that scans the base table in batches (configurable batch size, default 1000) -3. For each batch, INSERT the projected attributes into the new GSI table -4. On completion, set index status to `ACTIVE` -5. During backfill, writes to the base table also write to the new GSI table (the write path checks index status and -includes `CREATING` indexes) -6. Queries against a `CREATING` index return `ResourceNotFoundException` (matching DynamoDB behavior) +1. Set the parent table to `UPDATING` and insert the new index with `index_status = 'CREATING'` +2. Commit the catalog transaction so the pending operation is durable +3. The control-plane reconciler performs the backend-native physical work +4. TiDB adds generated columns and a native secondary index; TiDB's online DDL backfills and maintains the index from the + base table, so ExtendDB does not run a separate item-replay backfill +5. PostgreSQL creates and backfills its companion index table +6. 
On completion, the reconciler marks the index `ACTIVE` and returns the table to `ACTIVE` +7. Queries against a `CREATING` index return `ResourceNotFoundException` (matching DynamoDB behavior) ## 7. Pagination Token Encoding @@ -704,8 +718,9 @@ WHERE (pk = $gsi_pk AND sk_s > $gsi_sk) ``` For GSI queries, the pagination key includes both the GSI key attributes and the base table primary key (needed to -uniquely identify the position, since GSI keys are not unique). This is why the GSI PostgreSQL table includes `base_pk` -and `base_sk_*` as actual columns. +uniquely identify the position, since GSI keys are not unique). Backends must preserve this tie-breaker. PostgreSQL +stores `base_pk` and `base_sk_*` as actual columns in companion GSI tables; TiDB includes the generated base-key columns +in its native secondary indexes. ## 8. Parallel Scan Segment Assignment @@ -754,7 +769,7 @@ CREATE INDEX ON _dynamodb_idempotency_tokens(created_at); 1. Before executing a transaction, check if the token exists 2. If found: return the stored response (idempotent replay) 3. If not found: execute the transaction, store the token + response - atomically in the same PostgreSQL transaction + atomically in the same backend transaction 4. 
Background cleanup: delete tokens older than 10 minutes (matching DynamoDB's idempotency window) @@ -955,9 +970,13 @@ pub struct WorkerContext { **Backend-specific workers** (spawned via `RuntimeHooks`, access backend internals): -- PostgreSQL spawns 7 workers (control plane poller, pool metrics, GSI delay - poller, TTL cleanup, stream cleanup, idempotency token cleanup, table size - refresh) +- PostgreSQL spawns its control-plane poller, pool metrics, GSI delay poller, + TTL cleanup, stream cleanup, idempotency token cleanup, and table size refresh + workers +- TiDB spawns its control-plane poller, table size refresh, stream-emitting + item TTL cleanup, and pool metrics workers; TiDB native TTL handles + non-streaming item TTL plus stream-record, idempotency-token, metrics, + login-attempt, and assume-role session retention - Other backends may spawn different workers or none at all Example PostgreSQL implementation: diff --git a/docs/design/05-component-auth-v3.md b/docs/design/05-component-auth-v3.md index 0b6598e..6074c0d 100755 --- a/docs/design/05-component-auth-v3.md +++ b/docs/design/05-component-auth-v3.md @@ -12,7 +12,7 @@ The `auth` crate provides pluggable authentication and authorization for extendd The crate depends on `extenddb-core` (for types and errors) and `async_trait` (for object-safe async trait dispatch). It has no HTTP framework or storage dependencies. -> **No caching.** All credential lookups, policy fetches, and identity resolution read directly from Postgres on every request. The latency is sub-millisecond for indexed lookups, and this approach eliminates stale-cache bugs and cache invalidation complexity entirely. Multiple extenddb instances sharing the same catalog see consistent data without coordination. If profiling under production load shows database round-trips are a bottleneck, caching can be added as a transparent layer without changing any interfaces. 
+> **No caching.** All credential lookups, policy fetches, and identity resolution read directly from the catalog store on every request. The latency is sub-millisecond for indexed lookups, and this approach eliminates stale-cache bugs and cache invalidation complexity entirely. Multiple extenddb instances sharing the same catalog see consistent data without coordination. If profiling under production load shows database round-trips are a bottleneck, caching can be added as a transparent layer without changing any interfaces. ## 2. Key Design Decisions (from 2026-04-15 review session) @@ -297,7 +297,7 @@ CREATE TABLE tables ( ### 6.2 Data Table Naming -Per-table Postgres tables change from `_ddb_{table_name}` to `_ddb_{account_id}_{table_name}` to avoid collisions between accounts. +Per-table backend data tables change from `_ddb_{table_name}` to `_ddb_{account_id}_{table_name}` to avoid collisions between accounts. ### 6.3 Indexes, Streams, Tags @@ -399,7 +399,7 @@ Flow: ## 10. Encryption Key Management -Secret keys are encrypted at rest with AES-256-GCM. The encryption key is generated at `extenddb init` time and stored in the `settings` table. The threat model is: the Postgres database is the trust boundary. If an attacker can read the settings table, they already have access to all data. +Secret keys are encrypted at rest with AES-256-GCM. The encryption key is generated at `extenddb init` time and stored in the `settings` table. The threat model is: the catalog database is the trust boundary. If an attacker can read the settings table, they already have access to all data. ```sql INSERT INTO settings (key, value) VALUES ('encryption_key', ''); diff --git a/docs/design/07-component-streams.md b/docs/design/07-component-streams.md index dbff306..6749477 100755 --- a/docs/design/07-component-streams.md +++ b/docs/design/07-component-streams.md @@ -87,7 +87,7 @@ Store stream records in the same database as the data. 
This allows atomic writes DynamoDB retains stream records for 24 hours. Options: - **Background worker**: Periodically delete records older than the retention period -- **TTL on the storage side**: PostgreSQL `pg_cron` or application-level scheduled task +- **TTL on the storage side**: Backend-native TTL where available, or an application-level scheduled task **Recommended:** Application-level background task (same pattern as TTL cleanup), configurable retention period (default: 24 hours). diff --git a/docs/design/08-component-config.md b/docs/design/08-component-config.md index 10263ff..53257ef 100755 --- a/docs/design/08-component-config.md +++ b/docs/design/08-component-config.md @@ -49,6 +49,7 @@ Environment variables use double-underscore (`__`) as a nesting separator, prefi | `server.port` | `EXTENDDB__SERVER__PORT` | | `server.bind_addr` | `EXTENDDB__SERVER__BIND_ADDR` | | `storage.postgres.connection_string` | `EXTENDDB__STORAGE__POSTGRES__CONNECTION_STRING` | +| `storage.tidb.connection_string` | `EXTENDDB__STORAGE__TIDB__CONNECTION_STRING` | | `storage.postgres.read_replica_url` | `EXTENDDB__STORAGE__POSTGRES__READ_REPLICA_URL` | | `auth.provider` | `EXTENDDB__AUTH__PROVIDER` | | `auth.encryption_key` | `EXTENDDB__AUTH__ENCRYPTION_KEY` | @@ -105,7 +106,7 @@ global_rps = 0 # 0 = disabled per_table_rps = 0 # 0 = disabled [storage] -backend = "postgres" # "postgres" | future: "sqlite", "mysql" +backend = "postgres" # "postgres" | "tidb" [storage.postgres] connection_string = "postgresql://localhost:5432/extenddb" # Set credentials via env var in production @@ -117,6 +118,18 @@ read_replica_pool_size = 20 # Pool size for the read replica (defaults to pool_ connection_timeout_secs = 5 statement_timeout_secs = 30 +[storage.tidb] +connection_string = "mysql://extenddb:extenddb-local-dev@localhost:4000/extenddb_catalog" +pool_size = 20 +catalog_pool_size = 20 + +[storage.tidb.backup] +pd_endpoint = "127.0.0.1:2379" +storage_uri = 
"local:///var/lib/extenddb/tidb-backups" +binary = "tiup" +component = "br" +send_credentials_to_tikv = false + [auth] provider = "builtin" # "none" | "builtin" | "aws_iam" | future: "azure_ad" region = "us-east-1" # Region for SigV4 validation diff --git a/docs/design/11-high-availability.md b/docs/design/11-high-availability.md index 6002e49..e7dd4c0 100755 --- a/docs/design/11-high-availability.md +++ b/docs/design/11-high-availability.md @@ -8,7 +8,7 @@ ## 1. Problem Statement -extenddb currently operates as a single-process server backed by a single PostgreSQL instance. The steering documents note a `TODO(architecture)` about enforcing single-frontend-per-catalog vs. designing for multi-instance topology. This design addresses that open question and defines how extenddb scales from a single Raspberry Pi to a petabyte-scale cloud deployment with multiple replicas. +extenddb currently operates as a single-process server backed by one configured storage backend. The steering documents note a `TODO(architecture)` about enforcing single-frontend-per-catalog vs. designing for multi-instance topology. This design addresses that open question and defines how extenddb scales from a single Raspberry Pi to a petabyte-scale cloud deployment with multiple replicas. ### Goals @@ -97,7 +97,7 @@ This is the minimal mechanism needed to honor DynamoDB's consistency model. It w ### Model 1: Single Frontend, Single Catalog (Current) ``` -[Frontend] → [PostgreSQL] +[Frontend] → [Storage Backend] ``` - No HA. Single point of failure. @@ -107,7 +107,7 @@ This is the minimal mechanism needed to honor DynamoDB's consistency model. It w ``` [Frontend A] ─┐ - ├→ [PostgreSQL Primary] + ├→ [Storage Backend] [Frontend B] ─┘ ``` @@ -119,9 +119,9 @@ This is the minimal mechanism needed to honor DynamoDB's consistency model. 
It w ### Model 3: Multiple Frontends, Replicated Catalog ``` -[Frontend A] ─┐ ┌→ [PostgreSQL Primary] (writes + strong reads) +[Frontend A] ─┐ ┌→ [Storage Primary] (writes + strong reads) ├─────┤ -[Frontend B] ─┘ └→ [PostgreSQL Replica] (eventually consistent reads) +[Frontend B] ─┘ └→ [Storage Replica] (eventually consistent reads) ``` - Full HA for both frontend and catalog. @@ -131,14 +131,14 @@ This is the minimal mechanism needed to honor DynamoDB's consistency model. It w ### Model 4: Multiple Frontends, Natively-Clustered Catalog ``` -[Frontend A] ─┐ ┌→ [Cassandra Node 1] - ├─────┼→ [Cassandra Node 2] -[Frontend B] ─┘ └→ [Cassandra Node 3] +[Frontend A] ─┐ ┌→ [TiDB SQL Node] + ├─────┼→ [TiDB SQL Node] +[Frontend B] ─┘ └→ [TiKV / PD Cluster] ``` -- Storage layer maps DynamoDB consistency to native consistency levels. - - `ConsistentRead = true` → `QUORUM` or `LOCAL_QUORUM` - - `ConsistentRead = false` → `ONE` or `LOCAL_ONE` +- Storage layer maps DynamoDB consistency to backend-native semantics. + - `ConsistentRead = true` → route through the backend's strongly consistent read path + - `ConsistentRead = false` → route through any backend-supported eventually consistent path, or the same strong path if the backend is globally consistent - No separate primary/replica distinction — the storage adapter handles it. - Suitable for: large-scale deployments, multi-datacenter. @@ -204,13 +204,13 @@ Each frontend maintains health checks against its catalog connections. If a repl ### D7: Connection Pool Sizing -With N frontends each maintaining pools to 1 primary + M replicas, total connection count is N × (primary_pool_size + M × replica_pool_size). PostgreSQL's `max_connections` limit (default 100) can be exhausted quickly. Guidance: +With N frontends each maintaining pools to 1 primary + M replicas, total connection count is N × (primary_pool_size + M × replica_pool_size). Backend connection limits can be exhausted quickly. 
Guidance: - **Small deployments (1-3 frontends):** Direct connections with pool size 5-10 per target. Total: 15-60 connections. -- **Medium deployments (4-10 frontends):** Use PgBouncer or equivalent connection pooler between frontends and catalog. Pool size per frontend: 3-5 per target. -- **Large deployments (10+ frontends):** PgBouncer required. Consider transaction-mode pooling. Document `max_connections` tuning. +- **Medium deployments (4-10 frontends):** Use the backend's recommended connection pooler between frontends and catalog. Pool size per frontend: 3-5 per target. +- **Large deployments (10+ frontends):** External connection pooling is required for backends with strict connection limits. Document backend-specific connection tuning. -The `extenddb.toml` configuration accepts `pool_size` per connection target. The design does not mandate PgBouncer but documents it as a best practice for deployments with more than 3 frontends. +The `extenddb.toml` configuration accepts `pool_size` per connection target. The design does not mandate a specific connection pooler; operators should follow backend-specific best practices for deployments with more than 3 frontends. ## 7. Alternatives Considered @@ -241,7 +241,7 @@ The `extenddb.toml` configuration accepts `pool_size` per connection target. The **Rejected because:** - Violates the No Caching Rule. - Cache invalidation across frontends is the exact problem the No Caching Rule was designed to avoid. -- PostgreSQL's buffer pool already provides memory-resident access to hot data. +- Backend buffer pools already provide memory-resident access to hot data. ### A4: Single-Writer with Read Replicas at Frontend Level @@ -314,7 +314,7 @@ impl ConsistencyLevel { **TransactGetItems:** DynamoDB requires `ConsistentRead = true` for all items in a `TransactGetItems` request. The operation is always strongly consistent. The storage adapter unconditionally routes `TransactGetItems` to the primary connection. 
This is not configurable — it is a DynamoDB API constraint. -**Breaking change note:** This is a breaking change to the internal `DataEngine` trait. Since the trait is internal and there are no third-party implementations (only `storage-postgres`), no migration path is needed. The change is mechanical: add the parameter to the trait, the implementation, and all call sites in the engine. +**Breaking change note:** This is a breaking change to the internal `DataEngine` trait. Since the trait is internal and current implementations live in-tree (`storage-postgres`, `storage-tidb`), no external migration path is needed. The change is mechanical: add the parameter to the trait, the implementations, and all call sites in the engine. **Alternatives considered and rejected:** @@ -333,19 +333,19 @@ Any SQL statement that acquires locks (`SELECT ... FOR UPDATE`) routes to primar **Requirement:** In the near future, extenddb will support strongly consistent GSIs. A strongly consistent GSI has zero propagation delay — the write to the index commits atomically with the base table write. A strongly consistent read on such a GSI returns data current with the base table. -**Current state:** The storage layer already supports synchronous GSI updates within the write transaction when `propagation_delay_ms = Some(0)` (or when the system default is 0). The `put_item`, `delete_item`, `update_item`, and `transact_write_items` paths all call `insert_index_row_multi`/`delete_index_row_multi` inside the same database transaction for indexes with zero delay. This is the foundation for strongly consistent GSIs. +**Current state:** The storage layer supports backend-specific synchronous GSI maintenance. PostgreSQL can keep companion GSI tables in the write transaction when `propagation_delay_ms = Some(0)` (or when the system default is 0). TiDB stores each item once and relies on native secondary indexes generated from the base table row. **Interaction with HA consistency routing:** -1. 
**Writes:** A write to a table with a strongly consistent GSI already commits both the base row and all GSI rows in a single PostgreSQL transaction on the primary. No change needed — writes always route to primary. +1. **Writes:** A write to a table with a strongly consistent GSI commits the base row and secondary-index state atomically on the primary. PostgreSQL does this with companion-table writes; TiDB does it through native secondary indexes. No change needed — writes always route to primary. 2. **Strongly consistent GSI reads (`ConsistentRead = true` on a GSI query/scan):** Today, DynamoDB rejects `ConsistentRead = true` on GSI queries with `ValidationException`: "Consistent reads are not supported on global secondary indexes." extenddb faithfully reproduces this rejection (tenet 1). When strongly consistent GSIs are introduced as an extenddb extension, `ConsistentRead = true` on a strongly consistent GSI query routes to the primary — same as any strongly consistent read. The routing logic in §8.2 handles this without modification. 3. **Eventually consistent GSI reads (`ConsistentRead = false` on a GSI query/scan):** Routes to a replica. The replica may have replication lag, so the GSI data on the replica may be slightly behind the primary. This is acceptable — it matches the semantics of eventually consistent reads (the caller explicitly opted into potentially stale data). The GSI data on the replica is guaranteed to be consistent *with itself* (the base row and GSI row committed atomically on the primary, so they replicate together). -4. 
**Replica consistency guarantee:** Because the base table write and secondary-index maintenance commit atomically, they appear on replicas atomically. PostgreSQL streaming replication replays WAL records in commit order, so a single transaction's effects are visible atomically on replicas. TiDB secondary indexes are part of the table's native replicated state. - **Important qualification:** This atomicity guarantee requires **physical streaming replication** (the default for PostgreSQL HA, and what Aurora PostgreSQL uses internally). Logical replication configurations must ensure that base table and GSI tables are replicated through the same subscription with `streaming = on` (not `parallel`). If the subscription uses `streaming = parallel`, transactions may be applied out of order. If the user has multiple subscriptions covering different tables, atomicity across subscriptions is not guaranteed. The design does not support split-subscription logical replication for tables with strongly consistent GSIs. + **Important qualification:** For PostgreSQL companion tables, this atomicity guarantee requires **physical streaming replication** (the default for PostgreSQL HA, and what Aurora PostgreSQL uses internally). Logical replication configurations must ensure that base table and GSI tables are replicated through the same subscription with `streaming = on` (not `parallel`). If the subscription uses `streaming = parallel`, transactions may be applied out of order. If the user has multiple subscriptions covering different tables, atomicity across subscriptions is not guaranteed. The design does not support split-subscription logical replication for tables with strongly consistent GSIs. **Design implications:** @@ -426,7 +426,7 @@ This avoids adding yet another trait that every storage backend must implement. 
**Scope:** - Verify all operations are safe under concurrent multi-frontend access (the No Caching Rule already ensures this, but explicit verification is needed for: control-plane transitions, TTL worker, GSI backfill worker, stream shard assignment). -- Add distributed locking for background workers (only one frontend runs TTL cleanup, GSI backfill, etc. at a time) using PostgreSQL advisory locks. +- Add distributed locking for background workers (only one frontend runs TTL cleanup, GSI backfill, etc. at a time) using backend-native advisory/session locks or an equivalent lease. - Document load balancer configuration (sticky sessions not required since frontends are stateless). - Add instance-id to metrics and logs for multi-frontend debugging. @@ -445,7 +445,7 @@ With multiple frontends, these workers must not run concurrently on multiple ins ### Solution: Distributed Worker Locks (Global Granularity) -Use PostgreSQL advisory locks (or equivalent per-backend) to ensure only one frontend runs each worker type at a time. **Lock granularity is global (one lock per worker type), not per-table.** +Use backend-native advisory/session locks or an equivalent lease to ensure only one frontend runs each worker type at a time. **Lock granularity is global (one lock per worker type), not per-table.** **Rationale for global locks:** - The current TTL worker iterates all tables with TTL enabled. Per-table locking would require restructuring the worker loop. @@ -491,13 +491,13 @@ impl WorkerType { } ``` -The `WorkerLock` trait follows the same pattern as `DataEngine` and `MetadataEngine` — defined in the `extenddb-storage` crate, implemented by `PostgresEngine`. Since the current architecture uses concrete types (not enum dispatch), `WorkerLock` is simply another trait that `PostgresEngine` implements. +The `WorkerLock` trait follows the same pattern as `DataEngine` and `MetadataEngine` — defined in the `extenddb-storage` crate and implemented by each backend engine. 
Since the current architecture uses concrete backend types behind trait objects, `WorkerLock` is simply another trait that backend engines implement. Each frontend attempts to acquire the lock on its worker tick interval. If it gets the lock, it runs the worker. If not, it skips. -**Lock lifecycle for PostgreSQL:** `pg_try_advisory_lock(worker_type_id)` with session-level locks. These locks are automatically released when the database connection drops (e.g., frontend crash). This means: -- No explicit TTL/lease mechanism is needed for PostgreSQL. -- A crashed frontend's locks are released when PostgreSQL cleans up the dead connection. +**Lock lifecycle:** Prefer backend-native session locks that are automatically released when the storage connection drops. PostgreSQL can use `pg_try_advisory_lock(worker_type_id)`. TiDB can use a backend-native/session-scoped equivalent or fall back to a catalog lease when a session lock does not satisfy the worker's failure semantics. This means: +- No explicit TTL/lease mechanism is needed when the backend provides crash-released session locks. +- A crashed frontend's locks are released when the backend cleans up the dead connection. - The instance registry heartbeat (§13) is for **observability only**, not for lock management. ## 11. Failure Modes and Recovery @@ -506,7 +506,7 @@ Each frontend attempts to acquire the lock on its worker tick interval. If it ge |---------|--------|----------| | Frontend crash | Requests to that frontend fail. Load balancer routes to others. | Restart frontend. No data loss. | | Replica catalog unavailable | Eventually-consistent reads fall back to primary. | Repair/replace replica. | -| Primary catalog unavailable | Writes and strongly-consistent reads fail with 500. Eventually-consistent reads continue from replicas. | Promote replica to primary (PostgreSQL failover). | +| Primary catalog unavailable | Writes and strongly-consistent reads fail with 500. 
Eventually-consistent reads continue from replicas. | Promote replica to primary using the backend's failover process. | | Network partition (frontend ↔ catalog) | Affected frontend returns 500. Others continue. | Resolve network issue. | | Split brain (two primaries) | Prevented by catalog's own replication protocol. extenddb does not manage catalog failover. | N/A — delegated to catalog HA. | @@ -525,6 +525,7 @@ Each frontend attempts to acquire the lock on its worker tick interval. If it ge | Backend | Write Leader | Strong Read Leader | Eventually Consistent Read | |---------|-------------|-------------------|---------------------------| | PostgreSQL (streaming replication) | Primary node | Primary node | Any replica | +| TiDB | TiDB transaction coordinator / region leaders | TiDB cluster | TiDB cluster | | Cassandra | Coordinator (any node) | QUORUM nodes | ONE node | | MongoDB (replica set) | Primary member | Primary member | Secondary preferred (falls back to primary if no secondaries available) | | Single PostgreSQL (no replicas) | The single node | The single node | The single node (no distinction) | @@ -544,17 +545,17 @@ Each frontend attempts to acquire the lock on its worker tick interval. If it ge - Mixed storage backends in one deployment ✗ - Frontend configured with replicas but no primary ✗ -- Multiple primaries in PostgreSQL mode ✗ (use catalog's own failover) +- Multiple independent write leaders for one catalog ✗ (use the storage backend's own failover/consensus) ### Startup Validation On startup, each frontend: 1. Connects to the primary catalog and verifies schema version. -2. Connects to each configured replica and verifies it's replicating from the same primary (for PostgreSQL: check `pg_stat_wal_receiver`). +2. Connects to each configured replica, if any, and verifies it belongs to the same backend topology. 3. Registers itself in an `extenddb_instances` table (instance_id, hostname, started_at, last_heartbeat). 4. 
Begins heartbeat updates (every 30s). -**Instance registry purpose:** The `extenddb_instances` table is for **observability and operational tooling only**. It answers "which frontends are running?" for operators. It is NOT used for lock management or coordination — PostgreSQL advisory locks (session-scoped, released on disconnect) handle that independently. Dead entries (heartbeat older than 5 minutes, configurable via `extenddb settings set instance_heartbeat_timeout_seconds`) are cleaned up periodically but their presence has no correctness impact. +**Instance registry purpose:** The `extenddb_instances` table is for **observability and operational tooling only**. It answers "which frontends are running?" for operators. It is NOT used for lock management or coordination when the backend provides session-scoped locks; otherwise a dedicated backend lease table owns correctness. Dead entries (heartbeat older than 5 minutes, configurable via `extenddb settings set instance_heartbeat_timeout_seconds`) are cleaned up periodically but their presence has no correctness impact. ## 14. Observability @@ -615,7 +616,7 @@ New metrics for HA monitoring: 2. **Stage 2:** A deployment with 1 frontend + 1 primary + 1 replica correctly routes eventually-consistent reads to the replica and strongly-consistent reads to the primary. Verified by query logs. Strongly consistent GSI reads (`ConsistentRead = true` on a zero-delay GSI) route to primary. Eventually consistent GSI reads route to replica. 3. **Stage 3:** Two frontends sharing a catalog can serve concurrent requests without data corruption. Background workers run on exactly one frontend at a time. Verified by concurrent load test. 4. **Stages 4-5:** Storage backend passes the full test suite with the same pass rate as PostgreSQL. -5. **GSI atomicity invariant:** On any replica, a base table row and its corresponding strongly consistent GSI rows are always visible atomically (never partially). 
Verified by a test that writes to a table with a strongly consistent GSI, then reads with `ConsistentRead = false` (which routes to a replica), confirming either both the base row and GSI row are visible or neither is. The test must use eventually consistent reads to target the replica — a strongly consistent read would route to primary and not test replica atomicity. +5. **GSI atomicity invariant:** On any replica, a base table row and its corresponding strongly consistent secondary-index state are always visible atomically (never partially). Verified by a test that writes to a table with a strongly consistent GSI, then reads with `ConsistentRead = false` (which routes to a replica), confirming either both the base row and index entry/state are visible or neither is. The test must use eventually consistent reads to target the replica — a strongly consistent read would route to primary and not test replica atomicity. ## 17. Open Questions for Reviewer Deliberation @@ -627,9 +628,9 @@ New metrics for HA monitoring: 4. **Instance registry cleanup:** How long before a stale heartbeat entry is considered dead? **Proposed answer:** 5 minutes, configurable via settings. Dead entries are informational only (advisory locks handle real coordination). -5. **Strongly consistent GSIs on non-PostgreSQL backends:** PostgreSQL guarantees that a single transaction's effects replicate atomically (WAL replay). Cassandra and MongoDB have different transaction semantics. For Cassandra, a logged batch provides atomicity across multiple partition keys (this is the purpose of logged batches), but with significant performance overhead due to batch log coordination on the coordinator and replica nodes. Lightweight transactions (LWT) are not applicable here — they provide compare-and-set semantics for conditional writes, not multi-row atomicity. For MongoDB, multi-document transactions provide the needed atomicity. 
**Proposed answer:** Each storage backend must guarantee that a base table write and its corresponding strongly consistent GSI writes are atomic. PostgreSQL: single transaction. Cassandra: logged batch (works across partitions, but with performance overhead). MongoDB: multi-document transaction. If a backend cannot provide this guarantee, strongly consistent GSIs are not supported on that backend — this must be surfaced at `CreateTable` time (reject the request), not discovered at write time. +5. **Strongly consistent GSIs on non-PostgreSQL backends:** PostgreSQL guarantees that a single transaction's effects replicate atomically (WAL replay). TiDB's native secondary indexes are part of the base table's transactional state. Other databases may have different transaction semantics. **Proposed answer:** Each storage backend must guarantee that a base table write and its corresponding strongly consistent secondary-index state are atomic. PostgreSQL: single transaction. TiDB: native secondary indexes in the same transaction. If a backend cannot provide this guarantee, strongly consistent GSIs are not supported on that backend — this must be surfaced at `CreateTable` time (reject the request), not discovered at write time. -6. **GSI write amplification under HA:** A table with N strongly consistent GSIs requires N+1 writes (base + N index rows) in a single transaction. With multiple frontends, the primary handles all these writes. Should the design address write amplification concerns? **Proposed answer:** This is an operational consideration, not a design change. Document that strongly consistent GSIs increase write load on the primary proportionally to the number of indexes. Connection pool sizing (D7) should account for this. Additionally, if a single base table item produces multiple GSI rows per index (e.g., a GSI keyed on elements of a list attribute), the transaction size grows further. 
PostgreSQL handles this well for typical workloads (1-5 GSIs, 1:1 base-to-GSI row mapping), but operators should be aware that large N (many GSIs) or large fan-out (many GSI rows per item) increases transaction duration and row-level lock contention under concurrent writes. +6. **GSI write amplification under HA:** A table with N strongly consistent GSIs increases write work on the primary. PostgreSQL writes companion index rows in the same transaction. TiDB maintains native secondary indexes from the base row, so amplification is handled by TiDB's index maintenance path rather than ExtendDB item replay. **Proposed answer:** This is an operational consideration, not a design change. Document that strongly consistent GSIs increase primary write load proportionally to the number of indexes, with the exact physical cost owned by the backend. ### Resolved Questions diff --git a/docs/design/README.md b/docs/design/README.md index 5ca3a8d..53091e8 100755 --- a/docs/design/README.md +++ b/docs/design/README.md @@ -9,7 +9,7 @@ This directory contains the authoritative design documents for ExtendDB (extendd | 01 | [Requirements](01-requirements.md) | Wire protocol, operations, limits, data types, non-functional requirements | | 02 | [High-Level Design](02-high-level-design.md) | Architecture overview, crate structure, request lifecycle, key design decisions, technology choices | | 03 | [Core](03-component-core.md) | Types, expression engine, validation, capacity calculation, errors (`extenddb-core` crate) | -| 04 | [Storage](04-component-storage.md) | StorageEngine traits, input/output types, PostgreSQL backend, schema design, pagination, GSI consistency (`extenddb-storage`, `extenddb-storage-postgres` crates) | +| 04 | [Storage](04-component-storage.md) | StorageEngine traits, input/output types, PostgreSQL and TiDB backends, schema design, pagination, GSI consistency (`extenddb-storage`, `extenddb-storage-postgres`, `extenddb-storage-tidb` crates) | | 05 | 
[Auth](05-component-auth.md) | AuthProvider trait, SigV4 validation, IAM policy engine, credential encryption (`extenddb-auth` crate) | | 06 | [Server](06-component-server.md) | HTTP server, routing, middleware pipeline, response formatting, TLS, rate limiting, throughput tracking (`extenddb-server` crate) | | 07 | [Streams](07-component-streams.md) | DynamoDB Streams design space — capture mechanism, shard management, retention (high-level; detailed design deferred) | diff --git a/docs/differences-from-dynamodb.md b/docs/differences-from-dynamodb.md index 045484a..a9a0c9d 100755 --- a/docs/differences-from-dynamodb.md +++ b/docs/differences-from-dynamodb.md @@ -8,7 +8,7 @@ adaptation when switching between ExtendDB and the real service. | Area | DynamoDB | ExtendDB | |------|----------|------| -| Storage backend | Proprietary distributed storage | PostgreSQL | +| Storage backend | Proprietary distributed storage | Pluggable SQL storage backend. PostgreSQL is the default; TiDB is available with the `tidb` feature. | | Global Tables | CreateGlobalTable, replication | Not implemented (returns UnknownOperationException) | | DAX (Accelerator) | In-memory caching layer | Not applicable | | PartiQL | ExecuteStatement, BatchExecuteStatement | Not implemented (returns UnknownOperationException) | @@ -47,7 +47,7 @@ adaptation when switching between ExtendDB and the real service. | Area | DynamoDB | ExtendDB | |------|----------|------| | TTL attribute name | Any UTF-8 string (1–255 bytes) | Restricted to `[a-zA-Z0-9._-]+` (1–255 bytes). Names with spaces, quotes, or other special characters are rejected. This eliminates SQL injection risk in the TTL expression index. | -| TTL deletion | Background process, items deleted within 48 hours of expiry | Background worker with indexed sweep, configurable target via `ttl_deletion_target_seconds` (default: 300s) | +| TTL deletion | Background process, items deleted within 48 hours of expiry | Backend-specific. 
PostgreSQL uses an indexed sweep. TiDB uses native table TTL for non-streaming tables and keeps an indexed worker only where DynamoDB Streams REMOVE records must be emitted. | | TTL stream records | REMOVE events with `userIdentity: {type: "Service", principalId: "dynamodb.amazonaws.com"}` | Supported — TTL deletions generate REMOVE stream records with the same `userIdentity` | | TTL modification cooldown | Enforces a cooldown period between enable/disable changes ("Time to live has been modified multiple times within a fixed interval") | No cooldown — TTL can be enabled and disabled immediately. Intentional divergence for faster local development. | @@ -61,7 +61,7 @@ adaptation when switching between ExtendDB and the real service. | Area | DynamoDB | ExtendDB | |------|----------|------| -| GSI update propagation | Eventually consistent (milliseconds to seconds) | Per-GSI propagation delay. System default: `gsi_propagation_delay_ms` setting (default 10ms). Each GSI can override with its own `propagation_delay_ms` (stored in catalog). A value of 0 means synchronous (future sync GSI feature). | +| GSI update propagation | Eventually consistent (milliseconds to seconds) | Backend-specific. PostgreSQL can simulate asynchronous propagation via `gsi_propagation_delay_ms`; TiDB uses native secondary indexes maintained from the base table write. | | Multi-part base table keys | Not supported | Preview extension (opt-in via `enable_multipart_keys` setting). Standard single/composite keys work identically. | ## Capacity and Throttling @@ -89,14 +89,34 @@ ExtendDB exposes runtime settings that have no DynamoDB equivalent: | Setting | Default | Description | |---------|---------|-------------| -| `control_plane_delay_seconds` | 5 | Simulated delay for table state transitions (CREATING → ACTIVE, DELETING → removed) | -| `gsi_propagation_delay_ms` | 10 | System-wide default GSI propagation delay (milliseconds). Per-GSI overrides stored in catalog. 0 = synchronous. 
| +| `control_plane_delay_seconds` | 5 | Simulated delay for table create/delete state transitions (CREATING → ACTIVE, DELETING → removed). UpdateTable GSI/stream transitions report UPDATING until the backend reconciler completes, while table data-plane reads and writes remain available. | +| `gsi_propagation_delay_ms` | 10 | PostgreSQL backend default GSI propagation delay (milliseconds). TiDB ignores this setting because GSI writes are transactional. | | `throttling_enabled` | `true` | Enable provisioned capacity throttling (token bucket per table/partition) | | `enable_multipart_keys` | `false` | Enable multi-part base table key extension | | `log_level` | `info` | Runtime log level (trace, debug, info, warn, error) | | `sqlx_log_level` | `warn` | Separate log level for sqlx query traces | | `allow_credential_import` | `true` | Allow importing credentials via the management API | +## TiDB Native Backup/Restore + +The TiDB backend uses TiDB BR for backup data instead of copying items into +ExtendDB catalog tables. `CreateBackup` requires `[storage.tidb.backup]` +configuration (`pd_endpoint` and `storage_uri`). + +BR restores physical TiDB tables to their recorded database/table identity. +That means TiDB `RestoreTableFromBackup` is available only when the target TiDB +cluster is empty or conflict-free for the backed-up physical table. ExtendDB does +not emulate unsupported BR restore shapes by replaying item rows. + +Restored tables do not inherit TTL or stream settings. When BR restores a TiDB +table that previously used native TTL, ExtendDB strips the restored physical TTL +artifacts before publishing the target table as `ACTIVE`. + +`DeleteBackup` removes TiDB BR backup data only for `local://` or `file://` +backup URIs. For remote object stores, TiDB BR leaves backup-data lifecycle to +the operator or object-store lifecycle policy, so ExtendDB refuses metadata-only +deletion instead of reporting a false delete.
+ ## Web Console ExtendDB includes a built-in web management console at `/console` for credential diff --git a/docs/dynamodb-limits.md b/docs/dynamodb-limits.md index 5fded45..0fa7055 100755 --- a/docs/dynamodb-limits.md +++ b/docs/dynamodb-limits.md @@ -126,7 +126,7 @@ Source: [AWS DynamoDB Service Quotas](https://docs.aws.amazon.com/amazondynamodb | Limit | DynamoDB Value | Status | Notes | |-------|---------------|--------|-------| -| Concurrent restores | 50 | N/A | ExtendDB does not support backup/restore | +| Concurrent restores | 50 | Backend-specific | Backup/restore is implemented by the storage backend. PostgreSQL uses its backend implementation; TiDB delegates physical backup data to native BR. ExtendDB does not enforce DynamoDB's concurrent restore quota. | ## Global Tables diff --git a/docs/getting-started.md b/docs/getting-started.md index a953b00..4fd82d5 100755 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -6,8 +6,8 @@ This guide walks you through initializing a extenddb deployment, starting the se ### Platform-specific installation guides -- [macOS (Homebrew)](manuals/09-install-macos.md) — covers Homebrew PostgreSQL, macOS syslog, and `--pg-user $(whoami)` -- [Linux (Ubuntu/Debian, Amazon Linux, Fedora/RHEL)](manuals/08-install-linux.md) — covers system PostgreSQL, `journalctl`, and `--pg-user postgres` +- [macOS (Homebrew)](manuals/09-install-macos.md) — covers Homebrew PostgreSQL, macOS syslog, and `--storage-admin-user $(whoami)` +- [Linux (Ubuntu/Debian, Amazon Linux, Fedora/RHEL)](manuals/08-install-linux.md) — covers system PostgreSQL, `journalctl`, and `--storage-admin-user postgres` ### Installer scripts @@ -29,7 +29,7 @@ software on your behalf. After the script completes, continue from ## Prerequisites -- PostgreSQL 14+ running locally (see `docs/local-postgres-setup.md`) +- A supported storage backend. 
The default build uses PostgreSQL 14+ locally (see `docs/local-postgres-setup.md`); TiDB is available when building with the `tidb` feature. - Rust toolchain (1.85+) - AWS CLI v2 (for testing) - Python 3.10+ with virtual environment (see [Python Environment Setup](../README.md#python-environment-setup) in the README) @@ -51,7 +51,7 @@ Run `extenddb init` to create the catalog and data databases: ``` This will: -- Create a `extenddb` PostgreSQL user (if it doesn't exist) +- Create an `extenddb` storage user (if it doesn't exist) - Create the `extenddb_catalog` database (catalog metadata) - Create the `extenddb` database (user item data) - Run schema migrations @@ -79,16 +79,16 @@ To use a custom data database name: ### Remote PostgreSQL / Aurora -For remote PostgreSQL or Aurora, supply the admin password with `--pg-pass`: +For remote PostgreSQL or Aurora, supply the admin password with `--storage-admin-password`: ```bash # Pass the password inline: ./target/release/extenddb init \ - --pg-host my-aurora-cluster.cluster-xxxx.us-east-1.rds.amazonaws.com \ - --pg-user postgres --pg-pass + --storage-host my-aurora-cluster.cluster-xxxx.us-east-1.rds.amazonaws.com \ + --storage-admin-user postgres --storage-admin-password ``` -When `--pg-pass` is omitted entirely, `extenddb init` connects without a password, relying on +When `--storage-admin-password` is omitted entirely, `extenddb init` connects without a password, relying on PostgreSQL peer/ident authentication (works only on localhost via Unix socket). ### Custom bind address @@ -240,14 +240,14 @@ Controls whether `extenddb manage import-access-key` is allowed (default: `true` ### GSI Propagation Delay -GSI updates are applied asynchronously with a configurable delay, simulating real DynamoDB's eventually consistent GSI behavior. The system-wide default is 10ms. Each GSI can override this with a per-index `propagation_delay_ms` stored in the catalog. 
+The PostgreSQL backend can apply GSI updates asynchronously with a configurable delay, simulating real DynamoDB's eventually consistent GSI behavior. The TiDB backend uses TiDB transactions and native secondary indexes maintained from the base table write. ```bash -# Set system-wide default to 0 for synchronous GSI updates (fast tests) +# PostgreSQL only: set system-wide default to 0 for synchronous GSI updates ./target/release/extenddb settings --config extenddb.toml set \ gsi_propagation_delay_ms 0 -# Set to 50ms for more realistic eventual consistency +# PostgreSQL only: set to 50ms for more realistic eventual consistency ./target/release/extenddb settings --config extenddb.toml set \ gsi_propagation_delay_ms 50 ``` @@ -1178,10 +1178,8 @@ extenddb supports running external test suites (e.g., Java/JUnit, Python/pytest) # Start extenddb first ./target/release/extenddb serve --config extenddb.toml -# Set GSI propagation delay to 0 for external tests. -# External suites expect synchronous GSI behavior (matching real DynamoDB's -# typical sub-millisecond propagation). The async GSI path is tested -# separately by the extenddb-specific test_gsi_async.py suite. +# PostgreSQL only: external suites expect immediate GSI visibility. +# TiDB GSI writes are already transactional, so this setting is not needed there. ./target/release/extenddb settings --config extenddb.toml set gsi_propagation_delay_ms 0 # Run all registered suites diff --git a/docs/manuals/01-architecture-guide.md b/docs/manuals/01-architecture-guide.md index 783bbaa..c331ac6 100755 --- a/docs/manuals/01-architecture-guide.md +++ b/docs/manuals/01-architecture-guide.md @@ -4,13 +4,13 @@ ## Overview -extenddb (ExtendDB) is a standalone DynamoDB-compatible API server written in Rust. 
It receives DynamoDB wire protocol requests over HTTP/HTTPS, authenticates and authorizes them via SigV4 and a local IAM policy engine, executes operation logic in a backend-agnostic engine, and delegates persistence to a pluggable storage backend (currently PostgreSQL). +extenddb (ExtendDB) is a standalone DynamoDB-compatible API server written in Rust. It receives DynamoDB wire protocol requests over HTTP/HTTPS, authenticates and authorizes them via SigV4 and a local IAM policy engine, executes operation logic in a backend-agnostic engine, and delegates persistence to a pluggable storage backend. PostgreSQL is the default backend; TiDB is available as an optional backend. extenddb runs as a daemon process, logging to syslog. It is designed for any environment where DynamoDB semantics are needed — local development, CI pipelines, self-hosted production, multi-cloud, or air-gapped deployments. Developers and applications point their AWS SDKs at extenddb and get identical DynamoDB behavior. ## Cargo Workspace -The project is structured as a Cargo workspace with 7 crates. Crate boundaries enforce dependency rules at compile time. +The project is structured as a Cargo workspace with 8 crates. Crate boundaries enforce dependency rules at compile time. ``` extenddb/ @@ -19,6 +19,7 @@ extenddb/ │ ├── engine/ Async operation handlers (PutItem, Query, etc.) │ ├── storage/ Storage trait definitions and backend-agnostic utilities │ ├── storage-postgres/ PostgreSQL backend implementation +│ ├── storage-tidb/ TiDB backend implementation │ ├── auth/ AuthProvider trait, SigV4 verification, IAM policy engine │ ├── server/ HTTP server (axum), management API, web console │ └── bin/ CLI entry point, config loading, daemon lifecycle @@ -35,6 +36,7 @@ bin ──→ server ──→ engine ──→ core │ └──→ storage │ ├──→ storage-postgres ──→ storage ──→ core + ├──→ storage-tidb ──────→ storage ──→ core └──→ core ``` @@ -84,7 +86,7 @@ Trait definitions for the storage layer. 
Thirteen storage traits partition backe - **AuthorizationStore**: Policy evaluation cache - **Bootstrapper**: Initial database setup -Traits use `BoxFuture` for object safety. Backends register at compile time via the `inventory` crate and are selected at startup by name. The `RuntimeHooks` trait allows backends to spawn backend-specific workers (PostgreSQL spawns 7). +Traits use `BoxFuture` for object safety. Backends register at compile time via the `inventory` crate and are selected at startup by name. The `RuntimeHooks` trait allows backends to spawn backend-specific workers. ### storage-postgres @@ -93,11 +95,19 @@ PostgreSQL implementation of all storage traits using `sqlx`. Features: - Dual-database architecture: catalog DB (metadata) + data DB (user items) - Schema migrations managed by version-stamped SQL files - Items stored as JSONB with indexed key columns -- GSI/LSI implemented as separate PostgreSQL tables +- GSI/LSI metadata in the catalog; index data stored in companion PostgreSQL tables - Transactions use `SELECT FOR UPDATE` + single-transaction commits - Stream records stored in a dedicated table with background cleanup - All queries parameterized (no dynamic SQL construction) +### storage-tidb + +TiDB implementation of the storage traits using the sqlx MySQL driver. It mirrors the PostgreSQL backend structure while using TiDB-compatible SQL, MySQL-style connection strings, `ON DUPLICATE KEY UPDATE` upserts, and TiDB/MySQL error classification. It is selected with `storage.backend = "tidb"` when the binary is built with the `tidb` feature. + +TiDB backups use BR as the physical backup data plane. ExtendDB stores backup +metadata in the catalog and delegates snapshot data to BR storage; it does not +copy items into catalog backup tables for TiDB.
+ ### auth Authentication and authorization: @@ -125,7 +135,7 @@ Thin binary that wires everything together: - CLI parsing (clap): `serve`, `init`, `destroy`, `verify`, `migrate`, `status`, `settings`, `manage`, `version` - Configuration loading (TOML + env vars) - Daemon lifecycle (bind socket → fork → syslog → serve) -- Background tasks (log level polling, throttling polling, GSI delay polling, stream record cleanup, TTL expiry, metrics persistence) +- Background tasks (log level polling, throttling polling, stream record cleanup, TTL expiry, metrics persistence) ## Request Lifecycle @@ -147,18 +157,18 @@ extenddb always runs as a daemon. There is no foreground mode. 2. Bind TCP socket (port conflicts reported before forking) 3. Fork to background via `daemonize` 4. Initialize syslog logging -5. Connect to PostgreSQL (catalog + data databases) +5. Connect to the configured storage backend (catalog + data databases) 6. Verify catalog version matches binary expectation 7. Start axum server on the pre-bound socket -8. Spawn background tasks (log level polling, throttling polling, GSI delay polling, stream cleanup, TTL expiry, metrics persistence) +8. Spawn background tasks (log level polling, throttling polling, stream cleanup, TTL expiry, metrics persistence) 9. On SIGTERM/SIGINT: drain connections (5s timeout), exit ## Catalog Model -extenddb uses a dual-database architecture: +extenddb uses a catalog/data storage architecture: - **Catalog database** (e.g., `extenddb_catalog`): Stores table metadata, account/user/group/role/policy definitions, access keys, settings, stream metadata, and metrics. Shared across all accounts. -- **Data database** (e.g., `extenddb_catalog_data`): Stores user items, GSI/LSI data, and stream records. Each table gets its own PostgreSQL table. +- **Data database** (e.g., `extenddb_catalog_data`): Stores user items, backend-specific secondary-index state, and stream records. PostgreSQL uses companion data/index tables. 
TiDB stores item rows once and uses generated columns plus native secondary indexes. The catalog version (currently 0.0.2) is stored in the `catalog_metadata` table and checked at startup. Version mismatches prevent the server from starting — run `extenddb migrate` to upgrade. @@ -166,7 +176,7 @@ The catalog version (currently 0.0.2) is stored in the `catalog_metadata` table ### Storage -Storage backends implement thirteen traits (see **storage** section above). The traits use `BoxFuture` for object safety. Backends register at compile time via the `inventory` crate, and the `bin` crate selects the backend by name at startup. Currently only PostgreSQL is implemented. +Storage backends implement thirteen traits (see **storage** section above). The traits use `BoxFuture` for object safety. Backends register at compile time via the `inventory` crate, and the `bin` crate selects the backend by name at startup. PostgreSQL is the default backend, and TiDB is available behind the optional `tidb` feature. ### Authentication @@ -176,7 +186,7 @@ Auth providers implement the `AuthProvider` trait using `#[async_trait]` for obj ### Authentication -SigV4 signature verification follows the AWS specification. Credentials are stored encrypted (AES-256-GCM) in PostgreSQL. Access key prefixes distinguish long-term (`VDAK`) from temporary (`VDSK`) credentials. +SigV4 signature verification follows the AWS specification. Credentials are stored encrypted (AES-256-GCM) in the catalog database. Access key prefixes distinguish long-term (`VDAK`) from temporary (`VDSK`) credentials. ### Authorization @@ -248,7 +258,7 @@ Configuration precedence: CLI flags > environment variables > config file > defa ## DynamoDB Streams -extenddb implements DynamoDB Streams for change data capture. Stream records are captured atomically with data writes inside the same PostgreSQL transaction. Both the DynamoDB API and Streams API are served on the same port. 
+extenddb implements DynamoDB Streams for change data capture. Stream records are captured atomically with data writes inside the backend transaction. Both the DynamoDB API and Streams API are served on the same port. Supported operations: `ListStreams`, `DescribeStream`, `GetShardIterator`, `GetRecords`. @@ -256,14 +266,14 @@ Stream records are retained for 24 hours. A background worker cleans up expired ## Deployment Models -extenddb is a single-binary server that connects to PostgreSQL. Deployment options include: +extenddb is a single-binary server that connects to a configured storage backend. Deployment options include: -- **Single-node**: extenddb + PostgreSQL on the same host (development, small workloads) -- **Separated**: extenddb on an application server, PostgreSQL on a dedicated database server or managed service (RDS, Aurora, Cloud SQL) -- **Containerized**: Docker/Kubernetes with PostgreSQL as a sidecar or external service +- **Single-node**: extenddb + storage backend on the same host (development, small workloads) +- **Separated**: extenddb on an application server, storage backend on a dedicated database server or managed service +- **Containerized**: Docker/Kubernetes with the storage backend as a sidecar or external service - **Air-gapped**: No internet connectivity required; all functionality is self-contained -PostgreSQL provides the durability, replication, and backup capabilities. Use standard PostgreSQL HA tools (streaming replication, Patroni, managed services) for production availability. +The storage backend provides durability, replication, and physical backup capabilities. Use backend-native HA tools: PostgreSQL streaming replication/managed services for PostgreSQL, and TiDB's PD/TiKV topology plus BR for TiDB. 
--- diff --git a/docs/manuals/02-design-guide.md b/docs/manuals/02-design-guide.md index f775401..d6e7291 100755 --- a/docs/manuals/02-design-guide.md +++ b/docs/manuals/02-design-guide.md @@ -6,12 +6,12 @@ ### Dual-Database Architecture -extenddb uses two PostgreSQL databases per deployment: +extenddb uses a catalog/data database topology per deployment: - **Catalog database** (e.g., `extenddb_catalog`): All metadata — table definitions, indexes, accounts, IAM entities, settings, stream metadata, and schema history. -- **Data database** (e.g., `extenddb_catalog_data`): User item data. Each DynamoDB table maps to a PostgreSQL table. GSI and LSI data are stored in separate PostgreSQL tables. +- **Data database** (e.g., `extenddb_catalog_data`): User item data plus backend-specific secondary-index state. PostgreSQL stores base and secondary-index data in physical companion tables. TiDB stores item rows once and uses generated columns plus native secondary indexes. -The data database connection string is stored in the catalog's `settings` table under the key `data_database_url`. This allows the catalog and data databases to live on different PostgreSQL instances. +The data database connection string is stored in the catalog's `settings` table under the key `data_database_url`. This allows the catalog and data databases to live on different backend instances or clusters. ### Catalog Tables @@ -41,14 +41,14 @@ The data database connection string is stored in the catalog's `settings` table ### Data Tables -Each DynamoDB table `T` in account `A` maps to a PostgreSQL table named `t_{table_id}` in the data database. The table has: +Each DynamoDB table `T` in account `A` maps to a backend-owned physical table in the data database. 
The logical shape is: - `pk` column: Partition key value (stored as JSONB) - `sk` column: Sort key value (JSONB, nullable for hash-only tables) - `item` column: Full item as JSONB - Primary key: `(pk)` or `(pk, sk)` -GSI tables are named `gsi_{table_id}_{index_name}` with the GSI key columns and a copy of projected attributes. LSI tables are named `lsi_{table_id}_{index_name}`. +PostgreSQL GSI tables are named `gsi_{table_id}_{index_name}` with the GSI key columns and a copy of projected attributes. TiDB represents every DynamoDB secondary index definition as generated key columns plus a native secondary index on the base data table; GSI versus LSI is API metadata, not a separate TiDB physical path. ### Schema Conventions @@ -132,7 +132,7 @@ Credential lookups (access key → encrypted secret) read directly from the data ### Record Capture -Stream records are captured atomically with data writes. The engine constructs a `StreamCapture` struct with metadata (stream ARN, view type, shard ID, sequence number, keys). The storage backend persists the stream record in the same PostgreSQL transaction as the data write. +Stream records are captured atomically with data writes. The engine constructs a `StreamCapture` struct with metadata (stream ARN, view type, shard ID, sequence number, keys). The storage backend persists the stream record in the same backend transaction as the data write. For UpdateItem, the `new_image` is not known until after `apply_update` runs inside the transaction, so the storage backend constructs the full `StreamRecord` after the update. 
@@ -185,7 +185,6 @@ extenddb caches a small set of operational settings in memory to avoid per-reque | Setting | Mechanism | Refresh | Justification | |---------|-----------|---------|---------------| -| `gsi_propagation_delay_ms` | `AtomicU64` | Background poller every 30s | Write-path hot path; briefly-stale value only affects GSI propagation timing | | `encryption_key` | `Arc` loaded at startup | Never (immutable after `extenddb init`) | Decryption key for access key secrets; generated once, never changes | | `log_level` / `log_destination` | Tracing filter reload | Background poller every 30s | Observability tuning; stale value only delays log level changes | | `throttling_enabled` | `AtomicBool` | Background poller every 30s | Capacity management toggle; briefly-stale is safe | @@ -199,7 +198,7 @@ Catalog state is never cached because correctness requires every request to see - **Table metadata** (key schema, attribute definitions, status, billing mode): A stale cache could serve the wrong key schema after a table is deleted and recreated with the same name but different schema. The new table has a different `table_id`, different key schema, and different indexes — stale cache serves wrong schema, writes corrupt data, reads return garbage. - **IAM policies and credentials**: A revoked Deny policy still cached as absent creates a security gap. A deleted access key still cached as valid allows unauthorized access. - **Tags**: Tag-based authorization (`dynamodb:ResourceTag/*`) requires current tag values. -- **GSI definitions**: Stale GSI metadata could route writes to wrong index tables. +- **GSI definitions**: Stale GSI metadata could route reads or writes through the wrong backend-specific index shape. ### The Table-Name-Reuse Problem @@ -212,11 +211,11 @@ The fundamental reason catalog state cannot be cached safely: 5. Writes use wrong column layout → data corruption 6. 
Reads return items with wrong attribute interpretation → garbage -No safe TTL exists because delete-recreate can happen within milliseconds. Cross-instance invalidation (e.g., PostgreSQL LISTEN/NOTIFY) would be a prerequisite for any future catalog caching. +No safe TTL exists because delete-recreate can happen within milliseconds. Cross-instance invalidation through backend-native change notifications (e.g., PostgreSQL LISTEN/NOTIFY) would be a prerequisite for any future catalog caching. ### Multi-Instance Considerations -extenddb does not enforce single-instance-per-catalog. Multiple extenddb instances may share the same PostgreSQL catalog. Any in-process cache of catalog state would be invisible to other instances. PostgreSQL's own buffer pool provides memory-resident access to hot rows, making application-level caching unnecessary for most workloads. +extenddb does not enforce single-instance-per-catalog. Multiple extenddb instances may share the same catalog. Any in-process cache of catalog state would be invisible to other instances. Backend buffer pools provide memory-resident access to hot rows, making application-level caching unnecessary for most workloads. 
### Future Considerations diff --git a/docs/manuals/04-quickstart-setup-guide.md b/docs/manuals/04-quickstart-setup-guide.md index 987cf57..d0ffa6c 100755 --- a/docs/manuals/04-quickstart-setup-guide.md +++ b/docs/manuals/04-quickstart-setup-guide.md @@ -149,10 +149,10 @@ Options: |------|---------|-------------| | `--catalog-db` | `extenddb_catalog` | Catalog database name | | `--data-db` | `_data` | Data database name | -| `--pg-host` | `localhost` | PostgreSQL host | -| `--pg-port` | `5432` | PostgreSQL port | -| `--pg-user` | `extenddb` | PostgreSQL user | -| `--pg-password` | `extenddb-local-dev` | PostgreSQL password | +| `--storage-host` | `localhost` | PostgreSQL host | +| `--storage-port` | `5432` | PostgreSQL port | +| `--storage-admin-user` | `extenddb` | PostgreSQL user | +| `--storage-admin-password` | `extenddb-local-dev` | PostgreSQL password | The command generates `extenddb.toml` with the connection details. If `extenddb.toml` already exists, `init` loads defaults from it. diff --git a/docs/manuals/05-admin-guide.md b/docs/manuals/05-admin-guide.md index 0e00938..87f7b86 100755 --- a/docs/manuals/05-admin-guide.md +++ b/docs/manuals/05-admin-guide.md @@ -16,7 +16,7 @@ extenddb always runs as a daemon. On startup it: 2. Binds the TCP socket (port conflicts are reported before forking) 3. Forks to background 4. Initializes syslog logging -5. Connects to PostgreSQL (catalog + data databases) +5. Connects to the configured storage backend (catalog + data databases) 6. Verifies catalog version matches the binary 7. Starts the HTTP server 8. Spawns background tasks (log level polling, stream cleanup, TTL expiry) @@ -67,7 +67,7 @@ These settings require a server restart to take effect. 
| Key | Default | Description | |-----|---------|-------------| -| `backend` | `postgres` | Storage backend (only `postgres` supported) | +| `backend` | `postgres` | Storage backend (`postgres` or `tidb`; PostgreSQL is the default) | #### [storage.postgres] @@ -77,6 +77,33 @@ These settings require a server restart to take effect. | `pool_size` | `20` | Maximum concurrent database connections (minimum: 10) | | `catalog_pool_size` | (= `pool_size`) | Maximum connections for management/authz pool (minimum: 10) | +#### [storage.tidb] + +Available when the binary is built with the `tidb` feature. + +| Key | Default | Description | +|-----|---------|-------------| +| `connection_string` | `mysql://extenddb:extenddb-local-dev@localhost:4000/extenddb_catalog` | Catalog database connection string | +| `pool_size` | `20` | Maximum concurrent database connections (minimum: 10) | +| `catalog_pool_size` | (= `pool_size`) | Maximum connections for management/authz pool (minimum: 10) | + +#### [storage.tidb.backup] + +TiDB backup and restore uses native BR, not a logical row-copy table. Configure these fields before using `CreateBackup` with the TiDB backend. +`DeleteBackup` removes backup data directly only for `local://` or `file://` +storage URIs. For S3, GCS, Azure Blob, and S3-compatible stores, configure +object-store lifecycle management; extenddb will not mark remote BR backups +deleted without a storage deleter. 
+ +| Key | Default | Description | +|-----|---------|-------------| +| `pd_endpoint` | unset | PD endpoint passed to BR, for example `127.0.0.1:2379` | +| `storage_uri` | unset | Base URI for BR snapshot backups (`local://`, S3, GCS, Azure Blob, or compatible storage supported by BR) | +| `log_storage_uri` | unset | Reserved for future cluster-level BR log backup orchestration; table-level PITR is not exposed by the TiDB backend | +| `binary` | `tiup` | Executable used to run BR | +| `component` | `br` | Component/subcommand after `binary`; set to `""` when `binary` is a direct `br` executable | +| `send_credentials_to_tikv` | unset | Maps to BR `--send-credentials-to-tikv`; set `false` for IAM-role based S3 access | + #### [auth] | Key | Default | Description | @@ -113,6 +140,7 @@ Any config key can be overridden via environment variables using the `EXTENDDB__ ```bash EXTENDDB__SERVER__PORT=9000 EXTENDDB__STORAGE__POSTGRES__CONNECTION_STRING="postgresql://..." +EXTENDDB__STORAGE__TIDB__CONNECTION_STRING="mysql://..." EXTENDDB__AUTH__PROVIDER=builtin ``` @@ -462,7 +490,7 @@ Another process is using the port. Find it with `ss -tlnp | grep :8000` and stop Error: error communicating with database ``` -Check that PostgreSQL is running and the connection string in `extenddb.toml` is correct. +Check that the configured storage backend is running and the connection string in `extenddb.toml` is correct. **Catalog version mismatch:** @@ -490,7 +518,7 @@ The IAM policy does not allow the operation. Check attached policies with `list- **Slow queries:** -Check PostgreSQL query performance with `EXPLAIN ANALYZE`. Ensure indexes exist on key columns. +Check the configured storage backend's query plan tools (`EXPLAIN ANALYZE` for PostgreSQL or TiDB). Ensure indexes exist on key columns. **High connection count:** @@ -498,7 +526,9 @@ Increase `pool_size` in `extenddb.toml` or check for connection leaks. ### Data Recovery -extenddb stores all data in PostgreSQL. 
Use standard PostgreSQL backup and recovery tools: +Use the configured storage backend's native backup and recovery path. + +For PostgreSQL, use standard PostgreSQL tools: ```bash # Backup @@ -510,6 +540,8 @@ psql -f catalog_backup.sql extenddb_catalog psql -f data_backup.sql extenddb_catalog_data ``` +For TiDB, configure `[storage.tidb.backup]` and use DynamoDB-compatible backup APIs backed by native TiDB BR, or operate BR directly at the cluster level for full-cluster recovery. + --- ## License diff --git a/docs/manuals/06-developer-test-guide.md b/docs/manuals/06-developer-test-guide.md index b182eec..5425118 100755 --- a/docs/manuals/06-developer-test-guide.md +++ b/docs/manuals/06-developer-test-guide.md @@ -156,7 +156,7 @@ The `run-tests` script automatically: - Provisions test credentials via `devtools/provision-test-credentials` - Creates a Java truststore for external tests (self-signed TLS cert) - Sets `control_plane_delay_seconds` to 0.05 for fast test cycles -- Sets `gsi_propagation_delay_ms` to 0 for immediate GSI updates +- Configures backend-specific test settings for immediate control-plane visibility - Enables throttling for production-like behavior - Configures import/export paths for file operation tests - Extracts and exports `EXTENDDB_TEST_PG_CONNECTION_STRING` for CLI lifecycle tests diff --git a/docs/manuals/08-install-linux.md b/docs/manuals/08-install-linux.md index 41c71c2..ee06ead 100755 --- a/docs/manuals/08-install-linux.md +++ b/docs/manuals/08-install-linux.md @@ -90,11 +90,11 @@ Do **not** hand-write `extenddb.toml` before running `init`. 
On most Linux systems the PostgreSQL admin user is `postgres`: ```bash -./target/release/extenddb init --pg-user postgres +./target/release/extenddb init --storage-admin-user postgres ``` If you run PostgreSQL as your own user (e.g., Amazon Linux 2 with a -user-owned data directory), omit `--pg-user` — it defaults to `$(whoami)`: +user-owned data directory), omit `--storage-admin-user` — it defaults to `$(whoami)`: ```bash ./target/release/extenddb init @@ -198,7 +198,7 @@ No data is lost; only the catalog schema is updated. |--------------------------------------------------------|---------------------------------------------------------------------| | `connection refused` on port 8000 | Server not running. `./target/release/extenddb serve --config extenddb.toml`| | `Catalog version X.Y.Z (binary expects A.B.C)` | `./target/release/extenddb migrate --config extenddb.toml` | -| `role "postgres" does not exist` | Use `--pg-user $(whoami)` if PG runs as your user | +| `role "postgres" does not exist` | Use `--storage-admin-user $(whoami)` if PG runs as your user | | `FATAL: Peer authentication failed` | Edit `pg_hba.conf` to allow `trust` or `md5` for local connections | | DROP DATABASE hangs after hard kill | Check for lingering backends: `ps -eo pid,command \| grep postgres` | diff --git a/docs/manuals/09-install-macos.md b/docs/manuals/09-install-macos.md index 092736e..bc7b772 100755 --- a/docs/manuals/09-install-macos.md +++ b/docs/manuals/09-install-macos.md @@ -75,7 +75,7 @@ On macOS you must tell `init` which PostgreSQL user to connect as for the `CREATE ROLE` / `CREATE DATABASE` steps — your macOS username: ```bash -./target/release/extenddb init --pg-user $(whoami) +./target/release/extenddb init --storage-admin-user $(whoami) ``` This prints the admin credentials **once**. Save them — they cannot be @@ -175,7 +175,7 @@ No data is lost; only the catalog schema is updated. 
| Item | Linux | macOS (Homebrew) | |--------------------|--------------------------------|-----------------------------------------------------| | PG admin user | `postgres` (or custom) | Your macOS username (`$(whoami)`), no password | -| `extenddb init` flags | defaults usually fine | pass `--pg-user $(whoami)` | +| `extenddb init` flags | defaults usually fine | pass `--storage-admin-user $(whoami)` | | Service manager | `systemctl` / `pg_ctl` | `brew services` or `pg_ctl` | | Syslog reader | `journalctl -t extenddb` | `log stream --predicate 'processImagePath ENDSWITH "extenddb"'` | @@ -185,7 +185,7 @@ No data is lost; only the catalog schema is updated. |--------------------------------------------------------|---------------------------------------------------------------------| | `connection refused` on port 8000 | Server not running. `./target/release/extenddb serve --config extenddb.toml`| | `Catalog version X.Y.Z (binary expects A.B.C)` | `./target/release/extenddb migrate --config extenddb.toml` | -| `role "extenddb" does not exist` during init | Re-run with `--pg-user $(whoami)` | +| `role "extenddb" does not exist` during init | Re-run with `--storage-admin-user $(whoami)` | | DROP DATABASE hangs after hard kill | Check for lingering backends: `ps -eo pid,command \| grep postgres` | See `docs/troubleshooting.md` for the full troubleshooting guide. diff --git a/docs/manuals/10-security-model.md b/docs/manuals/10-security-model.md index 571f812..a0f26a3 100755 --- a/docs/manuals/10-security-model.md +++ b/docs/manuals/10-security-model.md @@ -9,7 +9,7 @@ This document describes the security architecture of extenddb, including the thr ### What extenddb Protects - **Data confidentiality**: Items stored in DynamoDB tables are accessible only to authenticated and authorized principals. -- **Data integrity**: Write operations are atomic (including stream records, index updates, and side effects). Concurrent writes serialize on PostgreSQL row locks. 
+- **Data integrity**: Write operations are atomic (including stream records, index updates, and side effects). Concurrent writes use the configured backend's transaction and locking model. - **Access control**: IAM policies enforce least-privilege access. Explicit Deny always takes precedence. - **Credential security**: Access key secrets are encrypted at rest (AES-256-GCM). Console passwords are hashed (bcrypt). - **Transport security**: TLS encrypts data in transit between clients and extenddb. TLS is mandatory — the server refuses to start without it. @@ -17,14 +17,14 @@ This document describes the security architecture of extenddb, including the thr ### Trust Boundaries 1. **Client ↔ extenddb**: Untrusted. All input is validated. SigV4 signatures are verified. IAM policies are evaluated. -2. **extenddb ↔ PostgreSQL**: Trusted network. The PostgreSQL connection string contains credentials. Use TLS for the PostgreSQL connection in production (`sslmode=require` in the connection string). +2. **extenddb ↔ storage backend**: Trusted network. The backend connection string contains credentials. Use backend transport encryption in production (for example, `sslmode=require` in a PostgreSQL connection string). 3. **Admin ↔ Management API/Console**: Authenticated via admin credentials or IAM user credentials. CSRF tokens protect the web console. ### Out of Scope -- **PostgreSQL security**: extenddb relies on PostgreSQL access controls and network security. Securing the PostgreSQL instance (firewall rules, TLS, authentication) is the operator's responsibility. +- **Storage security**: extenddb relies on backend access controls and network security. Securing the storage cluster (firewall rules, TLS, authentication) is the operator's responsibility. - **Operating system security**: File permissions on `extenddb.toml`, TLS keys, and the PID file are the operator's responsibility. -- **Key management**: Access key secrets are encrypted with a locally generated AES key stored in the catalog database. 
For HSM-grade key management, use a KMS-backed encryption layer at the PostgreSQL level. +- **Key management**: Access key secrets are encrypted with a locally generated AES key stored in the catalog database. For HSM-grade key management, use a KMS-backed encryption layer at the storage backend level. ## Authentication @@ -56,7 +56,7 @@ extenddb uses SigV4 signature verification with a local credential store and IAM - Secret keys encrypted with AES-256-GCM using a per-catalog encryption key - Encryption key generated during `extenddb init` and stored in the catalog database - Console passwords hashed with bcrypt (cost factor 12) -- No in-process credential cache — every request reads directly from PostgreSQL +- No in-process credential cache — every request reads directly from the catalog store ## Authorization @@ -81,7 +81,7 @@ Policy sources collected for evaluation: - **Unparseable policies deny**: A stored policy that cannot be parsed results in access denied, not silent skip. A corrupted Deny policy still denies; a corrupted Allow policy is treated as absent. - **Auth before JSON parse**: SigV4 signature verification runs before the request body is parsed. Invalid signatures are rejected with constant-time comparison before any business logic executes. -- **Concurrent policy fetching**: Identity policies, group policies, and permissions boundaries are fetched concurrently from PostgreSQL. All must succeed for evaluation to proceed. +- **Concurrent policy fetching**: Identity policies, group policies, and permissions boundaries are fetched concurrently from the catalog store. All must succeed for evaluation to proceed. - **Constant-time rejection for inactive keys**: Inactive or expired access keys are rejected without timing differences that could reveal key existence. - **Policy document validation on write**: Policy documents are validated for JSON structure and size-capped (6,144 bytes) when attached via the management API. 
Invalid documents are rejected before storage. - **Expression depth and token limits**: Expression parsing enforces configurable depth (default 150) and token limits (default 4,096) to prevent resource exhaustion. @@ -232,7 +232,9 @@ Import/export file paths are validated: ### Backup and Recovery -extenddb stores all state in PostgreSQL. Use standard PostgreSQL tools for backup and recovery: +Use the configured storage backend's native backup and recovery tools. + +For PostgreSQL: ```bash pg_dump extenddb_catalog > catalog_backup.sql @@ -241,6 +243,8 @@ pg_dump extenddb_catalog_data > data_backup.sql Encryption keys are stored in the catalog database. A catalog backup includes the encryption key needed to decrypt access key secrets. +For TiDB, use native BR-backed backups through `[storage.tidb.backup]` or operate BR directly for cluster-level recovery. + --- ## License diff --git a/docs/manuals/11-deployment-guide.md b/docs/manuals/11-deployment-guide.md index 7b0a2d2..ba0a52f 100755 --- a/docs/manuals/11-deployment-guide.md +++ b/docs/manuals/11-deployment-guide.md @@ -6,23 +6,23 @@ This guide covers deploying extenddb in various environments beyond local develo ## Architecture Overview -extenddb is a single Rust binary that connects to PostgreSQL. All state lives in PostgreSQL — extenddb itself is stateless (no in-process caching). This means: +extenddb is a single Rust binary that connects to a configured storage backend. All durable state lives in that backend — extenddb itself is stateless (no in-process caching). 
This means: -- Multiple extenddb instances can share a PostgreSQL catalog (with caveats — see Multi-Instance below) -- Standard PostgreSQL HA, backup, and replication tools provide durability -- extenddb can run anywhere PostgreSQL is reachable +- Multiple extenddb instances can share a catalog (with caveats — see Multi-Instance below) +- Backend-native HA, backup, and replication tools provide durability +- extenddb can run anywhere the configured storage backend is reachable ## Deployment Models ### Single-Node -extenddb and PostgreSQL on the same host. Simplest setup, suitable for development, CI, and small workloads. +extenddb and the storage backend on the same host. Simplest setup, suitable for development, CI, and small workloads. ``` ┌─────────────────────────┐ │ Host │ │ ┌─────┐ ┌──────────┐ │ -│ │ extenddb│──│PostgreSQL │ │ +│ │ extenddb│──│Storage DB │ │ │ └─────┘ └──────────┘ │ └─────────────────────────┘ ``` @@ -34,19 +34,19 @@ extenddb serve --config extenddb.toml ### Separated Database -extenddb on an application server, PostgreSQL on a dedicated database server or managed service. +extenddb on an application server, storage backend on a dedicated database server or managed service. 
``` ┌──────────┐ ┌──────────────┐ │ App Host│──────▶│ DB Host │ -│ extenddb │ │ PostgreSQL │ +│ extenddb │ │ Storage DB │ └──────────┘ └──────────────┘ ``` ```bash extenddb init \ - --pg-host db.example.com \ - --pg-pass <password> + --storage-host db.example.com \ + --storage-admin-password <password> ``` Configure the connection string in `extenddb.toml`: @@ -71,14 +71,33 @@ extenddb works with any PostgreSQL 14+ service: ```bash # Amazon RDS / Aurora example extenddb init \ - --pg-host mydb.cluster-abc123.us-east-1.rds.amazonaws.com \ - --pg-user extenddb_admin \ - --pg-pass <password> + --storage-host mydb.cluster-abc123.us-east-1.rds.amazonaws.com \ + --storage-admin-user extenddb_admin \ + --storage-admin-password <password> ``` +### Managed TiDB + +When built with the `tidb` feature, extenddb can use TiDB's MySQL-compatible endpoint: + +```toml +[storage] +backend = "tidb" + +[storage.tidb] +connection_string = "mysql://extenddb:<password>@tidb.example.com:4000/extenddb_catalog" + +[storage.tidb.backup] +pd_endpoint = "pd.example.com:2379" +storage_uri = "s3://extenddb-backups/prod" +send_credentials_to_tikv = false +``` + +Use TiDB-native HA for the cluster and BR for physical backup/restore. + ### Containerized -extenddb runs in Docker or Kubernetes. The binary has no runtime dependencies beyond libc and network access to PostgreSQL. +extenddb runs in Docker or Kubernetes. The binary has no runtime dependencies beyond libc and network access to the configured storage backend. Example Dockerfile: @@ -124,7 +143,7 @@ For Kubernetes, run `extenddb init` as an init container or a one-time Job, then extenddb requires no internet connectivity. All functionality is self-contained in the binary. Build on a connected host, transfer the binary and `extenddb.toml` to the air-gapped environment, and run. 
Requirements in the air-gapped environment: -- PostgreSQL 14+ (reachable from the extenddb host) +- A supported storage backend reachable from the extenddb host - The `extenddb` binary (statically linked or with matching libc) ## Production Checklist @@ -148,13 +167,13 @@ Requirements in the air-gapped environment: - [ ] Firewall: allow only necessary ports (extenddb port, PostgreSQL port) - [ ] Consider a reverse proxy (nginx, HAProxy) for TLS termination, rate limiting, and access logging -### Database +### Storage Backend -- [ ] Use a dedicated PostgreSQL user for extenddb with minimal privileges -- [ ] Configure PostgreSQL `max_connections` ≥ extenddb `pool_size` + 3 -- [ ] Enable PostgreSQL TLS -- [ ] Set up automated backups (pg_dump, WAL archiving, or managed service snapshots) -- [ ] Monitor PostgreSQL disk usage, connection count, and query performance +- [ ] Use a dedicated backend user for extenddb with minimal privileges +- [ ] Size backend connection limits for `pool_size + catalog_pool_size + workers` +- [ ] Enable backend transport encryption +- [ ] Set up automated backups with backend-native tools (PostgreSQL pg_dump/WAL or TiDB BR) +- [ ] Monitor backend disk usage, connection count, replication health, and query performance ### Monitoring @@ -203,12 +222,12 @@ sudo systemctl start extenddb ## Multi-Instance Considerations -Multiple extenddb instances can connect to the same PostgreSQL catalog. However: +Multiple extenddb instances can connect to the same catalog. 
However: -- extenddb does not cache database state in-process — every request reads directly from PostgreSQL +- extenddb does not cache database state in-process — every request reads directly from the storage backend - This means multiple instances see consistent data without cache invalidation -- PostgreSQL's connection pool and row-level locking handle concurrent access -- Ensure `pool_size × instance_count + 3 × instance_count ≤ PostgreSQL max_connections` +- The storage backend's connection pool and transaction model handle concurrent access +- Ensure `(pool_size + catalog_pool_size) × instance_count`, plus per-instance worker connections, fits within the backend's connection limit ## Performance Tuning @@ -221,7 +240,7 @@ The default `pool_size = 20` is suitable for moderate workloads. For high-concur pool_size = 50 # Increase for higher concurrency ``` -Ensure PostgreSQL `max_connections` accommodates the total pool size plus overhead. +Ensure the backend's connection limit accommodates the total pool size plus overhead. ### PostgreSQL Tuning diff --git a/docs/manuals/12-extending-extenddb-storage.md b/docs/manuals/12-extending-extenddb-storage.md index 52aa54c..2c9b376 100755 --- a/docs/manuals/12-extending-extenddb-storage.md +++ b/docs/manuals/12-extending-extenddb-storage.md @@ -4,13 +4,13 @@ ## Introduction -extenddb uses a fully trait-based storage abstraction. The default backend is PostgreSQL, implemented in the `storage-postgres` crate. This document explains the storage architecture, lists every trait a new backend must implement, and provides guidance for adding a new storage backend. +extenddb uses a fully trait-based storage abstraction. The default backend is PostgreSQL, implemented in the `storage-postgres` crate. TiDB is available as an optional in-tree backend in `storage-tidb`. This document explains the storage architecture, lists every trait a new backend must implement, and provides guidance for adding a new storage backend. 
-As of v0.0.81, the server crate has **no direct PostgreSQL dependencies**. All database access goes through traits defined in the `storage` and `auth` crates. PostgreSQL-specific code lives exclusively in `storage-postgres` and the `bin` crate's wiring layer. +As of v0.0.81, the server crate has **no direct database driver dependencies**. All database access goes through traits defined in the `storage` and `auth` crates. Backend-specific code lives in backend crates such as `storage-postgres` and `storage-tidb`, with the `bin` crate acting as the wiring layer. ## Architecture Overview -extenddb is organized as a Cargo workspace with seven crates: +extenddb is organized as a Cargo workspace with eight crates: ``` bin → CLI entry point (init, serve, stop, migrate, manage, etc.) @@ -20,9 +20,10 @@ core → Types, expressions, validation (sync, no async runtime) auth → SigV4 verification, policy evaluation (trait-based credential store) storage → Trait definitions and backend-agnostic utilities (ARN construction, key parsing) storage-postgres → PostgreSQL implementation of all storage traits +storage-tidb → TiDB implementation of all storage traits ``` -The key architectural principle: neither the `engine` nor the `server` crate touches any database directly. They receive trait objects and call their methods. The `storage` crate defines these traits with no database dependencies, and provides backend-agnostic utilities in `storage::util` (ARN construction, partition/sort key parsing, netstring encoding) that any backend can reuse. The `storage-postgres` crate implements the traits. The `bin` crate is the wiring layer that creates concrete PostgreSQL stores and passes them to the server. +The key architectural principle: neither the `engine` nor the `server` crate touches any database directly. They receive trait objects and call their methods. 
The `storage` crate defines these traits with no database dependencies, and provides backend-agnostic utilities in `storage::util` (ARN construction, partition/sort key parsing, netstring encoding) that any backend can reuse. Backend crates implement the traits. The `bin` crate is the wiring layer that creates concrete stores and passes them to the server. ## Trait Overview @@ -33,7 +34,7 @@ A new backend must implement **13 storage traits** plus the `CredentialStore` tr 2. `DataEngine` — item CRUD, query, scan, transactions 3. `MetadataEngine` — TTL, tags, table statistics 4. `StreamEngine` — DynamoDB Streams -5. `WorkerStore` — background worker operations (GSI propagation, TTL cleanup) +5. `WorkerStore` — background worker operations (control-plane transitions, TTL cleanup) **Management and operational** (defined in `crates/storage/src/`): 6. `ManagementStore` — IAM CRUD (users, groups, roles, policies, access keys, accounts) @@ -48,7 +49,7 @@ A new backend must implement **13 storage traits** plus the `CredentialStore` tr **Additionally**, the `auth` crate defines: 14. `CredentialStore` — access key and session credential lookup for SigV4 verification -Backends register at compile time using the `inventory` crate and are selected by name at startup. The `RuntimeHooks` trait allows backends to spawn backend-specific workers (PostgreSQL spawns 7). +Backends register at compile time using the `inventory` crate and are selected by name at startup. The `RuntimeHooks` trait allows backends to spawn backend-specific workers. 
## DynamoDB Data Path Traits @@ -64,14 +65,14 @@ Table lifecycle operations: | `delete_table` | Delete a table and all its data | | `describe_table` | Return full table metadata (status, key schema, indexes, size, item count) | | `list_tables` | Paginated list of table names for an account | -| `update_table` | Modify billing mode, throughput, deletion protection | +| `update_table` | Modify billing mode, throughput, stream specification, deletion protection, and GSI create/delete | | `table_key_info` | Lightweight metadata fetch (key schema, attribute definitions) for data ops | | `index_info` | Fetch metadata for a specific secondary index | Key design decisions: -- Tables have a lifecycle: CREATING → ACTIVE → DELETING → (gone). The `control_plane_delay_seconds` setting controls how long tables stay in CREATING before becoming ACTIVE. +- Tables have a lifecycle: CREATING → ACTIVE → UPDATING → ACTIVE → DELETING → (gone). The `control_plane_delay_seconds` setting controls how long tables stay in CREATING before becoming ACTIVE. - Tables are scoped by `account_id`. Multi-tenancy is a first-class concern. -- GSI creation can be asynchronous (CREATING → ACTIVE) with a configurable propagation delay. +- GSI creation is a control-plane transition. Backends persist the catalog intent first, then reconcile data artifacts from that durable state. TiDB uses this path for crash-safe GSI backfill while keeping ordinary GSI writes transactional. ### DataEngine @@ -116,7 +117,7 @@ TTL, tags, and table statistics: | `all_active_tables` | List active tables (all accounts) | Key design decisions: -- TTL deletion is a background process. The engine calls `find_expired_items` periodically, then deletes each item via `DataEngine::delete_item` (which handles index sync and stream capture). +- TTL deletion is backend-specific. Backends may use an indexed worker path, or a native database TTL feature when stream REMOVE records are not required. 
When a worker deletes expired items, it must call `DataEngine::delete_item` so index sync and stream capture remain correct. - Tags are stored by ARN string. - Table size refresh is a background operation that counts rows and sums sizes. @@ -148,9 +149,7 @@ Background worker operations: | Method | Purpose | |--------|---------| -| `activate_pending_tables` | Transition tables from CREATING to ACTIVE after delay | -| `activate_pending_gsis` | Transition GSIs from CREATING to ACTIVE after delay | -| `process_gsi_queue` | Process queued GSI index writes | +| `process_control_plane_transitions` | Recover and advance pending table lifecycle transitions | ## Management and Operational Traits @@ -297,7 +296,7 @@ Schema is managed via SQL migration files in `crates/storage-postgres/migrations | `003_auth.sql` | Full IAM schema (accounts, users, groups, roles, policies, access keys, sessions) | | `004_account_cascade.sql` | CASCADE constraints for account deletion | | `005_idempotency_tokens.sql` | idempotency_tokens table | -| `006_gsi_propagation_delay.sql` | GSI async propagation support | +| `006_gsi_consistency.sql` | Backend-specific GSI consistency metadata | | `007_stream_sequence.sql` | Stream sequence number generation | | `008_metrics.sql` | metrics_history table | | `009_login_attempts.sql` | login_attempts table for rate limiting | @@ -407,21 +406,21 @@ The PostgreSQL implementation makes backend-specific choices. 
These are implemen ## Summary of Traits and Implementations -| Trait | Defined In | PostgreSQL Implementation | Purpose | -|-------|---------------------------------------|--------------------------|---------| -| `TableEngine` | `storage/src/lib.rs` | `PostgresEngine` | Table lifecycle | -| `DataEngine` | `storage/src/lib.rs` | `PostgresEngine` | Item CRUD, query, scan, transactions | -| `MetadataEngine` | `storage/src/lib.rs` | `PostgresEngine` | TTL, tags, table statistics | -| `StreamEngine` | `storage/src/lib.rs` | `PostgresEngine` | DynamoDB Streams | -| `WorkerStore` | `storage/src/lib.rs` | `PostgresEngine` | Background workers | -| `BackupEngine` | `storage/src/lib.rs` | `PostgresEngine` | Backup and restore | -| `ManagementStore` | `storage/src/management_store/mod.rs` | `PostgresCatalogStore` | IAM CRUD | -| `AdminStore` | `storage/src/management_store/mod.rs` | `PostgresCatalogStore` | Admin users | -| `SettingsStore` | `storage/src/management_store/mod.rs` | `PostgresCatalogStore` | Runtime settings | -| `MetricsStore` | `storage/src/management_store/mod.rs` | `PostgresCatalogStore` | Historical metrics | -| `RateLimitStore` | `storage/src/management_store/mod.rs` | `PostgresCatalogStore` | Login rate limiting | -| `AuthorizationStore` | `storage/src/authorization_store.rs` | `PostgresCatalogStore` | Policy lookups | -| `Bootstrapper` | `storage/src/bootstrapper.rs` | `PostgresBootstrapper` | Init, destroy, migrate | -| `CredentialStore` | `auth/src/lib.rs` | `DbCredentialStore` | SigV4 credential lookup | - -All PostgreSQL-specific code lives in `crates/storage-postgres/`. The `server` crate has no direct database dependencies. Backends register at compile time via the `inventory` crate and are selected by name at startup. 
+| Trait | Defined In | PostgreSQL Implementation | TiDB Implementation | Purpose | +|-------|------------|---------------------------|---------------------|---------| +| `TableEngine` | `storage/src/lib.rs` | `PostgresEngine` | `TidbEngine` | Table lifecycle | +| `DataEngine` | `storage/src/lib.rs` | `PostgresEngine` | `TidbEngine` | Item CRUD, query, scan, transactions | +| `MetadataEngine` | `storage/src/lib.rs` | `PostgresEngine` | `TidbEngine` | TTL, tags, table statistics | +| `StreamEngine` | `storage/src/lib.rs` | `PostgresEngine` | `TidbEngine` | DynamoDB Streams | +| `WorkerStore` | `storage/src/lib.rs` | `PostgresEngine` | `TidbEngine` | Background workers | +| `BackupEngine` | `storage/src/lib.rs` | `PostgresEngine` | `TidbEngine` | Backup and restore | +| `ManagementStore` | `storage/src/management_store/mod.rs` | `PostgresCatalogStore` | `TidbCatalogStore` | IAM CRUD | +| `AdminStore` | `storage/src/management_store/mod.rs` | `PostgresCatalogStore` | `TidbCatalogStore` | Admin users | +| `SettingsStore` | `storage/src/management_store/mod.rs` | `PostgresCatalogStore` | `TidbCatalogStore` | Runtime settings | +| `MetricsStore` | `storage/src/management_store/mod.rs` | `PostgresCatalogStore` | `TidbCatalogStore` | Historical metrics | +| `RateLimitStore` | `storage/src/management_store/mod.rs` | `PostgresCatalogStore` | `TidbCatalogStore` | Login rate limiting | +| `AuthorizationStore` | `storage/src/authorization_store.rs` | `PostgresCatalogStore` | `TidbCatalogStore` | Policy lookups | +| `Bootstrapper` | `storage/src/bootstrapper.rs` | `PostgresBootstrapper` | `TidbBootstrapper` | Init, destroy, migrate | +| `CredentialStore` | `auth/src/lib.rs` | `DbCredentialStore` | `DbCredentialStore` | SigV4 credential lookup | + +Backend-specific code lives in backend crates such as `crates/storage-postgres/` and `crates/storage-tidb/`. The `server` crate has no direct database dependencies. 
Backends register at compile time via the `inventory` crate and are selected by name at startup. diff --git a/docs/technical-debt.md b/docs/technical-debt.md index 4b81913..87e9524 100755 --- a/docs/technical-debt.md +++ b/docs/technical-debt.md @@ -36,7 +36,7 @@ Last updated: 2026-05-04 (P112) | # | Item | Location | Priority | Origin | |---|------|----------|----------|--------| | C-1 | `--catalog-db` should be `Optional` for `init` without full config | `bin/cmd_init.rs:113` | Low | P1 | -| C-2 | Storage backend config field unused (always "postgres") | `bin/config.rs:55` | Low | P1 | +| C-2 | ~~Storage backend config field unused (always "postgres")~~ | ~~`bin/config.rs:55`~~ | ~~Low~~ | P1 | | C-3 | ~~GSI error matching via English substring `"does not exist"`~~ — now uses SQLSTATE `42P01` | `storage-postgres/gsi_queue.rs` | ~~Medium~~ | P25 | | C-4 | BigDecimal parsed on every comparison in expression evaluator | `core/expression/evaluator.rs:112` | Low | P4 | | C-5 | Stream shard list not cached per table (extra SQL round-trip per write) | `storage-postgres/lib.rs:1844` | Low | P10 | @@ -50,7 +50,7 @@ Last updated: 2026-05-04 (P112) | # | Item | Location | Priority | Origin | |---|------|----------|----------|--------| -| S-1 | Multi-instance safety: no lease table prevents multiple extenddb instances sharing one PostgreSQL database | — | High | P25 review | +| S-1 | Multi-instance safety: there is no lease table to prevent multiple extenddb instances from sharing one catalog database | — | High | P25 review | | S-2 | `--password` flag visible in process listings (`ps aux`) | `bin/cmd_manage.rs` | Low | P12b | | S-3 | Release tarballs not GPG-signed — blocked on key ownership, public key distribution, CI secrets management | `devtools/build-release` | Medium | P28 | diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 9162ad9..8a76a2c 100755 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -41,7 +41,7 @@ See `docs/local-postgres-setup.md` for
full setup instructions. ### `migration failed: ...` -**Cause:** The PostgreSQL database exists but the migration SQL failed (permissions, schema conflicts, etc.). +**Cause:** The storage database exists but the migration SQL failed (permissions, schema conflicts, etc.). **Fix:** Check the PostgreSQL logs (`~/pgdata/server.log`). Ensure the `extenddb` user has CREATE TABLE permissions on the `extenddb` database. @@ -59,7 +59,7 @@ See `docs/local-postgres-setup.md` for full setup instructions. ### `Database '' already exists. Run 'extenddb destroy --config ' first, then re-run 'extenddb init'.` -**Cause:** `extenddb init` detected that the catalog or data database already exists in PostgreSQL. To prevent accidental data loss, `extenddb init` refuses to proceed when either database is present. +**Cause:** `extenddb init` detected that the catalog or data database already exists in the configured storage backend. To prevent accidental data loss, `extenddb init` refuses to proceed when either database is present. **Fix:** If you want to re-initialize from scratch, run `extenddb destroy --config extenddb.toml` first to drop both databases, then run `extenddb init` again. If you want to keep the existing data and just apply migrations, use `extenddb migrate` instead. @@ -157,7 +157,7 @@ journalctl -t extenddb -f **Cause:** The background task that polls the `log_level` setting from the database could not connect. The server continues to run with the initial log level from the config file. -**Fix:** Verify the `connection_string` in `extenddb.toml` is correct and PostgreSQL is reachable. The log level can still be set via the config file; runtime changes via `extenddb settings set log_level` will not take effect until the server is restarted. +**Fix:** Verify the `connection_string` in `extenddb.toml` is correct and the configured storage backend is reachable. 
The log level can still be set via the config file; runtime changes via `extenddb settings set log_level` will not take effect until the server is restarted. ### `Invalid log_level '' in settings: ` @@ -278,13 +278,13 @@ If the health check fails, start extenddb. If it succeeds, check your `--endpoin ### SDK timeout errors -**Cause:** extenddb is running but slow to respond (e.g., PostgreSQL connection pool exhausted). +**Cause:** extenddb is running but slow to respond (e.g., storage backend connection pool exhausted). -**Fix:** Check `extenddb.toml` `[storage.postgres] pool_size` — increase if under heavy concurrent load. Check PostgreSQL logs for slow queries. +**Fix:** Check `extenddb.toml` `pool_size` under the active storage section (`[storage.postgres]` or `[storage.tidb]`) and increase it if under heavy concurrent load. Check backend logs for slow queries. -### Table stuck in CREATING or DELETING state +### Table stuck in CREATING, UPDATING, or DELETING state -**Cause:** The background transition poller processes status changes when notified by CreateTable/DeleteTable, or on a 60-second defensive sweep. If the server was stopped while a table was in a transitional state, the transition completes on the next server startup. +**Cause:** The background transition poller processes status changes when notified by CreateTable, UpdateTable, or DeleteTable, or on a 60-second defensive sweep. If the server was stopped while a table was in a transitional state, the transition completes on the next server startup. **Fix:** If a table appears stuck: 1. Check that extenddb is running (`extenddb status`). @@ -299,15 +299,15 @@ If the health check fails, start extenddb. If it succeeds, check your `--endpoin ### Failed to recover control plane transitions -**Cause:** At startup, extenddb attempts to complete any in-flight control plane transitions (CREATING→ACTIVE, DELETING→removed) left over from a previous server instance. 
This error means the recovery query failed, likely due to a database connectivity issue. +**Cause:** At startup, extenddb attempts to complete any in-flight control plane transitions (CREATING→ACTIVE, UPDATING→ACTIVE, DELETING→removed) left over from a previous server instance. This error means the recovery query failed, likely due to a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and that the catalog database is accessible. Tables may be stuck in CREATING or DELETING state until the issue is resolved. Once the database is reachable, restart extenddb to retry recovery. +**Fix:** Check database connectivity and that the catalog database is accessible. Tables may be stuck in CREATING, UPDATING, or DELETING state until the issue is resolved. Once the database is reachable, restart extenddb to retry recovery. ### Control plane transition poll failed -**Cause:** The background poller that processes CREATING→ACTIVE and DELETING→removed transitions encountered a database error. Tables in transitional states will remain stuck until the poller succeeds. +**Cause:** The background poller that processes CREATING→ACTIVE, UPDATING→ACTIVE, and DELETING→removed transitions encountered a database error. Tables in transitional states will remain stuck until the poller succeeds. -**Fix:** Check PostgreSQL connectivity. The poller retries on the next wake (triggered by new CreateTable/DeleteTable requests or the 60-second defensive sweep). If the database is healthy and the error persists, check PostgreSQL logs for details. +**Fix:** Check database connectivity. The poller retries on the next wake (triggered by new control-plane requests or the 60-second defensive sweep). If the database is healthy and the error persists, check backend database logs for details. ## Management API @@ -315,7 +315,7 @@ If the health check fails, start extenddb. 
If it succeeds, check your `--endpoin **Cause:** The management API could not query the `admin_users` table to verify credentials. The database may be unreachable or the catalog schema may be corrupted. -**Fix:** Check PostgreSQL connectivity and that the `admin_users` table exists in the catalog database. Run `extenddb verify --config extenddb.toml` to check catalog health. +**Fix:** Check storage backend connectivity and that the `admin_users` table exists in the catalog database. Run `extenddb verify --config extenddb.toml` to check catalog health. ### `Management API: bcrypt hash failed: ` @@ -327,121 +327,121 @@ If the health check fails, start extenddb. If it succeeds, check your `--endpoin **Cause:** The `INSERT INTO admin_users` query failed for a reason other than a unique constraint violation (which returns 409 Conflict). Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: list admins failed: ` **Cause:** The `SELECT FROM admin_users` query failed. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: delete admin failed: ` **Cause:** The `DELETE FROM admin_users` query failed. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: change password failed: ` **Cause:** The `UPDATE admin_users` query failed. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: create account failed: ` **Cause:** The `INSERT INTO accounts` query failed for a reason other than a unique constraint violation (which returns 409 Conflict). Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. 
+**Fix:** Check storage backend connectivity and logs. ### `Management API: list accounts failed: ` **Cause:** The `SELECT FROM accounts` query failed. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: check tables failed: ` **Cause:** During account deletion, the query to check whether the account owns tables failed. The delete was not attempted. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: delete account failed: ` **Cause:** The `DELETE FROM accounts` query failed. Likely a database connectivity issue or an unexpected FK constraint violation. -**Fix:** Check PostgreSQL connectivity and logs. If the error mentions a foreign key violation, ensure all IAM entities for the account have been cleaned up (this should happen automatically via CASCADE). +**Fix:** Check storage backend connectivity and logs. If the error mentions a foreign key violation, ensure all IAM entities for the account have been cleaned up (this should happen automatically via CASCADE). ### `Management API: begin transaction failed: ` **Cause:** The management API could not start a database transaction. Likely a database connectivity or pool exhaustion issue. -**Fix:** Check PostgreSQL connectivity. If the management pool (2 connections) is exhausted, wait and retry. +**Fix:** Check storage backend connectivity. If the management pool (2 connections) is exhausted, wait and retry. ### `Management API: commit delete account failed: ` **Cause:** The account deletion succeeded but the transaction commit failed. The deletion was rolled back. Likely a database connectivity issue. -**Fix:** Retry the operation. Check PostgreSQL connectivity and logs. +**Fix:** Retry the operation. Check storage backend connectivity and logs. 
### `Management API: DB error during IAM user auth: ` **Cause:** The management API could not query the `iam_users` table to verify IAM user credentials. The database may be unreachable or the catalog schema may be corrupted. -**Fix:** Check PostgreSQL connectivity and that the `iam_users` table exists in the catalog database. Run `extenddb verify --config extenddb.toml` to check catalog health. +**Fix:** Check storage backend connectivity and that the `iam_users` table exists in the catalog database. Run `extenddb verify --config extenddb.toml` to check catalog health. ### `Management API: create IAM user failed: ` **Cause:** The `INSERT INTO iam_users` query failed for a reason other than a unique constraint or FK violation. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: seed self-service policy failed: ` **Cause:** After creating an IAM user, the default self-service policy could not be inserted. The user was created successfully but may lack the default policy. -**Fix:** Manually attach a self-service policy using `extenddb manage put-user-policy`. Check PostgreSQL connectivity. +**Fix:** Manually attach a self-service policy using `extenddb manage put-user-policy`. Check storage backend connectivity. ### `Management API: list IAM users failed: ` **Cause:** The `SELECT FROM iam_users` query failed. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: delete IAM user failed: ` **Cause:** The `DELETE FROM iam_users` query failed. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: tag IAM user failed: ` **Cause:** The `INSERT INTO iam_user_tags` query failed for a reason other than a FK violation. 
Likely a database connectivity issue. The tag transaction is rolled back — no partial tags are applied. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: commit tag transaction failed: ` **Cause:** All tag upserts succeeded but the transaction commit failed. Tags were rolled back. Likely a database connectivity issue. -**Fix:** Retry the operation. Check PostgreSQL connectivity and logs. +**Fix:** Retry the operation. Check storage backend connectivity and logs. ### `Management API: untag IAM user failed: ` **Cause:** The `DELETE FROM iam_user_tags` query failed. The untag transaction is rolled back — no partial deletes are applied. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: commit untag transaction failed: ` **Cause:** All tag deletions succeeded but the transaction commit failed. Deletions were rolled back. Likely a database connectivity issue. -**Fix:** Retry the operation. Check PostgreSQL connectivity and logs. +**Fix:** Retry the operation. Check storage backend connectivity and logs. ### `Management API: list IAM user tags failed: ` **Cause:** The `SELECT FROM iam_user_tags` query failed. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: encryption key not found in settings` @@ -453,7 +453,7 @@ If the health check fails, start extenddb. If it succeeds, check your `--endpoin **Cause:** The query to retrieve the encryption key from the `settings` table failed. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: encrypt secret key failed: ` @@ -465,31 +465,31 @@ If the health check fails, start extenddb. 
If it succeeds, check your `--endpoin **Cause:** The `INSERT INTO access_keys` query failed. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: list access keys failed: ` **Cause:** The `SELECT FROM access_keys` query failed. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: delete access key failed: ` **Cause:** The `DELETE FROM access_keys` query failed. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: change IAM user password failed: ` **Cause:** The `UPDATE iam_users` query to change the password failed. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: check allow_credential_import failed: ` **Cause:** The query to check the `allow_credential_import` runtime setting failed during an access key import. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. Retry the import operation. +**Fix:** Check storage backend connectivity and logs. Retry the import operation. ### `Management API: encrypt imported secret failed: ` @@ -501,73 +501,73 @@ If the health check fails, start extenddb. If it succeeds, check your `--endpoin **Cause:** The `INSERT INTO access_keys` query failed during an access key import for a reason other than a FK or unique constraint violation. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: create IAM group failed: ` **Cause:** The `INSERT INTO iam_groups` query failed for a reason other than a unique constraint or FK violation. 
Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: list IAM groups failed: ` **Cause:** The `SELECT FROM iam_groups` query failed. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: delete IAM group failed: ` **Cause:** The `DELETE FROM iam_groups` query failed. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: add group member failed: ` **Cause:** The `INSERT INTO iam_group_members` query failed for a reason other than a unique constraint or FK violation. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: remove group member failed: ` **Cause:** The `DELETE FROM iam_group_members` query failed. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: put user policy failed: ` **Cause:** The `INSERT INTO iam_policies` query failed for a user policy for a reason other than a FK violation. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: put group policy failed: ` **Cause:** The `INSERT INTO iam_policies` query failed for a group policy for a reason other than a FK violation. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: list user policies failed: ` **Cause:** The `SELECT FROM iam_policies` query failed for user policies. Likely a database connectivity issue. 
-**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: list group policies failed: ` **Cause:** The `SELECT FROM iam_policies` query failed for group policies. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: delete user policy failed: ` **Cause:** The `DELETE FROM iam_policies` query failed for a user policy. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: delete group policy failed: ` **Cause:** The `DELETE FROM iam_policies` query failed for a group policy. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ## IAM Role Management @@ -575,55 +575,55 @@ If the health check fails, start extenddb. If it succeeds, check your `--endpoin **Cause:** The `INSERT INTO iam_roles` query failed. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: list IAM roles failed: ` **Cause:** The `SELECT` query for listing IAM roles failed. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: delete IAM role failed: ` **Cause:** The `DELETE FROM iam_roles` query failed. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: tag IAM role failed: ` **Cause:** The `INSERT INTO iam_role_tags` query failed during a tag operation. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. 
### `Management API: untag IAM role failed: ` **Cause:** The `DELETE FROM iam_role_tags` query failed during an untag operation. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: list IAM role tags failed: ` **Cause:** The `SELECT` query for listing IAM role tags failed. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: put role policy failed: ` **Cause:** The `INSERT INTO iam_policies` query failed for a role policy. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: list role policies failed: ` **Cause:** The `SELECT` query for listing role policies failed. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: delete role policy failed: ` **Cause:** The `DELETE FROM iam_policies` query failed for a role policy. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ## AssumeRole @@ -631,7 +631,7 @@ If the health check fails, start extenddb. If it succeeds, check your `--endpoin **Cause:** The `SELECT` query to load the role and its trust policy failed during an AssumeRole operation. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: encrypt session secret failed: ` @@ -643,7 +643,7 @@ If the health check fails, start extenddb. If it succeeds, check your `--endpoin **Cause:** The `INSERT INTO iam_sessions` query failed when storing the temporary session. 
Likely a database connectivity issue or a unique constraint violation on the generated access key ID (extremely unlikely). -**Fix:** Check PostgreSQL connectivity and logs. Retry the assume-role operation. +**Fix:** Check storage backend connectivity and logs. Retry the assume-role operation. ## Permissions Boundaries @@ -651,37 +651,37 @@ If the health check fails, start extenddb. If it succeeds, check your `--endpoin **Cause:** The `INSERT INTO iam_permissions_boundaries` query failed for a user boundary. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: get user permissions boundary failed: ` **Cause:** The `SELECT` query for a user permissions boundary failed. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: delete user permissions boundary failed: ` **Cause:** The `DELETE FROM iam_permissions_boundaries` query failed for a user boundary. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: set role permissions boundary failed: ` **Cause:** The `INSERT INTO iam_permissions_boundaries` query failed for a role boundary. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: get role permissions boundary failed: ` **Cause:** The `SELECT` query for a role permissions boundary failed. Likely a database connectivity issue. -**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ### `Management API: delete role permissions boundary failed: ` **Cause:** The `DELETE FROM iam_permissions_boundaries` query failed for a role boundary. Likely a database connectivity issue. 
-**Fix:** Check PostgreSQL connectivity and logs. +**Fix:** Check storage backend connectivity and logs. ## DynamoDB Streams @@ -689,25 +689,25 @@ If the health check fails, start extenddb. If it succeeds, check your `--endpoin **Cause:** After a successful write (PutItem, DeleteItem, UpdateItem), extenddb tried to capture a stream record but could not determine which shard to assign it to. The data write succeeded — only the stream record is missing. -**Fix:** Check PostgreSQL connectivity. Verify the table's stream shards exist in the `stream_shards` table. If the table was created before streams were enabled, the shards may not have been initialized. +**Fix:** Check storage backend connectivity. Verify the table's stream shards exist in the `stream_shards` table. If the table was created before streams were enabled, the shards may not have been initialized. ### `Stream capture: failed to write record for : ` **Cause:** A stream record was constructed but could not be persisted to the `stream_records` table. The data write succeeded — only the stream record is missing. -**Fix:** Check PostgreSQL connectivity and disk space. If the error mentions a unique constraint violation, two writes to the same shard may have occurred in the same microsecond — retry the operation. +**Fix:** Check storage backend connectivity and disk space. If the error mentions a unique constraint violation, two writes to the same shard may have occurred in the same microsecond — retry the operation. ### `Stream capture: failed to get sequence number: ` **Cause:** extenddb could not generate a sequence number for a stream record. The data write succeeded — only the stream record is missing. -**Fix:** Check PostgreSQL connectivity. +**Fix:** Check storage backend connectivity. ### `Stream cleanup worker: ` **Cause:** The background worker that deletes stream records older than 24 hours encountered a database error. Expired records will accumulate until the worker succeeds. 
-**Fix:** Check PostgreSQL connectivity. The worker retries every hour automatically. +**Fix:** Check storage backend connectivity. The worker retries every hour automatically. ## Management Console Errors @@ -761,27 +761,31 @@ If the health check fails, start extenddb. If it succeeds, check your `--endpoin **Fix:** Use the full ARN format: `arn:aws:dynamodb:::table/`. You can get the ARN from `DescribeTable`. -## GSI Async Update Behavior +## PostgreSQL GSI Async Update Behavior ### GSI query returns stale data after a write -**Cause:** GSI updates are applied asynchronously with a configurable propagation delay (default 10ms). This matches real DynamoDB's eventually consistent GSI behavior. Each GSI can have its own `propagation_delay_ms` setting; the system-wide default is controlled by the `gsi_propagation_delay_ms` runtime setting. +**Cause:** On the PostgreSQL backend, GSI updates can be applied asynchronously with a configurable propagation delay (default 10ms). TiDB does not use this path; TiDB maintains native secondary indexes from the base table row. -**Fix:** This is expected behavior. For tests that query GSIs after writes, poll/retry the GSI query until the expected data appears. To make all GSIs synchronous for testing, set `extenddb settings set gsi_propagation_delay_ms 0`. For production-like testing, keep the default async delay. +**Fix:** For PostgreSQL tests that query GSIs immediately after writes, poll/retry the GSI query or set `extenddb settings set gsi_propagation_delay_ms 0`. No setting is needed for TiDB. ## Connection Pool Exhaustion ### HTTP 500 on all requests under heavy load -**Cause:** The PostgreSQL connection pool is exhausted. All connections are in use and new requests cannot acquire a connection within the timeout. extenddb currently returns HTTP 500 (Internal Server Error) instead of the more appropriate 503 (Service Unavailable). +**Cause:** The storage backend connection pool is exhausted. 
All connections are in use and new requests cannot acquire a connection within the timeout. extenddb currently returns HTTP 500 (Internal Server Error) instead of the more appropriate 503 (Service Unavailable). **Fix:** Increase the pool size in `extenddb.toml`: ```toml [storage.postgres] pool_size = 50 # default is 20 + +# or, for TiDB: +[storage.tidb] +pool_size = 50 ``` -If the problem persists, check for long-running queries or connection leaks with `SELECT * FROM pg_stat_activity WHERE datname = 'extenddb_data';`. +If the problem persists, check for long-running queries or connection leaks with the backend's session-inspection tools, such as PostgreSQL `pg_stat_activity` or TiDB's statement/cluster diagnostics. **Known limitation:** The HTTP status code should be 503 with a `Retry-After` header. This is tracked as technical debt. diff --git a/extenddb.sample.toml b/extenddb.sample.toml index 757bb9d..0aa03c8 100755 --- a/extenddb.sample.toml +++ b/extenddb.sample.toml @@ -14,6 +14,7 @@ # Environment variable overrides use the EXTENDDB__ prefix with __ as separator: # EXTENDDB__SERVER__PORT=9000 # EXTENDDB__STORAGE__POSTGRES__CONNECTION_STRING="postgresql://..." +# EXTENDDB__STORAGE__TIDB__CONNECTION_STRING="mysql://..." # Path to the rendered documentation directory (HTML + PDF files). # Generated by `python3 docs/build-docs.py`. When set, the web console @@ -29,7 +30,7 @@ # run_dir = "~/.extenddb/run" # Directory for PID file (~ is expanded to $HOME) [storage] -# backend = "postgres" # Storage backend (only "postgres" supported) +# backend = "postgres" # Storage backend: "postgres" or "tidb" [storage.postgres] # Connection string points to the CATALOG database. @@ -52,6 +53,26 @@ # DynamoDB request makes concurrent authz queries # — size this to match expected concurrency. +[storage.tidb] +# Optional TiDB backend. Build or run the binary with `--features tidb`, then set: +# [storage] +# backend = "tidb" +# Connection string points to the CATALOG database. 
TiDB listens on port 4000 by default. +# The `tidb://` scheme is accepted in config and converted to `mysql://` for sqlx. +# connection_string = "mysql://extenddb:extenddb-local-dev@localhost:4000/extenddb_catalog" +# pool_size = 20 +# catalog_pool_size = 20 +# +# TiDB native backup/restore uses BR. SQL BACKUP/RESTORE is experimental in +# TiDB, so ExtendDB shells out to BR when these fields are configured. +# [storage.tidb.backup] +# pd_endpoint = "127.0.0.1:2379" +# storage_uri = "local:///var/lib/extenddb/tidb-backups" +# log_storage_uri is reserved for future cluster-level BR log backup orchestration. +# binary = "tiup" # default; set to "br" when BR is installed directly +# component = "br" # default; set to "" when binary points to BR itself +# send_credentials_to_tikv = false + [auth] # provider = "builtin" # Auth provider: # "builtin" — SigV4 verification with local credential @@ -135,6 +156,16 @@ # pool_size = 50 # catalog_pool_size = 50 # +# Or, for TiDB builds: +# +# [storage] +# backend = "tidb" +# +# [storage.tidb] +# connection_string = "mysql://extenddb:<password>@tidb.example.com:4000/extenddb_catalog" +# pool_size = 50 +# catalog_pool_size = 50 +# # [auth] # provider = "builtin" # diff --git a/scripts/install-linux.sh b/scripts/install-linux.sh index 003af16..b3b9dee 100755 --- a/scripts/install-linux.sh +++ b/scripts/install-linux.sh @@ -139,7 +139,7 @@ echo " $PDF_DIR/" echo echo "Next steps:" echo " 1. Ensure PostgreSQL is running: pg_isready" -echo " 2. Initialize: extenddb init --catalog-db extenddb_catalog --pg-user postgres" +echo " 2. Initialize: extenddb init --catalog-db extenddb_catalog --storage-admin-user postgres" echo " 3. Verify: extenddb verify --config extenddb.toml" echo " 4. 
Start: extenddb serve --config extenddb.toml" echo diff --git a/scripts/install-macos.sh b/scripts/install-macos.sh index 6cd71ce..b0fe6f6 100755 --- a/scripts/install-macos.sh +++ b/scripts/install-macos.sh @@ -139,7 +139,7 @@ echo " $PDF_DIR/" echo echo "Next steps:" echo " 1. Ensure PostgreSQL is running: pg_isready" -echo " 2. Initialize: extenddb init --catalog-db extenddb_catalog --pg-user $(whoami)" +echo " 2. Initialize: extenddb init --catalog-db extenddb_catalog --storage-admin-user $(whoami)" echo " 3. Verify: extenddb verify --config extenddb.toml" echo " 4. Start: extenddb serve --config extenddb.toml" echo diff --git a/tests/cli/test-cli-comprehensive.sh b/tests/cli/test-cli-comprehensive.sh index 9621e9b..05fd96a 100755 --- a/tests/cli/test-cli-comprehensive.sh +++ b/tests/cli/test-cli-comprehensive.sh @@ -1139,7 +1139,7 @@ $EXTENDDB settings list --config "$CONFIG" >/dev/null 2>&1; RC=$? assert_fail $RC "settings with missing config fails" # Init with invalid PG credentials -$EXTENDDB init --config "$CONFIG" --overwrite --pg-user nonexistent_user --pg-host localhost >/dev/null 2>&1; RC=$? +$EXTENDDB init --config "$CONFIG" --overwrite --storage-admin-user nonexistent_user --storage-host localhost >/dev/null 2>&1; RC=$? 
assert_fail $RC "init with bad PG user fails" rm -f "$CONFIG" diff --git a/tests/rust/src/backup_restore.rs b/tests/rust/src/backup_restore.rs index 65b2f24..996dc89 100755 --- a/tests/rust/src/backup_restore.rs +++ b/tests/rust/src/backup_restore.rs @@ -34,6 +34,57 @@ async fn make_table(name: &str) { wait_for_active(c, name).await; } +async fn make_range_table(name: &str, sk_type: ScalarAttributeType) { + let c = client(); + c.create_table() + .table_name(name) + .key_schema( + KeySchemaElement::builder() + .attribute_name("pk") + .key_type(KeyType::Hash) + .build() + .unwrap(), + ) + .key_schema( + KeySchemaElement::builder() + .attribute_name("sk") + .key_type(KeyType::Range) + .build() + .unwrap(), + ) + .attribute_definitions( + AttributeDefinition::builder() + .attribute_name("pk") + .attribute_type(ScalarAttributeType::S) + .build() + .unwrap(), + ) + .attribute_definitions( + AttributeDefinition::builder() + .attribute_name("sk") + .attribute_type(sk_type) + .build() + .unwrap(), + ) + .billing_mode(BillingMode::PayPerRequest) + .send() + .await + .unwrap(); + wait_for_active(c, name).await; +} + +fn sort_value( + sk_type: &ScalarAttributeType, + index: i64, +) -> aws_sdk_dynamodb::types::AttributeValue { + match sk_type { + ScalarAttributeType::S => s(&format!("sort_{index}")), + ScalarAttributeType::N => n(index), + ScalarAttributeType::B => b(&format!("sort_{index}")), + other => panic!("unexpected sort key type: {other:?}"), + } +} + /// Create a backup, retrying on `ContinuousBackupsUnavailableException` /// (real DynamoDB needs time for continuous backups to initialize). 
async fn create_backup_with_retry( @@ -213,6 +264,62 @@ async fn restore_table_from_backup() { c.delete_table().table_name(&table).send().await.ok(); } +#[tokio::test] +async fn restore_table_from_backup_with_sort_keys() { + let c = client(); + let cases = [ + ScalarAttributeType::S, + ScalarAttributeType::N, + ScalarAttributeType::B, + ]; + + for sk_type in cases { + let table = format!("RestoreRange_{sk_type:?}_{}", ts()); + make_range_table(&table, sk_type.clone()).await; + + for i in 0..3 { + c.put_item() + .table_name(&table) + .item("pk", s("partition")) + .item("sk", sort_value(&sk_type, i)) + .item("data", s(&format!("val_{i}"))) + .send() + .await + .unwrap(); + } + + let create = create_backup_with_retry(c, &table, "restore-range-backup").await; + let arn = create.backup_details().unwrap().backup_arn().to_string(); + + let restored = format!("RestoredRange_{sk_type:?}_{}", ts()); + c.restore_table_from_backup() + .target_table_name(&restored) + .backup_arn(&arn) + .send() + .await + .unwrap(); + + wait_for_active(c, &restored).await; + let scan = c.scan().table_name(&restored).send().await.unwrap(); + assert_eq!(scan.count(), 3); + + let resp = c + .get_item() + .table_name(&restored) + .key("pk", s("partition")) + .key("sk", sort_value(&sk_type, 1)) + .consistent_read(true) + .send() + .await + .unwrap(); + let item = resp.item().expect("restored range-key item exists"); + assert_eq!(item.get("data"), Some(&s("val_1"))); + + c.delete_table().table_name(&restored).send().await.ok(); + c.delete_table().table_name(&table).send().await.ok(); + } +} + #[tokio::test] async fn describe_continuous_backups() { let c = client(); @@ -282,7 +389,10 @@ async fn restore_table_to_point_in_time() { .use_latest_restorable_time(true) .send() .await; - assert!(err.is_err(), "RestoreTableToPointInTime should return an error (not yet supported)"); + assert!( + err.is_err(), + "RestoreTableToPointInTime should return an error (not yet supported)" + ); 
c.delete_table().table_name(&table).send().await.ok(); } diff --git a/tests/test_cli_lifecycle.py b/tests/test_cli_lifecycle.py index 7dd4af3..1f2694c 100755 --- a/tests/test_cli_lifecycle.py +++ b/tests/test_cli_lifecycle.py @@ -78,12 +78,12 @@ def _fail_if_no_binary(): def _pg_args(): - """Return --pg-user and --pg-pass args for commands that connect as admin.""" + """Return --storage-admin-user and --storage-admin-password args for commands that connect as admin.""" args = [] if PG_USER: - args.extend(["--pg-user", PG_USER]) + args.extend(["--storage-admin-user", PG_USER]) if PG_PASS: - args.extend(["--pg-pass", PG_PASS]) + args.extend(["--storage-admin-password", PG_PASS]) return args @@ -106,8 +106,8 @@ def _patch_config_port(config_path, port): def _init_args(cli_env): """Return CLI args for extenddb init including all connection details.""" args = list(_pg_args()) - args.extend(["--pg-host", cli_env["pg_host"]]) - args.extend(["--pg-port", cli_env["pg_port"]]) + args.extend(["--storage-host", cli_env["pg_host"]]) + args.extend(["--storage-port", cli_env["pg_port"]]) args.extend(["--catalog-db", cli_env["db_name"]]) if PG_USER: args.extend(["--extenddb-user", PG_USER])