From 6b16b3027337ef13f6b7eae5f62b0429c5a1bbb2 Mon Sep 17 00:00:00 2001 From: "michael.richey" Date: Thu, 23 Apr 2026 10:06:31 -0400 Subject: [PATCH 1/3] feat(sync): add --minimize-reads with type-scoped storage loading For large orgs (10,000+ resources), sync-cli loads all resource files from cloud storage even when syncing a small subset. This causes Phase 2 (roles) and Phase 3 (users) to take ~25 minutes each, leaving no time for Phase 6. Adds --minimize-reads flag (sync command only) that scopes storage reads to only the requested resource type(s). For --resources=roles with 10,000 total resources, this reduces reads from ~20,000 to ~100 (10x per backend per origin). Constraints: requires --resource-per-file and --resources; not compatible with --cleanup; sync command only. ID-targeted loading (for exact filter matches) added in the next PR. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- datadog_sync/commands/sync.py | 17 +- datadog_sync/utils/configuration.py | 18 ++ datadog_sync/utils/state.py | 5 +- tests/unit/test_minimize_reads_type_scoped.py | 191 ++++++++++++++++++ 4 files changed, 229 insertions(+), 2 deletions(-) create mode 100644 tests/unit/test_minimize_reads_type_scoped.py diff --git a/datadog_sync/commands/sync.py b/datadog_sync/commands/sync.py index 4f8e93e07..65bd5eaf2 100644 --- a/datadog_sync/commands/sync.py +++ b/datadog_sync/commands/sync.py @@ -3,9 +3,10 @@ # This product includes software developed at Datadog (https://www.datadoghq.com/). # Copyright 2019 Datadog, Inc. -from click import command +from click import command, option from datadog_sync.commands.shared.options import ( + CustomOptionClass, common_options, destination_auth_options, diffs_options, @@ -24,6 +25,20 @@ @diffs_options @sync_options @storage_options +@option( + "--minimize-reads", + required=False, + is_flag=True, + default=False, + show_default=True, + help="Minimize cloud storage reads by loading only needed resources. " + "Uses ID-targeted fetching when filters specify exact IDs, " + "otherwise falls back to type-scoped loading. " + "Requires --resource-per-file and --resources. " + "Must not be combined with --cleanup. " + "Only available on the sync command.", + cls=CustomOptionClass, +) def sync(**kwargs): """Sync Datadog resources to destination.""" run_cmd(Command.SYNC, **kwargs) diff --git a/datadog_sync/utils/configuration.py b/datadog_sync/utils/configuration.py index e5bbd99de..e389f3978 100644 --- a/datadog_sync/utils/configuration.py +++ b/datadog_sync/utils/configuration.py @@ -306,6 +306,20 @@ def build_config(cmd: Command, **kwargs: Optional[Any]) -> Configuration: ) resource_per_file = kwargs.get(RESOURCE_PER_FILE, False) + minimize_reads = kwargs.get("minimize_reads", False) + + # Validate --minimize-reads constraints + if minimize_reads: + if not resource_per_file: + raise click.UsageError("--minimize-reads requires --resource-per-file") + if not kwargs.get("resources", None): + raise click.UsageError("--minimize-reads requires --resources") + + # Determine type-scoped loading (ID-targeted loading added in PR 3) + _state_resource_types = None # None = full load (existing behavior) + if minimize_reads and (rs := kwargs.get("resources", None)): + _state_resource_types = [r.strip().lower() for r in rs.split(",") if r.strip()] + # Initialize state state = State( type_=storage_type, @@ -313,8 +327,12 @@ def build_config(cmd: Command, **kwargs: Optional[Any]) -> Configuration: destination_resources_path=destination_resources_path, config=config, resource_per_file=resource_per_file, + resource_types=_state_resource_types, # None = full load ) + if _state_resource_types is not None: + logger.info(f"minimize-reads: type-scoped loading for {_state_resource_types}") + # Initialize Configuration config = Configuration( logger=logger, diff --git a/datadog_sync/utils/state.py b/datadog_sync/utils/state.py index 85a9dd790..f88fce56e 100644 --- a/datadog_sync/utils/state.py +++ b/datadog_sync/utils/state.py @@ -22,6 +22,8 @@ class State: def __init__(self, type_: StorageType = StorageType.LOCAL_FILE, **kwargs: object) -> None: + self._resource_types = kwargs.get("resource_types", None) + self._minimize_reads = self._resource_types is not None resource_per_file = kwargs.get(RESOURCE_PER_FILE, False) source_resources_path = kwargs.get(SOURCE_PATH_PARAM, SOURCE_PATH_DEFAULT) destination_resources_path = kwargs.get(DESTINATION_PATH_PARAM, DESTINATION_PATH_DEFAULT) @@ -76,7 +78,8 @@ def destination(self): return self._data.destination def load_state(self, origin: Origin = Origin.ALL) -> None: - self._data = self._storage.get(origin) + # MODIFIED: resource_types param added. None = full load (existing behavior). + self._data = self._storage.get(origin, resource_types=self._resource_types) def dump_state(self, origin: Origin = Origin.ALL) -> None: self._storage.put(origin, self._data) diff --git a/tests/unit/test_minimize_reads_type_scoped.py b/tests/unit/test_minimize_reads_type_scoped.py new file mode 100644 index 000000000..65179b991 --- /dev/null +++ b/tests/unit/test_minimize_reads_type_scoped.py @@ -0,0 +1,191 @@ +# Unless explicitly stated otherwise all files in this repository are licensed +# under the 3-clause BSD style license (see LICENSE). +# This product includes software developed at Datadog (https://www.datadoghq.com/). +# Copyright 2019 Datadog, Inc. + +"""Tests for PR 2: type-scoped loading via --minimize-reads flag (type-scoped strategy).""" + +import json +from pathlib import Path +from unittest.mock import MagicMock, call, patch + +import pytest +from click.testing import CliRunner + +from datadog_sync.cli import cli +from datadog_sync.constants import Origin +from datadog_sync.utils.storage._base_storage import StorageData +from datadog_sync.utils.storage.local_file import LocalFile +from datadog_sync.utils.storage.storage_types import StorageType + + +@pytest.fixture +def runner(): + return CliRunner(mix_stderr=True) + + +class TestStateTypeScopedLoading: + def test_state_loads_only_requested_types_when_type_scoped(self, tmp_path): + """State with resource_types only loads those types from storage.""" + src_path = str(tmp_path / "source") + dst_path = str(tmp_path / "dest") + Path(src_path).mkdir() + Path(dst_path).mkdir() + + # Write two resource types to storage + backend = LocalFile(source_resources_path=src_path, destination_resources_path=dst_path, resource_per_file=True) + data = StorageData() + data.source["roles"]["role-1"] = {"id": "role-1", "name": "Admin"} + data.source["dashboards"]["dash-1"] = {"id": "dash-1", "title": "My Dash"} + backend.put(Origin.SOURCE, data) + + # Load only roles + result = backend.get(Origin.SOURCE, resource_types=["roles"]) + assert "role-1" in result.source["roles"] + assert "dash-1" not in result.source["dashboards"], "dashboards should not be loaded" + + def test_state_loads_all_when_no_resource_types(self, tmp_path): + """Regression: no resource_types loads all (existing behavior unchanged).""" + src_path = str(tmp_path / "source") + dst_path = str(tmp_path / "dest") + Path(src_path).mkdir() + Path(dst_path).mkdir() + + backend = LocalFile(source_resources_path=src_path, destination_resources_path=dst_path, resource_per_file=True) + data = StorageData() + data.source["roles"]["role-1"] = {"id": "role-1"} + data.source["dashboards"]["dash-1"] = {"id": "dash-1"} + backend.put(Origin.SOURCE, data) + + result = backend.get(Origin.SOURCE, resource_types=None) + assert "role-1" in result.source["roles"] + assert "dash-1" in result.source["dashboards"] + + def test_state_minimize_reads_flag(self, tmp_path): + """State._minimize_reads is True when resource_types is set.""" + from datadog_sync.utils.state import State + + src_path = str(tmp_path / "source") + dst_path = str(tmp_path / "dest") + Path(src_path).mkdir() + Path(dst_path).mkdir() + + state = State( + type_=StorageType.LOCAL_FILE, + source_resources_path=src_path, + destination_resources_path=dst_path, + resource_types=["roles"], + ) + assert state._minimize_reads is True + + def test_state_no_minimize_reads_by_default(self, tmp_path): + """Regression: State._minimize_reads is False when resource_types is None.""" + from datadog_sync.utils.state import State + + src_path = str(tmp_path / "source") + dst_path = str(tmp_path / "dest") + Path(src_path).mkdir() + Path(dst_path).mkdir() + + state = State( + type_=StorageType.LOCAL_FILE, + source_resources_path=src_path, + destination_resources_path=dst_path, + ) + assert state._minimize_reads is False + + +class TestMinimizeReadsCLIValidation: + def test_minimize_reads_requires_resource_per_file(self, runner): + """--minimize-reads without --resource-per-file must fail with error.""" + result = runner.invoke( + cli, + [ + "sync", + "--minimize-reads", + "--resources=roles", + "--source-api-key=x", + "--destination-api-key=y", + ], + ) + assert result.exit_code != 0 + assert "resource-per-file" in (result.output + str(result.exception)).lower() + + def test_minimize_reads_requires_resources_flag(self, runner): + """--minimize-reads without --resources must fail with error.""" + result = runner.invoke( + cli, + [ + "sync", + "--minimize-reads", + "--resource-per-file", + "--source-api-key=x", + "--destination-api-key=y", + ], + ) + assert result.exit_code != 0 + assert "resources" in (result.output + str(result.exception)).lower() + + def test_minimize_reads_not_available_on_diffs_command(self, runner): + """--minimize-reads must not be accepted by the diffs command.""" + result = runner.invoke( + cli, + ["diffs", "--minimize-reads", "--source-api-key=x", "--destination-api-key=y"], + ) + assert result.exit_code != 0 + assert "no such option" in result.output.lower() + + def test_minimize_reads_not_available_on_import_command(self, runner): + """--minimize-reads must not be accepted by the import command.""" + result = runner.invoke( + cli, + ["import", "--minimize-reads", "--source-api-key=x"], + ) + assert result.exit_code != 0 + assert "no such option" in result.output.lower() + + +class TestS3TypeScopedGet: + def test_s3_get_type_scoped_uses_per_type_prefix(self): + """S3 get() with resource_types uses type-specific prefix, not broad prefix.""" + with patch("datadog_sync.utils.storage.aws_s3_bucket.boto3") as mock_boto3: + mock_client = MagicMock() + mock_boto3.client.return_value = mock_client + + # Empty paginated response + mock_client.list_objects_v2.return_value = {"IsTruncated": False} + + from datadog_sync.utils.storage.aws_s3_bucket import AWSS3Bucket + backend = AWSS3Bucket( + source_resources_path="resources/source", + destination_resources_path="resources/destination", + config={"aws_bucket_name": "test-bucket", "aws_region_name": "us-east-1", + "aws_access_key_id": "", "aws_secret_access_key": "", "aws_session_token": ""}, + ) + + backend.get(Origin.SOURCE, resource_types=["roles"]) + + # Should list with type-specific prefix, not broad prefix + call_kwargs = mock_client.list_objects_v2.call_args_list[0][1] + assert call_kwargs["Prefix"] == "resources/source/roles." + assert "dashboard" not in call_kwargs["Prefix"] + + def test_s3_get_broad_prefix_when_no_resource_types(self): + """S3 get() without resource_types uses the broad prefix (existing behavior).""" + with patch("datadog_sync.utils.storage.aws_s3_bucket.boto3") as mock_boto3: + mock_client = MagicMock() + mock_boto3.client.return_value = mock_client + mock_client.list_objects_v2.return_value = {"IsTruncated": False} + + from datadog_sync.utils.storage.aws_s3_bucket import AWSS3Bucket + backend = AWSS3Bucket( + source_resources_path="resources/source", + destination_resources_path="resources/destination", + config={"aws_bucket_name": "test-bucket", "aws_region_name": "us-east-1", + "aws_access_key_id": "", "aws_secret_access_key": "", "aws_session_token": ""}, + ) + + backend.get(Origin.SOURCE, resource_types=None) + + call_kwargs = mock_client.list_objects_v2.call_args_list[0][1] + assert call_kwargs["Prefix"] == "resources/source" From 4745702a56fb57126ab79b1e452da3a08bb804aa Mon Sep 17 00:00:00 2001 From: "michael.richey" Date: Thu, 23 Apr 2026 11:06:59 -0400 Subject: [PATCH 2/3] style: apply black formatting and fix ruff warnings Co-Authored-By: Claude Sonnet 4.6 (1M context) --- datadog_sync/commands/sync.py | 10 ++++---- tests/unit/test_minimize_reads_type_scoped.py | 23 ++++++++++++++----- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/datadog_sync/commands/sync.py b/datadog_sync/commands/sync.py index 65bd5eaf2..a000837ef 100644 --- a/datadog_sync/commands/sync.py +++ b/datadog_sync/commands/sync.py @@ -32,11 +32,11 @@ default=False, show_default=True, help="Minimize cloud storage reads by loading only needed resources. " - "Uses ID-targeted fetching when filters specify exact IDs, " - "otherwise falls back to type-scoped loading. " - "Requires --resource-per-file and --resources. " - "Must not be combined with --cleanup. " - "Only available on the sync command.", + "Uses ID-targeted fetching when filters specify exact IDs, " + "otherwise falls back to type-scoped loading. " + "Requires --resource-per-file and --resources. " + "Must not be combined with --cleanup. " + "Only available on the sync command.", cls=CustomOptionClass, ) def sync(**kwargs): diff --git a/tests/unit/test_minimize_reads_type_scoped.py b/tests/unit/test_minimize_reads_type_scoped.py index 65179b991..00b4f0359 100644 --- a/tests/unit/test_minimize_reads_type_scoped.py +++ b/tests/unit/test_minimize_reads_type_scoped.py @@ -5,9 +5,8 @@ """Tests for PR 2: type-scoped loading via --minimize-reads flag (type-scoped strategy).""" -import json from pathlib import Path -from unittest.mock import MagicMock, call, patch +from unittest.mock import MagicMock, patch import pytest from click.testing import CliRunner @@ -156,11 +155,17 @@ def test_s3_get_type_scoped_uses_per_type_prefix(self): mock_client.list_objects_v2.return_value = {"IsTruncated": False} from datadog_sync.utils.storage.aws_s3_bucket import AWSS3Bucket + backend = AWSS3Bucket( source_resources_path="resources/source", destination_resources_path="resources/destination", - config={"aws_bucket_name": "test-bucket", "aws_region_name": "us-east-1", - "aws_access_key_id": "", "aws_secret_access_key": "", "aws_session_token": ""}, + config={ + "aws_bucket_name": "test-bucket", + "aws_region_name": "us-east-1", + "aws_access_key_id": "", + "aws_secret_access_key": "", + "aws_session_token": "", + }, ) backend.get(Origin.SOURCE, resource_types=["roles"]) @@ -178,11 +183,17 @@ def test_s3_get_broad_prefix_when_no_resource_types(self): mock_client.list_objects_v2.return_value = {"IsTruncated": False} from datadog_sync.utils.storage.aws_s3_bucket import AWSS3Bucket + backend = AWSS3Bucket( source_resources_path="resources/source", destination_resources_path="resources/destination", - config={"aws_bucket_name": "test-bucket", "aws_region_name": "us-east-1", - "aws_access_key_id": "", "aws_secret_access_key": "", "aws_session_token": ""}, + config={ + "aws_bucket_name": "test-bucket", + "aws_region_name": "us-east-1", + "aws_access_key_id": "", + "aws_secret_access_key": "", + "aws_session_token": "", + }, ) backend.get(Origin.SOURCE, resource_types=None) From 2c24451e4482b6cea627d9f6dd4c8dd63187816b Mon Sep 17 00:00:00 2001 From: "michael.richey" Date: Fri, 24 Apr 2026 14:03:24 -0400 Subject: [PATCH 3/3] fix(sync): address PR #544 review comments - Add UsageError when --minimize-reads combined with --cleanup (data-loss guard) - Clean up dev-note comments: 'MODIFIED' and 'PR 3' references removed - Add test_minimize_reads_cannot_be_combined_with_cleanup Co-Authored-By: Claude Sonnet 4.6 --- datadog_sync/utils/configuration.py | 4 +++- datadog_sync/utils/state.py | 2 +- tests/unit/test_minimize_reads_type_scoped.py | 17 +++++++++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/datadog_sync/utils/configuration.py b/datadog_sync/utils/configuration.py index e389f3978..a85f4274d 100644 --- a/datadog_sync/utils/configuration.py +++ b/datadog_sync/utils/configuration.py @@ -314,8 +314,10 @@ def build_config(cmd: Command, **kwargs: Optional[Any]) -> Configuration: raise click.UsageError("--minimize-reads requires --resource-per-file") if not kwargs.get("resources", None): raise click.UsageError("--minimize-reads requires --resources") + if kwargs.get("cleanup") and kwargs["cleanup"].lower() in ("true", "force"): + raise click.UsageError("--minimize-reads cannot be combined with --cleanup") - # Determine type-scoped loading (ID-targeted loading added in PR 3) + # Determine loading strategy for minimize-reads _state_resource_types = None # None = full load (existing behavior) if minimize_reads and (rs := kwargs.get("resources", None)): _state_resource_types = [r.strip().lower() for r in rs.split(",") if r.strip()] diff --git a/datadog_sync/utils/state.py b/datadog_sync/utils/state.py index f88fce56e..8ae28df09 100644 --- a/datadog_sync/utils/state.py +++ b/datadog_sync/utils/state.py @@ -78,7 +78,7 @@ def destination(self): return self._data.destination def load_state(self, origin: Origin = Origin.ALL) -> None: - # MODIFIED: resource_types param added. None = full load (existing behavior). + # resource_types=None → load all types (default behavior) self._data = self._storage.get(origin, resource_types=self._resource_types) def dump_state(self, origin: Origin = Origin.ALL) -> None: diff --git a/tests/unit/test_minimize_reads_type_scoped.py b/tests/unit/test_minimize_reads_type_scoped.py index 00b4f0359..40d263ecf 100644 --- a/tests/unit/test_minimize_reads_type_scoped.py +++ b/tests/unit/test_minimize_reads_type_scoped.py @@ -143,6 +143,23 @@ def test_minimize_reads_not_available_on_import_command(self, runner): assert result.exit_code != 0 assert "no such option" in result.output.lower() + def test_minimize_reads_cannot_be_combined_with_cleanup(self, runner): + """--minimize-reads + --cleanup must be rejected before any I/O.""" + result = runner.invoke( + cli, + [ + "sync", + "--minimize-reads", + "--resource-per-file", + "--resources=roles", + "--cleanup=Force", + "--source-api-key=x", + "--destination-api-key=y", + ], + ) + assert result.exit_code != 0 + assert "cleanup" in (result.output + str(result.exception)).lower() + class TestS3TypeScopedGet: def test_s3_get_type_scoped_uses_per_type_prefix(self):