From 648d0a9b1ffe736b265c40e91b4a74c05434bca3 Mon Sep 17 00:00:00 2001 From: Kshitij Chawla Date: Tue, 2 Jul 2024 14:59:59 +0530 Subject: [PATCH 01/14] Bug 3323988: Regex fix and indices correction for model download --- sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_constants.py | 2 +- sdk/ml/azure-ai-ml/azure/ai/ml/_utils/_storage_utils.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_constants.py b/sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_constants.py index e901753d2f1d..5b0b2e99e5a0 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_constants.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_constants.py @@ -41,7 +41,7 @@ GEN2_STORAGE_CLIENT_NAME = "Gen2StorageClient" DEFAULT_CONNECTION_TIMEOUT = 14400 STORAGE_URI_REGEX = ( - r"(https:\/\/([a-zA-Z0-9@:%_\\\-+~#?&=]+)[a-zA-Z0-9@:%._\\\-+~#?&=]+\.?)\/([a-zA-Z0-9@:%._\\\-+~#?&=]+)\/(.*)" + r"(https:\/\/([a-zA-Z0-9@:%_\\\-+~#?&=]+)[a-zA-Z0-9@:%._\\\-+~#?&=]+\.?)\/([a-zA-Z0-9@:%._\\\-+~#?&=]+)\/?(.*)" ) WORKSPACE_MANAGED_DATASTORE_WITH_SLASH = "azureml://datastores/workspacemanageddatastore/" diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/_utils/_storage_utils.py b/sdk/ml/azure-ai-ml/azure/ai/ml/_utils/_storage_utils.py index 9d6fe6983ad6..622a3d8ad186 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/_utils/_storage_utils.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/_utils/_storage_utils.py @@ -191,7 +191,10 @@ def get_ds_name_and_path_prefix(asset_uri: str, registry_name: Optional[str] = N if registry_name: try: split_paths = re.findall(STORAGE_URI_REGEX, asset_uri) - path_prefix = split_paths[0][3] + if split_paths[0][3] == "": + path_prefix = split_paths[0][2] + else: + path_prefix = split_paths[0][3] except Exception as e: msg = "Registry asset URI could not be parsed." 
raise MlException(message=msg, no_personal_data_message=msg) from e From f1250535e0904a7a3c6f518fcf28b28a39cf2021 Mon Sep 17 00:00:00 2001 From: Kshitij Chawla Date: Wed, 17 Jul 2024 12:52:56 +0530 Subject: [PATCH 02/14] fixing test case --- .../unittests/test_storage_utils.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/sdk/ml/azure-ai-ml/tests/internal_utils/unittests/test_storage_utils.py b/sdk/ml/azure-ai-ml/tests/internal_utils/unittests/test_storage_utils.py index 19a4c71e7c49..9b2a01327b36 100644 --- a/sdk/ml/azure-ai-ml/tests/internal_utils/unittests/test_storage_utils.py +++ b/sdk/ml/azure-ai-ml/tests/internal_utils/unittests/test_storage_utils.py @@ -1,20 +1,15 @@ -import pytest - -from azure.ai.ml._utils._storage_utils import get_ds_name_and_path_prefix from pathlib import Path from unittest.mock import Mock, patch +import pytest + from azure.ai.ml._scope_dependent_operations import OperationConfig, OperationScope +from azure.ai.ml._utils._storage_utils import get_ds_name_and_path_prefix from azure.ai.ml.entities._assets import Code, Data, Environment, Model from azure.ai.ml.entities._assets._artifacts.artifact import ArtifactStorageInfo -from azure.ai.ml.operations import ( - DataOperations, - DatastoreOperations, - EnvironmentOperations, - ModelOperations, -) -from azure.ai.ml.operations._code_operations import CodeOperations from azure.ai.ml.exceptions import ErrorTarget +from azure.ai.ml.operations import DataOperations, DatastoreOperations, EnvironmentOperations, ModelOperations +from azure.ai.ml.operations._code_operations import CodeOperations @pytest.fixture @@ -125,7 +120,7 @@ def test_storage_uri_to_prefix( def test_storage_uri_to_prefix_malformed( self, ) -> None: - reg_uri_bad = "https://ccccccccddd4512d.blob.core.windows.net/5823bbb4-bb28-497c-b9f2-1ff3a0778b10" + reg_uri_bad = "https://ccccccccddd4512d.blob.core.windows.net" workspace_uri_bad = "azureml://subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/000000000000000/workspaces/some_test_3/datastores/workspaceblobstore/path/LocalUpload/26960525964086056a7301dd061fb9be/lightgbm_mlflow_model" with pytest.raises(Exception) as e: From 921e80593682779185f8d0fd17bb849fcc920c2e Mon Sep 17 00:00:00 2001 From: Kshitij Chawla Date: Tue, 23 Jul 2024 16:27:48 +0530 Subject: [PATCH 03/14] adding mlflow tracking uri func --- .../ai/ml/entities/_workspace/workspace.py | 25 +++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/workspace.py b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/workspace.py index 5dc635b5962b..cb9554e7f355 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/workspace.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/workspace.py @@ -28,7 +28,13 @@ from azure.ai.ml.entities._resource import Resource from azure.ai.ml.entities._util import find_field_in_override, load_from_dict from azure.ai.ml.entities._workspace.serverless_compute import ServerlessComputeSettings -from azure.ai.ml.exceptions import ErrorCategory, ErrorTarget, ValidationErrorType, ValidationException +from azure.ai.ml.exceptions import ( + ErrorCategory, + ErrorTarget, + UserErrorException, + ValidationErrorType, + ValidationException, +) from .customer_managed_key import CustomerManagedKey from .feature_store_settings import FeatureStoreSettings @@ -222,7 +228,22 @@ def mlflow_tracking_uri(self) -> Optional[str]: :return: Returns mlflow tracking uri of the workspace. 
:rtype: str """ - return self._mlflow_tracking_uri + # if _with_auth: + # module_logger.warning( + # "'_with_auth' is deprecated and will be removed in a future release. ") + + try: + from azureml.mlflow import get_mlflow_tracking_uri_v2 + + return get_mlflow_tracking_uri_v2(self) + except ImportError as e: + error_msg = ( + "azureml.mlflow could not be imported. " + "Please ensure that 'azureml-mlflow' has been installed in the current python environment." + ) + raise UserErrorException(error_msg) from e + + # return self._mlflow_tracking_uri def dump(self, dest: Union[str, PathLike, IO[AnyStr]], **kwargs: Any) -> None: """Dump the workspace spec into a file in yaml format. From e6472046650dae6e8f2d00ae6dc5c7e2730bfb3a Mon Sep 17 00:00:00 2001 From: Kshitij Chawla Date: Fri, 2 Aug 2024 13:57:34 +0530 Subject: [PATCH 04/14] passing service context to azureml mlflow --- .../entities/_workspace/_ai_workspaces/hub.py | 4 +- .../ai/ml/entities/_workspace/workspace.py | 42 ++++++++----------- .../operations/_workspace_operations_base.py | 18 +++++--- 3 files changed, 33 insertions(+), 31 deletions(-) diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/_ai_workspaces/hub.py b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/_ai_workspaces/hub.py index 085089094876..58a753d5df1b 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/_ai_workspaces/hub.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/_ai_workspaces/hub.py @@ -129,11 +129,11 @@ def _get_schema_class(cls): return HubSchema @classmethod - def _from_rest_object(cls, rest_obj: RestWorkspace) -> Optional["Hub"]: + def _from_rest_object(cls, rest_obj: RestWorkspace, v2_service_context: Optional[object]) -> Optional["Hub"]: if not rest_obj: return None - workspace_object = Workspace._from_rest_object(rest_obj) + workspace_object = Workspace._from_rest_object(rest_obj, v2_service_context) default_resource_group = None diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/workspace.py b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/workspace.py index cb9554e7f355..0d337167e1b9 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/workspace.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/workspace.py @@ -4,6 +4,7 @@ # pylint: disable=too-many-instance-attributes +import warnings from os import PathLike from pathlib import Path from typing import IO, Any, AnyStr, Dict, List, Optional, Tuple, Type, Union @@ -28,13 +29,7 @@ from azure.ai.ml.entities._resource import Resource from azure.ai.ml.entities._util import find_field_in_override, load_from_dict from azure.ai.ml.entities._workspace.serverless_compute import ServerlessComputeSettings -from azure.ai.ml.exceptions import ( - ErrorCategory, - ErrorTarget, - UserErrorException, - ValidationErrorType, - ValidationException, -) +from azure.ai.ml.exceptions import ErrorCategory, ErrorTarget, ValidationErrorType, ValidationException from .customer_managed_key import CustomerManagedKey from .feature_store_settings import FeatureStoreSettings @@ -228,22 +223,9 @@ def mlflow_tracking_uri(self) -> Optional[str]: :return: Returns mlflow tracking uri of the workspace. :rtype: str """ - # if _with_auth: - # module_logger.warning( - # "'_with_auth' is deprecated and will be removed in a future release. ") - - try: - from azureml.mlflow import get_mlflow_tracking_uri_v2 - - return get_mlflow_tracking_uri_v2(self) - except ImportError as e: - error_msg = ( - "azureml.mlflow could not be imported. 
" - "Please ensure that 'azureml-mlflow' has been installed in the current python environment." - ) - raise UserErrorException(error_msg) from e + # TODO: To check with Amit the use of this function - # return self._mlflow_tracking_uri + return self._mlflow_tracking_uri def dump(self, dest: Union[str, PathLike, IO[AnyStr]], **kwargs: Any) -> None: """Dump the workspace spec into a file in yaml format. @@ -336,7 +318,7 @@ def _load( return result @classmethod - def _from_rest_object(cls, rest_obj: RestWorkspace) -> Optional["Workspace"]: + def _from_rest_object(cls, rest_obj: RestWorkspace, v2_service_context: Optional[object]) -> Optional["Workspace"]: if not rest_obj: return None @@ -352,8 +334,20 @@ def _from_rest_object(cls, rest_obj: RestWorkspace) -> Optional["Workspace"]: # TODO: Remove attribute check once Oct API version is out mlflow_tracking_uri = None + if hasattr(rest_obj, "ml_flow_tracking_uri"): - mlflow_tracking_uri = rest_obj.ml_flow_tracking_uri + try: + from azureml.mlflow import get_mlflow_tracking_uri_v2 + + mlflow_tracking_uri = get_mlflow_tracking_uri_v2(rest_obj.ml_flow_tracking_uri, v2_service_context) + except ImportError: + mlflow_tracking_uri = rest_obj.ml_flow_tracking_uri + error_msg = ( + "azureml.mlflow could not be imported. azureml-mlflow will not use credentials passed to `MLClient`" + "Please ensure that latest 'azureml-mlflow' has been installed in the current python environment. " + ) + warnings.warn(error_msg, UserWarning) + # mlflow_tracking_uri = rest_obj.ml_flow_tracking_uri # TODO: Remove once Online Endpoints updates API version to at least 2023-08-01 allow_roleassignment_on_rg = None diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_workspace_operations_base.py b/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_workspace_operations_base.py index cf2ad56dee3a..869628dd901c 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_workspace_operations_base.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_workspace_operations_base.py @@ -34,11 +34,11 @@ from azure.ai.ml._version import VERSION from azure.ai.ml.constants import ManagedServiceIdentityType from azure.ai.ml.constants._common import ( + WORKSPACE_PATCH_REJECTED_KEYS, ArmConstants, LROConfigurations, WorkspaceKind, WorkspaceResourceConstants, - WORKSPACE_PATCH_REJECTED_KEYS, ) from azure.ai.ml.constants._workspace import IsolationMode, OutboundRuleCategory from azure.ai.ml.entities import Hub, Project, Workspace @@ -85,11 +85,19 @@ def get(self, workspace_name: Optional[str] = None, **kwargs: Any) -> Optional[W workspace_name = self._check_workspace_name(workspace_name) resource_group = kwargs.get("resource_group") or self._resource_group_name obj = self._operation.get(resource_group, workspace_name) + v2_service_context = {} + v2_service_context.subscription_id = self._subscription_id + v2_service_context.workspace_name = workspace_name + v2_service_context.resource_group_name = resource_group + v2_service_context.auth = self._credentials + # host_url=service_context._get_mlflow_url(), + # cloud=_get_cloud_or_default( + # service_context.get_auth()._cloud_type.name if obj is not None and obj.kind is not None and obj.kind.lower() == WorkspaceKind.HUB: - return Hub._from_rest_object(obj) + return Hub._from_rest_object(obj, v2_service_context) if obj is not None and obj.kind is not None and obj.kind.lower() == WorkspaceKind.PROJECT: - return Project._from_rest_object(obj) - return Workspace._from_rest_object(obj) + return Project._from_rest_object(obj, v2_service_context) + return 
Workspace._from_rest_object(obj, v2_service_context) def begin_create( self, @@ -418,7 +426,7 @@ def callback(_: Any, deserialized: Any, args: Any) -> Workspace: return ( deserialize_callback(deserialized) if deserialize_callback - else Workspace._from_rest_object(deserialized) + else Workspace._from_rest_object(deserialized, None) ) real_callback = callback From a03c7a3d36198fa437bbb7633f8c219fea0dd59d Mon Sep 17 00:00:00 2001 From: kshitij-microsoft Date: Thu, 5 Sep 2024 12:58:57 +0530 Subject: [PATCH 05/14] final flow APC complete --- .../azure/ai/ml/entities/_workspace/workspace.py | 2 +- .../ml/operations/_workspace_operations_base.py | 15 +++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/workspace.py b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/workspace.py index 0d337167e1b9..bbc395fc36dd 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/workspace.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/workspace.py @@ -339,7 +339,7 @@ def _from_rest_object(cls, rest_obj: RestWorkspace, v2_service_context: Optional try: from azureml.mlflow import get_mlflow_tracking_uri_v2 - mlflow_tracking_uri = get_mlflow_tracking_uri_v2(rest_obj.ml_flow_tracking_uri, v2_service_context) + mlflow_tracking_uri = get_mlflow_tracking_uri_v2(rest_obj, v2_service_context) except ImportError: mlflow_tracking_uri = rest_obj.ml_flow_tracking_uri error_msg = ( diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_workspace_operations_base.py b/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_workspace_operations_base.py index 895f55cf7ed5..9743314a50a5 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_workspace_operations_base.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_workspace_operations_base.py @@ -86,10 +86,17 @@ def get(self, workspace_name: Optional[str] = None, **kwargs: Any) -> Optional[W resource_group = kwargs.get("resource_group") or self._resource_group_name obj = self._operation.get(resource_group, workspace_name) v2_service_context = {} - v2_service_context.subscription_id = self._subscription_id - v2_service_context.workspace_name = workspace_name - v2_service_context.resource_group_name = resource_group - v2_service_context.auth = self._credentials + + v2_service_context["subscription_id"] = self._subscription_id + v2_service_context["workspace_name"] = workspace_name + v2_service_context["resource_group_name"] = resource_group + v2_service_context["auth"] = self._credentials + + from urllib.parse import urlparse + parsed_url = urlparse(obj.ml_flow_tracking_uri) + host_url = parsed_url.netloc + v2_service_context['host_url'] = host_url + # host_url=service_context._get_mlflow_url(), # cloud=_get_cloud_or_default( # service_context.get_auth()._cloud_type.name From 3dbb2d2b3b1b56e17959dd4e662fe0fbad09d421 Mon Sep 17 00:00:00 2001 From: kshitij-microsoft Date: Wed, 25 Sep 2024 17:21:06 +0530 Subject: [PATCH 06/14] modify host_url --- sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/workspace.py | 2 +- .../azure/ai/ml/operations/_workspace_operations_base.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/workspace.py b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/workspace.py index ef4d8fbbd791..0157d84f9004 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/workspace.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/workspace.py @@ -317,7 +317,7 @@ def _load( return result 
@classmethod - def _from_rest_object(cls, rest_obj: RestWorkspace, v2_service_context: Optional[object]) -> Optional["Workspace"]: + def _from_rest_object(cls, rest_obj: RestWorkspace, v2_service_context: Optional[object] = None) -> Optional["Workspace"]: if not rest_obj: return None diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_workspace_operations_base.py b/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_workspace_operations_base.py index 4ef0e5314381..894c500c47f3 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_workspace_operations_base.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_workspace_operations_base.py @@ -95,7 +95,7 @@ def get(self, workspace_name: Optional[str] = None, **kwargs: Any) -> Optional[W from urllib.parse import urlparse parsed_url = urlparse(obj.ml_flow_tracking_uri) - host_url = parsed_url.netloc + host_url = "https://{}".format(parsed_url.netloc) v2_service_context['host_url'] = host_url # host_url=service_context._get_mlflow_url(), From 58a96dc6662fa500115e744b90d983de28faa167 Mon Sep 17 00:00:00 2001 From: kshitij-microsoft Date: Fri, 27 Sep 2024 18:07:16 +0530 Subject: [PATCH 07/14] fixing unit test cases --- .../entities/_feature_store/feature_store.py | 4 +++- .../entities/_workspace/_ai_workspaces/hub.py | 2 +- .../ai/ml/entities/_workspace/workspace.py | 6 +++--- .../operations/_workspace_operations_base.py | 20 +++++++++++-------- .../unittests/test_workspace_operations.py | 12 ++++++++++- .../test_workspace_operations_base.py | 10 ++++++++++ 6 files changed, 40 insertions(+), 14 deletions(-) diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_feature_store/feature_store.py b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_feature_store/feature_store.py index a6db2c8a4f8b..62242986ca2a 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_feature_store/feature_store.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_feature_store/feature_store.py @@ -162,7 +162,9 @@ def __init__( self.managed_network = managed_network @classmethod - def _from_rest_object(cls, rest_obj: RestWorkspace) -> Optional["FeatureStore"]: + def _from_rest_object( + cls, rest_obj: RestWorkspace, v2_service_context: Optional[object] = None + ) -> Optional["FeatureStore"]: if not rest_obj: return None diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/_ai_workspaces/hub.py b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/_ai_workspaces/hub.py index 069602ec5a66..3bae3f8d58b5 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/_ai_workspaces/hub.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/_ai_workspaces/hub.py @@ -131,7 +131,7 @@ def _get_schema_class(cls): return HubSchema @classmethod - def _from_rest_object(cls, rest_obj: RestWorkspace, v2_service_context: Optional[object]) -> Optional["Hub"]: + def _from_rest_object(cls, rest_obj: RestWorkspace, v2_service_context: Optional[object] = None) -> Optional["Hub"]: if not rest_obj: return None diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/workspace.py b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/workspace.py index 0157d84f9004..d865cc1d925e 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/workspace.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/workspace.py @@ -222,8 +222,6 @@ def mlflow_tracking_uri(self) -> Optional[str]: :return: Returns mlflow tracking uri of the workspace. 
:rtype: str """ - # TODO: To check with Amit the use of this function - return self._mlflow_tracking_uri def dump(self, dest: Union[str, PathLike, IO[AnyStr]], **kwargs: Any) -> None: @@ -317,7 +315,9 @@ def _load( return result @classmethod - def _from_rest_object(cls, rest_obj: RestWorkspace, v2_service_context: Optional[object] = None) -> Optional["Workspace"]: + def _from_rest_object( + cls, rest_obj: RestWorkspace, v2_service_context: Optional[object] = None + ) -> Optional["Workspace"]: if not rest_obj: return None diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_workspace_operations_base.py b/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_workspace_operations_base.py index 894c500c47f3..528c35d92180 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_workspace_operations_base.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_workspace_operations_base.py @@ -87,17 +87,21 @@ def get(self, workspace_name: Optional[str] = None, **kwargs: Any) -> Optional[W resource_group = kwargs.get("resource_group") or self._resource_group_name obj = self._operation.get(resource_group, workspace_name) v2_service_context = {} - + v2_service_context["subscription_id"] = self._subscription_id v2_service_context["workspace_name"] = workspace_name v2_service_context["resource_group_name"] = resource_group - v2_service_context["auth"] = self._credentials - + v2_service_context["auth"] = self._credentials # type: ignore + from urllib.parse import urlparse - parsed_url = urlparse(obj.ml_flow_tracking_uri) - host_url = "https://{}".format(parsed_url.netloc) - v2_service_context['host_url'] = host_url - + + if obj is not None and obj.ml_flow_tracking_uri: + parsed_url = urlparse(obj.ml_flow_tracking_uri) + host_url = "https://{}".format(parsed_url.netloc) + v2_service_context["host_url"] = host_url + else: + v2_service_context["host_url"] = "" + # host_url=service_context._get_mlflow_url(), # cloud=_get_cloud_or_default( # service_context.get_auth()._cloud_type.name @@ -436,7 +440,7 @@ def callback(_: Any, deserialized: Any, args: Any) -> Workspace: return ( deserialize_callback(deserialized) if deserialize_callback - else Workspace._from_rest_object(deserialized, None) + else Workspace._from_rest_object(deserialized) ) real_callback = callback diff --git a/sdk/ml/azure-ai-ml/tests/workspace/unittests/test_workspace_operations.py b/sdk/ml/azure-ai-ml/tests/workspace/unittests/test_workspace_operations.py index 0f9b326e3a72..06664f6581d4 100644 --- a/sdk/ml/azure-ai-ml/tests/workspace/unittests/test_workspace_operations.py +++ b/sdk/ml/azure-ai-ml/tests/workspace/unittests/test_workspace_operations.py @@ -1,5 +1,5 @@ from typing import Optional -from unittest.mock import ANY, DEFAULT, MagicMock, Mock +from unittest.mock import ANY, DEFAULT, MagicMock, Mock, patch from uuid import UUID, uuid4 import pytest @@ -20,6 +20,7 @@ ) from azure.ai.ml.operations import WorkspaceOperations from azure.core.polling import LROPoller +import urllib.parse @pytest.fixture @@ -27,6 +28,15 @@ def mock_credential() -> Mock: yield Mock() +def mock_urlparse(url: str) -> urllib.parse.ParseResult: + return urllib.parse.ParseResult( + scheme="http", netloc="example.com", path="/index.html", params="", query="a=1&b=2", fragment="" + ) + + +urllib.parse.urlparse = mock_urlparse + + @pytest.fixture def mock_workspace_operation( mock_workspace_scope: OperationScope, diff --git a/sdk/ml/azure-ai-ml/tests/workspace/unittests/test_workspace_operations_base.py 
b/sdk/ml/azure-ai-ml/tests/workspace/unittests/test_workspace_operations_base.py index 885560e0f5dd..b52d910bd4b0 100644 --- a/sdk/ml/azure-ai-ml/tests/workspace/unittests/test_workspace_operations_base.py +++ b/sdk/ml/azure-ai-ml/tests/workspace/unittests/test_workspace_operations_base.py @@ -31,6 +31,7 @@ ) from azure.ai.ml.operations._workspace_operations_base import WorkspaceOperationsBase from azure.core.polling import LROPoller +import urllib.parse @pytest.fixture @@ -38,6 +39,15 @@ def mock_credential() -> Mock: yield Mock() +def mock_urlparse(url: str) -> urllib.parse.ParseResult: + return urllib.parse.ParseResult( + scheme="http", netloc="example.com", path="/index.html", params="", query="a=1&b=2", fragment="" + ) + + +urllib.parse.urlparse = mock_urlparse + + @pytest.fixture def mock_workspace_operation_base( mock_workspace_scope: OperationScope, From 0ab7bc47752ce4daabb56dd7407862e9d6d9d1d6 Mon Sep 17 00:00:00 2001 From: kshitij-microsoft Date: Wed, 2 Oct 2024 00:44:51 +0530 Subject: [PATCH 08/14] changing mock for urlparse --- .../unittests/test_mocked_operations.py | 3 ++- .../unittests/test_workspace_operations.py | 18 +++++++----------- .../test_workspace_operations_base.py | 18 ++++++++---------- 3 files changed, 17 insertions(+), 22 deletions(-) diff --git a/sdk/ml/azure-ai-ml/tests/workspace/ai_workspaces/unittests/test_mocked_operations.py b/sdk/ml/azure-ai-ml/tests/workspace/ai_workspaces/unittests/test_mocked_operations.py index ed567e3429df..6e98f1deaefd 100644 --- a/sdk/ml/azure-ai-ml/tests/workspace/ai_workspaces/unittests/test_mocked_operations.py +++ b/sdk/ml/azure-ai-ml/tests/workspace/ai_workspaces/unittests/test_mocked_operations.py @@ -42,7 +42,8 @@ def test_list(self, arg: str, mock_hub_operation: WorkspaceOperations) -> None: else: mock_hub_operation._operation.list_by_resource_group.assert_called_once() - def test_get(self, mock_hub_operation: WorkspaceOperations) -> None: + def test_get(self, mock_hub_operation: WorkspaceOperations, mocker: MockFixture) -> None: + mocker.patch("urllib.parse.urlparse") mock_hub_operation.get(name="random_name") mock_hub_operation._operation.get.assert_called_once() diff --git a/sdk/ml/azure-ai-ml/tests/workspace/unittests/test_workspace_operations.py b/sdk/ml/azure-ai-ml/tests/workspace/unittests/test_workspace_operations.py index 06664f6581d4..97e43a1cc1c5 100644 --- a/sdk/ml/azure-ai-ml/tests/workspace/unittests/test_workspace_operations.py +++ b/sdk/ml/azure-ai-ml/tests/workspace/unittests/test_workspace_operations.py @@ -28,15 +28,6 @@ def mock_credential() -> Mock: yield Mock() -def mock_urlparse(url: str) -> urllib.parse.ParseResult: - return urllib.parse.ParseResult( - scheme="http", netloc="example.com", path="/index.html", params="", query="a=1&b=2", fragment="" - ) - - -urllib.parse.urlparse = mock_urlparse - - @pytest.fixture def mock_workspace_operation( mock_workspace_scope: OperationScope, @@ -97,7 +88,8 @@ def test_list(self, arg: str, mock_workspace_operation: WorkspaceOperations) -> else: mock_workspace_operation._operation.list_by_resource_group.assert_called_once() - def test_get(self, mock_workspace_operation: WorkspaceOperations) -> None: + def test_get(self, mock_workspace_operation: WorkspaceOperations, mocker: MockFixture) -> None: + mocker.patch("urllib.parse.urlparse") mock_workspace_operation.get("random_name") mock_workspace_operation._operation.get.assert_called_once() @@ -124,7 +116,8 @@ def test_begin_create( mocker.patch("azure.ai.ml._arm_deployments.ArmDeploymentExecutor.deploy_resource", 
return_value=LROPoller) mock_workspace_operation.begin_create(workspace=Workspace(name="name")) - def test_update(self, mock_workspace_operation: WorkspaceOperations) -> None: + def test_update(self, mock_workspace_operation: WorkspaceOperations, mocker: MockFixture) -> None: + mocker.patch("urllib.parse.urlparse") ws = Workspace( name="name", description="description", @@ -145,6 +138,7 @@ def outgoing_call(rg, name, params, polling, cls): def test_update_with_role_assignemnt( self, mock_workspace_operation: WorkspaceOperations, mocker: MockFixture ) -> None: + mocker.patch("urllib.parse.urlparse") mocker.patch( "azure.ai.ml.operations.WorkspaceOperations._populate_feature_store_role_assignment_parameters", return_value=({}, {}, {}), @@ -173,6 +167,7 @@ def outgoing_call(rg, name, params, polling, cls): mock_workspace_operation._operation.begin_update.assert_called() def test_delete(self, mock_workspace_operation: WorkspaceOperations, mocker: MockFixture) -> None: + mocker.patch("urllib.parse.urlparse") mocker.patch("azure.ai.ml.operations._workspace_operations_base.delete_resource_by_arm_id", return_value=None) mocker.patch( "azure.ai.ml.operations._workspace_operations_base.get_generic_arm_resource_by_arm_id", return_value=None @@ -181,6 +176,7 @@ def test_delete(self, mock_workspace_operation: WorkspaceOperations, mocker: Moc mock_workspace_operation._operation.begin_delete.assert_called_once() def test_purge(self, mock_workspace_operation: WorkspaceOperations, mocker: MockFixture) -> None: + mocker.patch("urllib.parse.urlparse") mocker.patch("azure.ai.ml.operations._workspace_operations_base.delete_resource_by_arm_id", return_value=None) mocker.patch( "azure.ai.ml.operations._workspace_operations_base.get_generic_arm_resource_by_arm_id", return_value=None diff --git a/sdk/ml/azure-ai-ml/tests/workspace/unittests/test_workspace_operations_base.py b/sdk/ml/azure-ai-ml/tests/workspace/unittests/test_workspace_operations_base.py index b52d910bd4b0..c3dcc424e34c 100644 --- a/sdk/ml/azure-ai-ml/tests/workspace/unittests/test_workspace_operations_base.py +++ b/sdk/ml/azure-ai-ml/tests/workspace/unittests/test_workspace_operations_base.py @@ -39,15 +39,6 @@ def mock_credential() -> Mock: yield Mock() -def mock_urlparse(url: str) -> urllib.parse.ParseResult: - return urllib.parse.ParseResult( - scheme="http", netloc="example.com", path="/index.html", params="", query="a=1&b=2", fragment="" - ) - - -urllib.parse.urlparse = mock_urlparse - - @pytest.fixture def mock_workspace_operation_base( mock_workspace_scope: OperationScope, @@ -188,6 +179,8 @@ def test_create_get_exception_swallow( def test_begin_create_existing_ws( self, mock_workspace_operation_base: WorkspaceOperationsBase, mocker: MockFixture ): + mocker.patch("urllib.parse.urlparse") + def outgoing_call(rg, name, params, polling, cls): assert name == "name" return DEFAULT @@ -197,7 +190,8 @@ def outgoing_call(rg, name, params, polling, cls): mock_workspace_operation_base.begin_create(workspace=Workspace(name="name")) mock_workspace_operation_base._operation.begin_update.assert_called() - def test_update(self, mock_workspace_operation_base: WorkspaceOperationsBase) -> None: + def test_update(self, mock_workspace_operation_base: WorkspaceOperationsBase, mocker: MockFixture) -> None: + mocker.patch("urllib.parse.urlparse") ws = Workspace( name="name", tags={"key": "value"}, @@ -254,6 +248,7 @@ def outgoing_call(rg, name, params, polling, cls): def test_update_with_empty_property_values( self, mock_workspace_operation_base: 
WorkspaceOperationsBase, mocker: MockFixture ) -> None: + mocker.patch("urllib.parse.urlparse") ws = Workspace(name="name", description="", display_name="", image_build_compute="") mocker.patch("azure.ai.ml.operations.WorkspaceOperations.get", return_value=ws) @@ -277,6 +272,7 @@ def outgoing_call(rg, name, params, polling, cls): mock_workspace_operation_base._operation.begin_update.assert_called() def test_delete_no_wait(self, mock_workspace_operation_base: WorkspaceOperationsBase, mocker: MockFixture) -> None: + mocker.patch("urllib.parse.urlparse") mocker.patch("azure.ai.ml.operations._workspace_operations_base.delete_resource_by_arm_id", return_value=None) mocker.patch( "azure.ai.ml.operations._workspace_operations_base.get_generic_arm_resource_by_arm_id", return_value=None @@ -285,6 +281,7 @@ def test_delete_no_wait(self, mock_workspace_operation_base: WorkspaceOperations mock_workspace_operation_base._operation.begin_delete.assert_called_once() def test_delete_wait(self, mock_workspace_operation_base: WorkspaceOperationsBase, mocker: MockFixture) -> None: + mocker.patch("urllib.parse.urlparse") mocker.patch("azure.ai.ml.operations._workspace_operations_base.delete_resource_by_arm_id", return_value=None) mocker.patch( "azure.ai.ml.operations._workspace_operations_base.get_generic_arm_resource_by_arm_id", return_value=None @@ -610,6 +607,7 @@ def test_update_workspace_with_serverless_custom_vnet( mock_workspace_operation_base: WorkspaceOperationsBase, mocker: MockFixture, ) -> None: + mocker.patch("urllib.parse.urlparse") ws = Workspace(name="name", location="test", serverless_compute=serverless_compute_settings) spy = mocker.spy(mock_workspace_operation_base._operation, "begin_update") mock_workspace_operation_base.begin_update(ws) From 5a90da5be11ac76adfc147319e39f5cda8a3b07e Mon Sep 17 00:00:00 2001 From: kshitij-microsoft Date: Thu, 24 Oct 2024 16:33:41 +0530 Subject: [PATCH 09/14] fixing the log msg --- .../azure/ai/ml/entities/_workspace/workspace.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/workspace.py b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/workspace.py index d865cc1d925e..4317cb3f338f 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/workspace.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/entities/_workspace/workspace.py @@ -4,7 +4,6 @@ # pylint: disable=too-many-instance-attributes -import warnings from os import PathLike from pathlib import Path from typing import IO, Any, AnyStr, Dict, List, Optional, Tuple, Type, Union @@ -342,11 +341,11 @@ def _from_rest_object( except ImportError: mlflow_tracking_uri = rest_obj.ml_flow_tracking_uri error_msg = ( - "azureml.mlflow could not be imported. azureml-mlflow will not use credentials passed to `MLClient`" - "Please ensure that latest 'azureml-mlflow' has been installed in the current python environment. " + "azureml.mlflow could not be imported. 
" + "Please ensure that latest 'azureml-mlflow' has been installed in the current python environment" ) - warnings.warn(error_msg, UserWarning) - # mlflow_tracking_uri = rest_obj.ml_flow_tracking_uri + print(error_msg) + # warnings.warn(error_msg, UserWarning) # TODO: Remove once Online Endpoints updates API version to at least 2023-08-01 allow_roleassignment_on_rg = None From dc9cba198f83ccd15247efba553c5ff4b08460f6 Mon Sep 17 00:00:00 2001 From: kshitij-microsoft Date: Thu, 12 Dec 2024 14:31:19 +0530 Subject: [PATCH 10/14] first draft : YAML signing --- .../azure/ai/ml/YAMLsigning/command_line.py | 366 ++++++ .../azure/ai/ml/YAMLsigning/config.yml | 35 + .../azure/ai/ml/YAMLsigning/configuration.py | 214 ++++ .../azure/ai/ml/YAMLsigning/utils.py | 129 +++ .../azure/ai/ml/YAMLsigning/yamlSign.py | 1029 +++++++++++++++++ .../ai/ml/YAMLsigning/yamlSignTest/2.yaml | 3 + .../ai/ml/YAMLsigning/yamlSignTest/3.yaml | 5 + .../ai/ml/YAMLsigning/yamlSignTest/4.yaml | 6 + .../ml/YAMLsigning/yamlSignTest/catalog.json | 1 + .../YAMLsigning/yamlSignTest/catalog.json.sig | 1 + .../ai/ml/YAMLsigning/yamlSignTest/spec.yaml | 2 + 11 files changed, 1791 insertions(+) create mode 100644 sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/command_line.py create mode 100644 sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/config.yml create mode 100644 sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/configuration.py create mode 100644 sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/utils.py create mode 100644 sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSign.py create mode 100644 sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/2.yaml create mode 100644 sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/3.yaml create mode 100644 sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/4.yaml create mode 100644 sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/catalog.json create mode 100644 sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/catalog.json.sig create mode 100644 sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/spec.yaml diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/command_line.py b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/command_line.py new file mode 100644 index 000000000000..9f6b00ab2607 --- /dev/null +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/command_line.py @@ -0,0 +1,366 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from abc import ABC, abstractmethod +import base64 +import logging +from omegaconf import OmegaConf +from pathlib import Path +import subprocess +import sys +import os +from typing import Any, Dict, List, Optional, Tuple, Union +from dataclasses import asdict + +# from shrike import __version__ +from configuration import Configuration, load_configuration +# from utils import TelemetryLogger + +log = logging.getLogger(__name__) + + +class _LogEmphasize: + def __init__(self, line: str): + self.line = line + + def __enter__(self): + log.info(self.line) + + def __exit__(self, exc_type, exc_value, traceback): + log.info(self.line) + + +class Command(ABC): + """ + Commands exposed by this package should subclass this class and implement + the `run_with_config` method. They should be invoked by calling + `Subclass().run()` inside their module's `__main__` logic. 
+ """ + + @abstractmethod + def __init__(self) -> None: + self.config: Configuration = None # type: ignore + self._component_statuses: Dict[str, Dict[str, str]] = {} + self._errors: List[str] = [] + self.nb_cores = 1 + + def attach_workspace(self, workspace_id: str = None) -> None: # type: ignore + """ + Run `az ml folder attach` to the configured workspace ID. Default to the + first configured workspace if none is provided. + """ + # self.config.working_directory = "C:\Projects\\azure-sdk-for-python\sdk\ml\\azure-ai-ml\\azure\\ai\ml\YAMLsigning" + working_direcotry = self.config.working_directory + if workspace_id is None: + if not self.config.workspaces and self.config.registries: + workspace_id = self.config.validation_workspace + if not workspace_id: + self.register_error("No workspaces are configured. If you want to publish to registries only, please specify one workspace string in `validation_workspace` for validating components.") + return + else: + try: + workspace_id = self.config.workspaces[0] + except IndexError: + self.register_error( + f"No workspaces are configured. Please include them in your configuration file and ensure the path to your configuration file is correct relative to the working directory {working_direcotry} using `--configuration-file PATH/TO/CONFIGURATION_FILE`." + ) + return + + (subscription_id, resource_group, workspace) = self.parse_workspace_arm_id( + workspace_id + ) + success = self.execute_azure_cli_command( + f"account set --subscription {subscription_id}" + ) + dir = "C:\Projects\\azure-sdk-for-python\sdk\ml\\azure-ai-ml\\azure\\ai\ml\YAMLsigning" + print(working_direcotry, dir) + success = success and self.execute_azure_cli_command( + f"ml data create --name dataSource --path {dir} --type uri_folder -w {workspace} -g {resource_group}" + # f"ml folder attach --workspace-name {workspace} --resource-group {resource_group} --debug" # TODO: command modified for v2 + ) + if not success: + self.register_error(f"Error!! Failed to attach to {workspace_id}!") + + def display_all_statuses(self) -> None: + """ + Display all component statuses in an easily readable format. + """ + pass + + def emphasize(self, line: str = "#" * 80) -> _LogEmphasize: + """ + Use this to initialize a `with` block for emphasizing any logs inside + that block. + """ + return _LogEmphasize(line) + + def ensure_component_cli_installed(self) -> bool: + """ + Check if the component CLI is installed; + install it if not. + # TODO get cli version as config. + """ + + # Check whether the component CLI is installed + component_cli_exists = self.execute_azure_cli_command( + "extension show -n ml", + stderr_is_failure=False, + log_error=False, + ) + + if component_cli_exists: + log.info("component CLI exists. Skipping installation.") + return True + else: + log.info( + f"installing component CLI version {self.config.component_cli_version}." 
+ ) + cli_install_command = f"extension add --name ml" + # cli_install_command = f"extension add --source https://azuremlsdktestpypi.blob.core.windows.net/wheels/componentsdk/azure_cli_ml-{self.config.component_cli_version}-py3-none-any.whl --pip-extra-index-urls https://azuremlsdktestpypi.azureedge.net/componentsdk/{self.config.component_cli_version} --yes" # TODO: command modified for v2 + if self.config.verbose: + cli_install_command += " --verbose" + + is_installed = self.execute_azure_cli_command( + command=cli_install_command, + # installation may show time to install + stderr_is_failure=False, + ) + + if is_installed: + log.info("component CLI is installed.") + else: + log.error("component CLI installation failed.") + + return is_installed + + def execute_azure_cli_command( + self, + command: str, + working_dir: Optional[str] = None, + stderr_is_failure: bool = True, + fail_if_version_exists: bool = False, + log_error: bool = True, + ) -> bool: + """ + Use this method, NOT `execute_command`, for running Azure CLI commands. + The `command` string should contain everything AFTER the `az`. + + This does NOT use the `azure-cli-core` Python package + ( https://stackoverflow.com/a/55960725 ) because it takes a long time + to install, and does not work in Windows. + + This method is necessary for subtle reasons around the way Azure CLI + exposes commands. The "naive approach" doesn't work. + """ + log.debug(f"Executing: az {command}") + az_command_bytes = bytes(f"az {command}", "utf-16le") + az_command_b64 = base64.b64encode(az_command_bytes).decode("ascii") + pwsh_command = ["pwsh", "-EncodedCommand", az_command_b64] + success = self.execute_command( + pwsh_command, working_dir, stderr_is_failure, fail_if_version_exists, log_error + ) + return success + + def execute_command( + self, + command: List[str], + working_dir: Optional[str] = None, + stderr_is_failure: bool = True, + fail_if_version_exists: bool = False, + log_error: bool = True, + ) -> bool: + """ + Execute the provided shell command using the configured timeout. Working + directory defaults to the configured one. If `stderr_is_failure` is + set to false, stderr from the command will be converted to "vanilla" + logs and will not affect success; + + Logs are NOT streamed realtime - they are "bundled together" after the + command executes or times out. + + Warning: running `az *` naively via this function will not work, since + the Azure CLI is not, by default, discoverable via `subprocess.run`. + """ + if working_dir is None: + working_dir = self.config.working_directory + + if len(command) > 0 and command[0] == "az": + raise ValueError( + "Do not run Azure CLI commands with this function. Use execute_azure_cli_command instead." 
+ ) + + kwargs = {} + + if stderr_is_failure or fail_if_version_exists: + kwargs["stderr"] = subprocess.PIPE + + log.debug(f"Executing {command} in {working_dir}") + + timeout = self.config.shell_command_timeout_in_seconds + + try: + res = subprocess.run( + args=command, + cwd=working_dir, + stdout=subprocess.PIPE, + timeout=timeout, + **kwargs, + ) + + success = res.returncode == 0 + + if not success: + if log_error: + log.error(f"Command failed with exit code {res.returncode}") + else: + log.info(f"Command failed with exit code {res.returncode}") + + stdout = res.stdout + stderr = res.stderr + except subprocess.TimeoutExpired as e: + log.error(f"Command timed out after {timeout} seconds.") + success = False + stdout = e.stdout + stderr = e.stderr + + if stdout: + for line in stdout.splitlines(): + try: + line = str(line, encoding="utf-8", errors="ignore") # type: ignore + except: + log.debug( + "Failed to convert the following stdout line into String (utf-8)" + ) + log.info(line) + if stderr: + for line in stderr.splitlines(): + try: + line = str(line, encoding="utf-8", errors="ignore") # type: ignore + except: + log.debug( + "Failed to convert the following stdout line into String (utf-8)" + ) + if stderr_is_failure: + log.error(line) + success = False + elif fail_if_version_exists and "Error" in line and "already exists in" in line: # type: ignore + log.error(line) + success = False + else: + log.info(line) + + return success + + def normalize_path(self, path: Union[str, Path], directory=False) -> str: + """ + Normalize the provided path (file or directory) to the following format: + - Absolute (not relative) + - Linux-style (forward slash separating directories) + - If `directory=True`, ending in a forward slash. + """ + if isinstance(path, str): + path = Path(path) + + path = str(path.absolute()) + rv = path.replace("\\", "/") + + if directory and not rv[-1] == "/": + rv += "/" + + return rv + + def parse_workspace_arm_id(self, id: str) -> Tuple[str, str, str]: + """ + Parse a workspace ARM ID like + `/subscriptions/48bbc269-ce89-4f6f-9a12-c6f91fcb772d/resourceGroups/aml1p-rg/providers/Microsoft.MachineLearningServices/workspaces/aml1p-ml-wus2` + and return (subscription ID, resource group, workspace name). + """ + split = id.split("/") + subscription = split[2] + resource_group = split[4] + workspace = split[8] + return (subscription, resource_group, workspace) + + def register_component_status( + self, component_name: str, status_name: str, status: str + ) -> None: + """ + Register a status (e.g., build = failed) for a specified component. All + statuses will be displayed in a friendly manner before exiting. + """ + if component_name not in self._component_statuses: + self._component_statuses[component_name] = {} + + status_dict = self._component_statuses[component_name] + status_dict[status_name] = status + + def register_error(self, error: str) -> None: + """ + Register that an error has occured (also, log it). If any errors have + been registered, the `run` method will return with non-zero exit code. 
+ """ + log.error(error) + self._errors.append(error) + + # def telemetry_logging(self, command: str) -> None: + # """ + # Log the telemetry information in the Azure Application Insights + # """ + # telemetry_logger = TelemetryLogger( + # enable_telemetry=not self.config.disable_telemetry + # ) + # telemetry_logger.log_trace( + # message=f"shrike.build=={__version__}: {command}", + # properties={ + # "custom_dimensions": {"configuration": str(asdict(self.config))} + # }, + # ) + + def run(self) -> None: + """ + Call this to load the configuration object, initialize the logging tree, + then invoke the subclass' `run_with_config` method and return the + appropriate exit code. This should be the entrypoint inside a command's + `if __name__ == "__main__"` block. + """ + config = load_configuration() + + log_level = "DEBUG" if config.verbose else "INFO" + logging.basicConfig(level=log_level, format=config.log_format) + + max_nb_cores = max(os.cpu_count() - 1, 1) # type: ignore + if config.number_of_cores_parallel <= 0 or config.number_of_cores_parallel > max_nb_cores: + self.nb_cores = max_nb_cores + else: + self.nb_cores = config.number_of_cores_parallel + + with self.emphasize(): + config_yaml = OmegaConf.to_yaml(config) + log.info("Final configuration being used:\n") + log.info(config_yaml) + + self.config = config + self.run_with_config() + + self.display_all_statuses() + + failed = bool(self._errors) + + if failed: + log.error(f"Encountered {len(self._errors)} errors!") + + sys.exit(bool(self._errors)) + + @abstractmethod + def run_with_config(self) -> None: + """ + Run the subclasses command with the specified configuration object. + Before this method is invoked, there is no guarantee that `self.config` + will be populated; after it is invoked, that is guaranteed. + Implementations of this method should NOT mutate the logging tree in + any way. They should also NOT raise any exceptions; rather they should + call the `register_error` method, which will ensure non-zero exit code. + Implementations can raise specific "status information" (e.g., a + component is not "active") by calling `register_component_status`. 
+ """ + pass diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/config.yml b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/config.yml new file mode 100644 index 000000000000..8883f0ed053c --- /dev/null +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/config.yml @@ -0,0 +1,35 @@ +# signing_mode: aml +# activation_method: all +# compliant_branch: ^refs/heads/main$ +# component_specification_glob: 'steps/**/module_spec.yaml' +# log_format: '[%(name)s] [%(levelname)s] - %(message)s' +# workspaces: +# - /subscriptions/2d385bf4-0756-4a76-aa95-28bf9ed3b625/resourceGroups/sdkv2-20240925-rg/providers/Microsoft.MachineLearningServices/workspaces/sdkv2-20240925-ws +# fail_if_version_exists: False +# use_build_number: False +# APPLICATIONINSIGHTS_INSTRUMENTATIONKEY: 'fb916da6-f377-4116-b81b-b103b1e4fb26' + +# Define environment variables +# environment: +# BUILD_SOURCEBRANCH: $(Build.SourceBranch) +# BUILD_BUILDID: $(Build.BuildId) +# BUILD_DEFINITIONNAME: $(Build.DefinitionName) + + +activation_method: all +compliant_branch: ^refs/heads/develop$ +component_specification_glob: '**/*.yaml' +log_format: '[%(name)s][%(levelname)s] - %(message)s' +signing_mode: aml +workspaces: + - /subscriptions/2d385bf4-0756-4a76-aa95-28bf9ed3b625/resourceGroups/sdkv2-20240925-rg/providers/Microsoft.MachineLearningServices/workspaces/sdkv2-20240925-ws +allow_duplicate_versions: True +use_build_number: False +working_directory: 'C:\Projects\azure-sdk-for-python\sdk\ml\azure-ai-ml\azure\ai\ml\YAMLsigning\yamlSignTest' +suppress_adding_repo_pr_tags: True +# strict component validation +enable_component_validation: True +component_validation: + '$.name': '^office.smartcompose.[A-Za-z0-9-_.]+$' + '$.environment.docker.image': '^$|^polymerprod.azurecr.io*$' + '$.inputs..description': '^[A-Z].*' \ No newline at end of file diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/configuration.py b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/configuration.py new file mode 100644 index 000000000000..183e3e061161 --- /dev/null +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/configuration.py @@ -0,0 +1,214 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from dataclasses import asdict, dataclass, field, replace +import logging +import os +from omegaconf.omegaconf import OmegaConf +import sys +from typing import Any, Dict, List +import warnings + + +log = logging.getLogger(__name__) + + +# Freeze single "empty list" so by-reference comparison of default values works. +_EMPTY_LIST = [] + + +@dataclass(frozen=True) +class Configuration: + # TODO: should this be handled via enum? + activation_method: str = field(default="all") + compliant_branch: str = field(default="^refs/heads/main$") + source_branch: str = field(default="") + component_cli_version: str = field(default="0.9.13") + component_specification_glob: str = field(default="**/spec.yaml") + # TODO: consider a way of supporting both this and `*.yaml` as defaults. + configuration_file: str = field(default="aml-build-configuration.yml") + log_format: str = field(default="%(message)s") + # Registration in registries is surprisingly slow. + shell_command_timeout_in_seconds: int = field(default=1000) + number_of_cores_parallel: int = field(default=0) + # TODO: should this be handled via enum? 
+ signing_mode: str = field(default="aml") + verbose: bool = field(default=False) + working_directory: str = field(default_factory=lambda: os.getcwd()) + workspaces: List[str] = field( + default_factory=lambda: _EMPTY_LIST, metadata={"nargs": "*"} + ) + makecat_directory: str = field(default=r"C:\Program Files (x86)\Windows Kits") + makecat_default: str = field(default=r"10\bin\x64\makecat.exe") + # allow_duplicate_versions is on path to deprecation. Please avoid using it + allow_duplicate_versions: bool = field(default=False) + fail_if_version_exists: bool = field(default=False) + use_build_number: bool = field(default=False) + all_component_version: str = field(default="") + disable_telemetry: bool = field(default=False) + suppress_adding_repo_pr_tags: bool = field(default=False) + enable_component_validation: bool = field(default=False) + fail_if_pattern_not_found_in_component_validation: bool = field(default=False) + component_validation: dict = field(default_factory=dict) + dependency_hints: dict = field(default_factory=dict) + registries: List[str] = field( + default_factory=lambda: _EMPTY_LIST, metadata={"nargs": "*"} + ) + detect_changes_in_unzipped_folder: bool = field(default=False) + validation_workspace: str = field(default="") + + +def load_configuration() -> Configuration: + """ + Create configuration object from "implicit" command line arguments and + environment variables. + """ + # Strip away the first argument, which is the name of the file being + # executed. + args = sys.argv[1:] + env = os.environ + rv = load_configuration_from_args_and_env(args, dict(env)) + return rv + + +def load_configuration_from_args(args) -> dict: + """ + Load a "minimal" configuration dictionary from command line arguments. This + strips away any values which are default, so that merging with the default + and file-based configuration objects works properly. + """ + from argparse_dataclass import ArgumentParser + + default_config = Configuration() + parser = ArgumentParser(Configuration) + + cli_config = parser.parse_args(args) + # Strangely, calling `asdict` changes the object reference for the value + # if it is an empty array. + cli_config_vars = asdict(cli_config) + + for key in list(cli_config_vars.keys()): + + # Compare by reference so that you can override with default values like + # the empty list: https://stackoverflow.com/a/14080980. + if getattr(cli_config, key) is getattr(default_config, key): + del cli_config_vars[key] + + return cli_config_vars + + +def load_configuration_from_args_and_env( + args: List[str], env: Dict[str, Any] +) -> Configuration: + """ + Load configuration file from provided command line arguments and environment + variables. + + Priority is documented at https://aka.ms/aml/amlbuild , from lowest to + highest: + - default value + - configuration file + - environment variables + - command line arguments (highest priority) + """ + # Create default config + default_config = Configuration() + + # Load config from command line + cli_config = load_configuration_from_args(args) + + # Load config parameters specified in environment variables + env_config = { + key.lower(): value + for key, value in env.items() + if key.lower() in asdict(default_config).keys() + } + print(f"Load the config in the environment variables: {env_config}") + + # Merge cli config and env config + # Priority: cli > env + if env_config: + print( + "Merge the config in the environment variables with the config in the command line." 
+ ) + cli_config = OmegaConf.merge(env_config, cli_config) + + working_directory = ( + cli_config.get("working_directory") or default_config.working_directory # type: ignore + ) + + cli_config_path = cli_config.get("configuration_file") # type: ignore + file_config = None + if cli_config_path is not None: + try: + print("Loading user provided configuration file") + file_config = OmegaConf.load(cli_config_path) + except FileNotFoundError: + print( + f"***ERROR: the configuration file path provided {cli_config_path} does not exist in your working directory {working_directory}, so both preparation and registration will fail." + ) + elif os.path.isfile(default_config.configuration_file): + print( + "Configuration file does not exist. Loading default configuration file aml-build-configuration.yml.", + ) + file_config = OmegaConf.load(default_config.configuration_file) + else: + warnings.warn( + "User provided/default configuration file does not exist. Using default configuration.", + UserWarning, + ) + + if file_config is None: + print("Configuration file is empty. Using default configuration.") + cli_and_file_config = cli_config + else: + print("Overriding default configuration by configuration file.") + cli_and_file_config = OmegaConf.merge(file_config, cli_config) + + if cli_and_file_config.get("workspaces") is None: # type: ignore + log.error( + "Workspace is not configured. Please update in your configuration file." + ) + + if cli_and_file_config.get("allow_duplicate_versions") is not None: # type: ignore + if cli_and_file_config.get("fail_if_version_exists") is None: # type: ignore + cli_and_file_config.update( + { + "fail_if_version_exists": not cli_and_file_config.get( + "allow_duplicate_versions" # type: ignore + ) + } + ) + warnings.warn( + "We recommend against using the parameter allow_duplicate_versions. Please specify fail_if_version_exists instead.", + UserWarning, + ) + else: + raise ValueError( + "Please don't specify both allow_duplicate_versions and fail_if_version_exists. Check out https://aka.ms/aml/amlbuild for more information." + ) + print("Please refer to https://aka.ms/aml/amlbuild for more information.") + + config = OmegaConf.merge(default_config, cli_and_file_config) + config = Configuration(**config) # type: ignore + + # Load the environment variable of source branch into config + if "BUILD_SOURCEBRANCH" in env.keys(): + config = replace(config, source_branch=env["BUILD_SOURCEBRANCH"]) + else: + warnings.warn("BUILD_SOURCEBRANCH is not in the environment variable list.") + + # Load the environment variable of build number into config, if user_build_number=True + if config.use_build_number: + if "BUILD_BUILDNUMBER" in env.keys(): + if config.all_component_version: + log.warning( + f"The build number {env['BUILD_BUILDNUMBER']} overwrites the value of all_component_version {config.all_component_version}" + ) + config = replace(config, all_component_version=env["BUILD_BUILDNUMBER"]) + else: + raise ValueError( + "BUILD_BUILDNUMBER is not in the environment variable list." + ) + + return config diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/utils.py b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/utils.py new file mode 100644 index 000000000000..a0b1e7445e25 --- /dev/null +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/utils.py @@ -0,0 +1,129 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+
+import json
+import os
+import hashlib
+import logging
+# from opencensus.ext.azure.log_exporter import AzureLogHandler
+
+log = logging.getLogger(__name__)
+
+
+def create_catalog_stub():
+    """
+    Function that creates a json stub of the form: {'HashAlgorithm': 'SHA256', 'CatalogItems': {}}.
+    """
+    json_stub = {}
+    json_stub["HashAlgorithm"] = "SHA256"
+    json_stub["CatalogItems"] = {}
+    return json_stub
+
+
+def create_SHA_256_hash_of_file(file):
+    """
+    Function that returns the SHA-256 hash of 'file'.\n
+    Logic taken from https://www.quickprogrammingtips.com/python/how-to-calculate-sha256-hash-of-a-file-in-python.html
+    """
+    sha256_hash = hashlib.sha256()
+    with open(file, "rb") as f:
+        # Read and update hash string value in blocks of 4K
+        for byte_block in iter(lambda: f.read(4096), b""):
+            sha256_hash.update(byte_block)
+    # Converting to upper case because that's what is required by the policy
+    # service. See their code:
+    # https://dev.azure.com/msasg/Bing_and_IPG/_git/Aether?path=/src/aether/platform/backendV2/BlueBox/PolicyService/Microsoft.MachineLearning.PolicyService/Workers/CatalogValidation.cs
+    return sha256_hash.hexdigest().upper()
+
+
+def add_file_to_catalog(file_for_catalog, catalog, absolute_path_to_remove):
+    """
+    Function that adds an entry for 'file_for_catalog' to the 'catalog'.\n
+    Specifically, {<relative path>: <hash>} will be added to the "CatalogItems" dictionary of the 'catalog' json, where <hash> is computed with the create_SHA_256_hash_of_file() function, and <relative path> is obtained by removing 'absolute_path_to_remove' from the full 'file_for_catalog' path.
+    """
+    hash_of_file = create_SHA_256_hash_of_file(file_for_catalog)
+    relative_path = file_for_catalog.split(absolute_path_to_remove)[1]
+    catalog["CatalogItems"][relative_path] = hash_of_file
+    return catalog
+
+
+def write_two_catalog_files(catalog, path):
+    """
+    Function that writes 'catalog' into 2 duplicate files: "path/catalog.json" and "path/catalog.json.sig".
+    """
+    with open(os.path.join(path, "catalog.json"), "w") as jsonFile1:
+        json.dump(catalog, jsonFile1)
+    with open(os.path.join(path, "catalog.json.sig"), "w") as jsonFile2:
+        json.dump(catalog, jsonFile2)
+
+
+def delete_two_catalog_files(path):
+    """
+    Function that deletes the "catalog.json" and "catalog.json.sig" files located at 'path', if they exist.
+    """
+    # catalog.json
+    file_path_json = os.path.join(path, "catalog.json")
+    if os.path.exists(file_path_json):
+        log.warning(f"{file_path_json} already exists. Deleting it")
+        os.remove(file_path_json)
+    # catalog.json.sig
+    file_path_json_sig = os.path.join(path, "catalog.json.sig")
+    if os.path.exists(file_path_json_sig):
+        log.warning(f"{file_path_json_sig} already exists. Deleting it")
+        os.remove(file_path_json_sig)
+
+
+# class TelemetryLogger:
+#     """Utils class for opencensus azure monitor"""
+
+#     def __init__(
+#         self, enable_telemetry=True, instrumentation_key=None, level=logging.INFO
+#     ):
+#         self.logger = logging.getLogger("telemetry_logger")
+#         self.logger.setLevel(level)
+#         self.enable_telemetry = enable_telemetry
+#         # Why is it okay to include this key directly in the source code?
+#         # For any client-side tool, there is a fundamental problem with protecting instrumentation
+#         # keys. You want the published tool to be able to collect telemetry, but the only way
+#         # it can do this is if it has some kind of instrumentation key.
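Taken together, these helpers produce the catalog layout that create_catalog_files in yamlSign.py expects: snapshot-relative paths mapped to upper-case SHA-256 digests under "CatalogItems". A rough usage sketch, assuming the helpers above are importable; build_catalog_for_folder is a hypothetical wrapper, not part of this patch:

    import os

    def build_catalog_for_folder(folder: str) -> dict:
        # Start from {'HashAlgorithm': 'SHA256', 'CatalogItems': {}} and add every file under `folder`.
        catalog = create_catalog_stub()
        for root, _, files in os.walk(folder):
            for name in files:
                file_path = os.path.join(root, name)
                # Keys are paths relative to the snapshot root; values are upper-case SHA-256 digests.
                catalog = add_file_to_catalog(file_path, catalog, folder)
        return catalog

    # write_two_catalog_files(build_catalog_for_folder("snapshot_dir"), "snapshot_dir")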
+# # +# # For an authoritative example, the dotnet CLI contains their telemetry key in a +# # public GitHub repository: +# # https://github.com/dotnet/cli/blob/master/src/dotnet/Telemetry/Telemetry.cs +# # +# # The underlying Azure resource is called `aml1p-ml-tooling`. +# self.instrumentation_key = ( +# "aaefce9e-d109-4fac-bb9f-8277c68e91ac" +# if instrumentation_key is None +# else instrumentation_key +# ) +# handler = AzureLogHandler( +# connection_string=f"InstrumentationKey={self.instrumentation_key}" +# ) +# handler.add_telemetry_processor(self.scrubber_function) +# self.logger.addHandler(handler) + +# def log_trace(self, message, properties={}, level=logging.INFO): +# if self.enable_telemetry: +# try: +# if level == logging.INFO: +# self.logger.info(message, extra=properties) +# elif level == logging.WARNING: +# self.logger.warning(message, extra=properties) +# elif level == logging.ERROR: +# self.logger.error(message, extra=properties) +# elif level == logging.CRITICAL: +# self.logger.critical(message, extra=properties) +# else: +# log.error("The logging level is not expected!") +# except Exception as ex: +# log.warning("Send telemetry exception: %s", str(ex)) +# else: +# log.info( +# "Sending trace log messages to application insight has been disabled." +# ) + +# # Callback function to scrub some columns +# def scrubber_function(self, envelope): +# envelope.tags['ai.cloud.roleInstance'] = 'cloud_RoleInstance_Scrubbed' +# envelope.tags["ai.location.ip"] = "IP_Scrubbed" diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSign.py b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSign.py new file mode 100644 index 000000000000..6c247885e6c4 --- /dev/null +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSign.py @@ -0,0 +1,1029 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import logging +import os +import sys +import multiprocessing +import collections +import jsonpath_ng +import chardet +import re +from typing import List, Set, Union +import shutil +from ruamel.yaml import YAML +from git import Repo, InvalidGitRepositoryError, NoSuchPathError + +from command_line import Command +from utils import ( + create_catalog_stub, + add_file_to_catalog, + write_two_catalog_files, + delete_two_catalog_files, +) +from pathlib import Path +import yaml +import urllib.parse +import uuid +from urllib.parse import urlparse +from enum import Enum + +log = logging.getLogger(__name__) + +ALLOWED_CONTAINER_REGISTRIES = ["polymerprod.azurecr.io"] +ALLOWED_PACKAGE_FEEDS = [ + "https://o365exchange.pkgs.visualstudio.com/_packaging/PolymerPythonPackages/pypi/simple/" +] + + +class RuntimeEnvironment(Enum): + AZURE_DEVOPS_BUILD = "Azure DevOps Build" + GITHUB_ACTION = "GitHub Action" + OTHER = "Other" + + +class OperatingSystem(Enum): + WINDOWS = "Windows" + LINUX = "Linux" + OTHER = "Other" + + +class TargetType(Enum): + ADDITIONAL_INCLUDES = "additional_includes" + DEPENDENCY_HINTS = "dependency_hints" + + +class ActionType(Enum): + VALIDATE = "validate" + BUILD = "build" + + +class Prepare(Command): + def __init__(self): + super().__init__() + self._component_statuses = {} + + def folder_path(self, file: str) -> str: + """ + Return the normalized path of the directory containing a file. + """ + return self.normalize_path(Path(file).parent, directory=True) + + def all_files_in_snapshot(self, manifest: str) -> List[str]: + """ + Return a list of all normalized files in the snapshot. 
The input + (`manifest`) is assumed to be some file, whether AML-style component + spec or Aether-style auto-approval manifest, in the "root" of the + snapshot. + """ + folder_path = self.folder_path(manifest) + log.info("Absolute path for current component is: " + folder_path) + + # Generate a list of all files in this components folder (including subdirectories) + rv = [] + + # Make sure we pick up Linux-style "hidden" files like .amlignore and + # hidden "directories", as well as hidden files in hidden directories. + # https://stackoverflow.com/a/65205404 + # https://stackoverflow.com/a/41447012 + for root, _, file_paths in os.walk(folder_path): + for file in file_paths: + file_path = os.path.join(root, file) + normalized_path = self.normalize_path(file_path) + rv.append(normalized_path) + + return rv + + def process_all_components(self, files: List[str], action_type) -> List[str]: # type: ignore + """ + Depending on the 'action_type' ('build' or 'validate'), run + build_each_components() or validate_each_component() + in parallel with nb_cores threads. + Returns the list of "built" component files, or an empty list + in the validate case. + """ + rv = [] + if files: + nb_cores = self.nb_cores + + if nb_cores == 1: + for component in files: + if action_type == ActionType.BUILD: + rv.append(self.build_each_components(component)) + elif action_type == ActionType.VALIDATE: + self.validate_each_components(component) + else: + log.error( + f"Unknown value for action_type: '{action_type}'. It should be either '{ActionType.BUILD}' or '{ActionType.VALIDATE}'" + ) + else: + log.info( + f"Batch component processing with {nb_cores} threads. Action: '{action_type}'" + ) + pool = multiprocessing.Pool(processes=nb_cores) + if action_type == ActionType.BUILD: + rv = pool.map(self.build_each_components, files) + elif action_type == ActionType.VALIDATE: + pool.map(self.validate_each_components, files) + pool.close() + pool.join() + return rv # type: ignore + + def build_each_components(self, component) -> List[str]: + """ + For one of component specification file, run `az ml component build`, + and register the status (+ register error if build failed). + """ + path = Path(component) + rv = str(path.parent / ".build" / path.name) + build_component_success = self.execute_azure_cli_command( + f"ml component build --file {component}" # TODO + ) + if build_component_success: + log.info(f"Component {component} is built.") + else: + self.register_error(f"Error when building component {component}.") + return rv # type: ignore + + def find_component_specification_files_using_all(self, dir=None) -> List[str]: + """ + Find all component specification files in the configured working + directory matching the configured glob. Return the absolute paths + of these files in the format of a list of string. + """ + import pdb; pdb.set_trace(); + if dir is None: + # dir = self.config.working_directory + dir = 'C:\Projects\\azure-sdk-for-python\sdk\ml\\azure-ai-ml\\azure\\ai\ml\YAMLsigning\yamlSignTest' + all_spec_yaml_files_absolute_paths = [ + str(p.absolute()) + for p in Path(dir).glob(self.config.component_specification_glob) + ] + + return all_spec_yaml_files_absolute_paths + + def find_component_specification_files_using_smart(self) -> List[str]: + """ + This function returns the list of components (as a list of absolute paths) potentially affected by the latest commit. + """ + log.info( + "Determining which components are potentially affected by the current change." 
+ ) + [repo, current_branch, compliant_branch] = self.identify_repo_and_branches() + modified_files = self.get_modified_files(repo, current_branch, compliant_branch) + active_components = self.infer_active_components_from_modified_files( + modified_files + ) + return active_components + + def identify_repo_and_branches(self): + """ + This function returns the current repository, along with the name of the current and compliant branches [repo, current_branch, compliant_branch]. Throws if no repo can be found. + """ + # identify the repository + curr_path = Path(self.config.working_directory).resolve() + try: + repo = Repo(curr_path, search_parent_directories=True) + log.info("Found a valid repository in " + repo.git_dir) + except (InvalidGitRepositoryError, NoSuchPathError): + message = ( + str(curr_path) + + " or its parents do not contain a valid repo path or cannot be accessed." + ) + raise Exception(message) + try: + current_branch = str( + repo.head.ref + ) # when running from our build the repo head is detached so this will throw an exception + except TypeError: + current_branch = os.environ.get("BUILD_SOURCEBRANCH") or os.environ.get( + "GITHUB_REF" + ) + log.info("The current branch is: '" + str(current_branch) + "'.") + # Identify the compliant branch + if not (self.config.compliant_branch.startswith("^refs/heads/")) or not ( + self.config.compliant_branch.endswith("$") + ): + raise Exception( + "The name of the compliant branch found in the config file should start with '^refs/heads/' and end with '$'. Currently it is: '" + + self.config.compliant_branch + + "'." + ) + else: + compliant_branch = self.config.compliant_branch.replace("^refs/heads/", "")[ + 0:-1 + ] + log.info("The compliant branch is: '" + compliant_branch + "'.") + return [repo, current_branch, compliant_branch] + + def get_modified_files(self, repo, current_branch, compliant_branch) -> Set[str]: + """ + This function returns the paths of files that have been modified. 3 scenarios are supported.\n + 1/ 'Build - before Merge'; when the 'prepare' command is run as part of a build, but before the actual merge (in this case, the name of the current branch starts with 'refs/pull/' - this is the default Azure DevOps behavior).\n + 2/ 'Build - after Merge'; when the 'prepare' command is run as part of a build, after the actual merge (in this case, the name of the current branch is the same as the name of the compliant branch).\n + 3/ 'Manual'; when the prepare command is run manually (typically before publishing the PR). 
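The commit lookup and tree diff used here come straight from GitPython and can be exercised on their own. A small sketch, assuming gitpython is installed and the working directory sits inside a repository with at least two commits; it collects both sides of each change the way the extract_paths_from_diff helper below does:

    from git import Repo

    repo = Repo(".", search_parent_directories=True)
    current_commit = repo.commit("HEAD")
    previous_commit = repo.commit("HEAD~1")

    # Same diff call as get_diff_between_commits, then gather a_path and b_path of every entry.
    diff = current_commit.tree.diff(previous_commit.tree)
    modified = sorted({p for d in diff for p in (d.a_path, d.b_path) if p})
    print(modified)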
+ """ + # identify the 2 relevant commits based on the use case + current_commit, previous_commit = self.get_relevant_commits( + repo, current_branch, compliant_branch + ) + + # take the actual diff + diff = self.get_diff_between_commits(current_commit, previous_commit) + + # process the diff object to obtain a list of paths + res = self.extract_paths_from_diff( + diff, + repo_working_dir=repo.working_dir, + repo_working_tree_dir=repo.working_tree_dir, + repo_git_dir=repo.git_dir, + ) + return res + + def extract_paths_from_diff( + self, diff, repo_working_dir, repo_working_tree_dir, repo_git_dir + ): + """Function that extracts the paths of the modified files from the diff between 2 commits.""" + res = set() + # let's build a set with the paths of modified files found in the diff object + log.debug("Working directory: " + self.config.working_directory) + log.debug("repo.working_dir: " + repo_working_dir) + log.debug("repo.working_tree_dir: " + repo_working_tree_dir) + log.debug("repo.git_dir: " + repo_git_dir) + for d in diff: + log.debug("d.a_path: " + d.a_path) + log.debug("Path(d.a_path).absolute(): " + str(Path(d.a_path).absolute())) + log.debug("Path(d.a_path).resolve(): " + str(Path(d.a_path).resolve())) + r_a = str(Path(repo_git_dir).parent / Path(d.a_path)) + res.add(r_a) + r_b = str(Path(repo_git_dir).parent / Path(d.b_path)) + res.add(r_b) + log.info("The list of modified files is:") + log.info(res) + return res + + def get_relevant_commits(self, repo, current_branch, compliant_branch): + """ + This function returns the commits required to compute the list of files that have been modified. 3 scenarios are supported.\n + 1/ 'Build - before Merge'; when the 'prepare' command is run as part of a build, but before the actual merge (in this case, the name of the current branch starts with 'refs/pull/' - this is the default Azure DevOps behavior).\n + 2/ 'Build - after Merge'; when the 'prepare' command is run as part of a build, after the actual merge (in this case, the name of the current branch is the same as the name of the compliant branch).\n + 3/ 'Manual'; when the prepare command is run manually (typically before publishing the PR). + """ + # Grab the diff differently depending on the scenario + if current_branch.replace("refs/heads/", "") == compliant_branch: + # 'Build - after Merge' case: we will take the diff between the + # tree of the latest commit to the compliant branch, and the tree + # of the previous commit to the compliant branch corresponding to a + # PR (we assume the commit summary starts with 'Merged PR') + log.info( + "We are in the 'Build - after Merge' case (the current branch is the compliant branch)." + ) + current_commit = self.get_compliant_commit_corresponding_to_pull_request( + repo, compliant_branch + ) + self.log_commit_info(current_commit, "Current commit to compliant branch") + previous_commit = ( + self.get_previous_compliant_commit_corresponding_to_pull_request( + current_commit, + consider_current_commit=False, + ) + ) + self.log_commit_info( + previous_commit, "Previous PR commit to compliant branch" + ) + elif current_branch.startswith("refs/pull/"): + # 'Build - before Merge': we will take the diff between the tree of + # the current commit, and the tree of the previous commit to the + # compliant branch corresponding to a PR (we assume the commit + # summary starts with 'Merged PR') + log.info( + "We are in the 'Build - before Merge' case (the current branch is not the compliant branch and its name starts with 'refs/pull/')." 
+ ) + current_commit = repo.commit() + self.log_commit_info(current_commit, "Current commit to current branch") + latest_commit_to_compliant_branch = repo.remotes.origin.refs[ + compliant_branch + ].commit + previous_commit = ( + self.get_previous_compliant_commit_corresponding_to_pull_request( + latest_commit_to_compliant_branch, + consider_current_commit=True, + ) + ) + self.log_commit_info( + previous_commit, "Previous PR commit to compliant branch" + ) + else: + # 'Manual' Case: we will take the diff between the current branch + # and the compliant branch (we're assuming the compliant branch is + # locally up to date here) + log.info( + "We are in the 'Manual' case (the current branch is NOT the compliant branch and its name does not start with 'refs/pull/')." + ) + try: + current_commit = repo.heads[ + current_branch + ].commit # this won't work when running the Manual case from the DevOps portal, but the below will + except (IndexError, AttributeError): + current_commit = repo.commit() + self.log_commit_info(current_commit, "Current commit to current branch") + try: + previous_commit = repo.heads[ + compliant_branch + ].commit # this won't work when running the Manual case from the DevOps portal, but the below will + except (IndexError, AttributeError): + latest_commit_to_compliant_branch = repo.remotes.origin.refs[ + compliant_branch + ].commit + previous_commit = ( + self.get_previous_compliant_commit_corresponding_to_pull_request( + latest_commit_to_compliant_branch, + consider_current_commit=True, + ) + ) + self.log_commit_info(previous_commit, "Previous commit to compliant branch") + + return current_commit, previous_commit + + def get_diff_between_commits(self, current_commit, previous_commit): + """Function that gets the diff between 2 commits.""" + # just use the 'diff' function from gitpython + return current_commit.tree.diff(previous_commit.tree) + + def log_commit_info(self, commit, title) -> None: + log.info(title + ":") + log.info("Summary: " + commit.summary) + log.info("Author: " + str(commit.author)) + log.info("Authored Date: " + str(commit.authored_date)) + + def get_previous_compliant_commit_corresponding_to_pull_request( + self, latest_commit, consider_current_commit + ): + """ + This function will return the previous commit in the repo corresponding to a PR (i.e. that starts with "Merged PR"). + If `consider_current_commit` is set to True, the `latest_commit` will be considered. If set to false, only previous commits will be considered. + """ + target_string = "Merged PR" + if consider_current_commit and latest_commit.summary.startswith(target_string): + return latest_commit + previous_commit = latest_commit + for c in previous_commit.iter_parents(): + if c.summary.startswith(target_string): + previous_commit = c + break + return previous_commit + + def get_compliant_commit_corresponding_to_pull_request( + self, repo, compliant_branch + ): + """ + This function will return the most recent commit in the repo that truly corresponds to the triggered build. It is identified thanks to the 'Build.SourceVersionMessage' DevOps environment variable (see https://docs.microsoft.com/en-us/azure/devops/pipelines/build/variables?view=azure-devops&tabs=yaml) that contains the true commit message. This is used to address the race condition occurring when a commit sneaks in before the "prepare" step was run on the previous commit. 
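The "previous PR commit" lookup above reduces to walking a commit's ancestry until a summary starts with "Merged PR", falling back to the starting commit otherwise. A standalone sketch under the same assumption that gitpython is installed:

    from git import Repo

    repo = Repo(".", search_parent_directories=True)
    latest_commit = repo.commit("HEAD")

    previous_pr_commit = next(
        (c for c in latest_commit.iter_parents() if c.summary.startswith("Merged PR")),
        latest_commit,  # fallback when no ancestor matches, mirroring the code above
    )
    print(previous_pr_commit.summary)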
+ """ + # this is the true commit message corresponding to the PR that triggered the build + true_commit_message = self.get_true_commit_message() + # this is the most recent commit + current_commit = repo.remotes.origin.refs[compliant_branch].commit + # if the most recent commit corresponds to the true commit message, then return it + if true_commit_message.startswith(current_commit.summary): + return current_commit + # otherwise, let's iterate through the parents until we find it + candidate_commit = current_commit + for c in candidate_commit.iter_parents(): + if true_commit_message.startswith(c.summary): + return c + # if the corresponding commit cannot be found, return the most recent one and log a warning + log.warning( + "Could not find in the git repo the commit that triggered this PR. Returning the most recent but beware, the 'smart' mode likely will not work properly." + ) + return current_commit + + def get_true_commit_message(self): + return str(os.environ.get("BUILD_SOURCEVERSIONMESSAGE") or "NA") + + def infer_active_components_from_modified_files(self, modified_files) -> List[str]: + """ + This function returns the list of components (as a list of directories paths) potentially affected by changes in the `modified_files`. + """ + rv = [] + # We will go over components one by one + all_components_in_repo = self.find_component_specification_files_using_all() + log.info("List of all components in repo:") + log.info(all_components_in_repo) + for component in all_components_in_repo: + if self.component_is_active(component, modified_files): + rv.append(component) + # No need to dedup rv since we are only considering components once + log.info("The active components are:") + log.info(rv) + return rv + + def component_is_active(self, component, modified_files) -> bool: + """ + This function returns True if any of the 'modified_files' potentially affects the 'component' (i.e. if it is directly in one of the 'component' subfolders, or if it is covered by the additional_includes files). If the component has been deleted, returns False. + """ + log.info("Assessing whether component '" + component + "' is active...") + # Let's first take care of the case where the component has been deleted + if not (Path(component).exists()): + return False + # Let's grab the contents of the additional_includes file if it exists. + component_additional_includes_contents = self.get_target_file_contents( + component, + TargetType.ADDITIONAL_INCLUDES, + ) + # Let's grab the contents of the additional_includes file if it exists. 
+ component_dependency_hints_contents = self.get_target_file_contents( + component, + TargetType.DEPENDENCY_HINTS, + ) + # loop over all modified files; if current file is in subfolder of component or covered by + # additional includes or dependency hints, return True + for modified_file in modified_files: + if ( + self.is_in_subfolder(modified_file, component) + or self.is_in_target_list( + modified_file, + TargetType.ADDITIONAL_INCLUDES, + component_additional_includes_contents, + ) + or self.is_in_target_list( + modified_file, + TargetType.DEPENDENCY_HINTS, + component_dependency_hints_contents, + ) + ): + return True + return False + + def get_target_file_contents( + self, component, target_type + ) -> Union[List[str], None]: + component_target_file_contents = None + # for depependency hints, we look globally first + if target_type == TargetType.DEPENDENCY_HINTS: + component_target_file_contents = self.get_global_dependency_hints_contents( + component + ) + # First we figure out the expected path of the additional_includes file + component_target_file_path = self.get_theoretical_target_file_path( + component, target_type + ) + # And we load it if it exists. + if Path(component_target_file_path).exists(): + rbfile = open(component_target_file_path, "rb").read() + if chardet.detect(rbfile).get("encoding").lower() not in ["utf-8", "ascii"]: #type: ignore + raise ValueError( + f"Encoding of a file: '{{spec_file_name}}.{target_type.value}' not supported, use UTF-8." + ) + + with open(component_target_file_path, "r") as component_target_file: + if (target_type == TargetType.DEPENDENCY_HINTS) and ( + component_target_file_contents is not None + ): + component_target_file_contents += component_target_file.readlines() + else: + component_target_file_contents = component_target_file.readlines() + else: + # If additional_includes doesn't exist we log a message explaining the expected name format + if target_type == TargetType.ADDITIONAL_INCLUDES: + log.info( + "No additional_includes file could be found for the component '" + + component + + "'. If you tried to create such a file, remember it should live next to the component spec file and should be named '{spec_file_name}.additional_includes'. " + + "For example, if the component spec file is named 'component_spec.yaml', the additional_includes file should be named 'component_spec.additional_includes'. In this specific case, the expected additional_includes file name is: '" + + component_target_file_path + + "'." + ) + # Then we check whether there is an improperly named additional_includes file in the component folder, and if so we throw + if self.check_for_wrongly_named_additional_includes(component): + raise ValueError( + "An additional_includes file which does not respect the naming pattern was found. Please rename this file. Remember it should live next to the component spec file and should be named '{spec_file_name}.additional_includes'." + + "For example, if the component spec file is named 'component_spec.yaml', the additional_includes file should be named 'component_spec.additional_includes'." 
+ ) + # Before returning, we make the paths in the additional_includes file absolute + if component_target_file_contents: + for line_number in range(0, len(component_target_file_contents)): + component_target_file_contents[line_number] = str( + Path( + os.path.join( + Path(component).parent, + component_target_file_contents[line_number].rstrip("\n"), + ) + ).resolve() + ) + return component_target_file_contents + + def get_global_dependency_hints_contents(self, component) -> Union[List[str], None]: + if len(self.config.dependency_hints) > 0: + global_dependency_hints_contents = [] + for ( + component_folder_paths, + dependency_hints_paths, + ) in self.config.dependency_hints.items(): + component_folder_absolute_paths = [ + str(p.absolute().resolve()) + for p in Path(self.config.working_directory).glob( + component_folder_paths + ) + ] + if ( + str(Path(component).parent.resolve()) + in component_folder_absolute_paths + ): + if not isinstance(dependency_hints_paths, list): + dependency_hints_paths = [dependency_hints_paths] + for dependency_hints_path in list(dependency_hints_paths): + global_dependency_hints_contents += [ + str(p.absolute().resolve()) + for p in Path(self.config.working_directory).glob( + dependency_hints_path + ) + ] + if len(global_dependency_hints_contents) > 0: + return global_dependency_hints_contents + else: + return None + else: + return None + + def get_theoretical_target_file_path(self, component, target_type) -> str: + """ + Returns the expected path of the 'target_type' file + associated with the 'component'. + """ + # First, we figure out the name of the target file, based on the component name + component_name_without_extension = Path(component).name.split(".yaml")[0] + # Then, we construct the expected path of the target file (see + # https://componentsdk.azurewebsites.net/components/component-spec-topics/additional-includes.html + # for the 'additional_includes' case) + component_target_file_path = os.path.join( + Path(component).parent, + component_name_without_extension + "." + str(target_type.value), + ) + return component_target_file_path + + def check_for_wrongly_named_additional_includes(self, component) -> bool: + """ + Returns True if the component folder contains an improperly named additional_includes file + i.e. a lonely additional_includes file without a corresponding spec.yaml + """ + # grab all potential additional_includes files in the component folder + potentially_wrongly_named_files = Path(component).parent.glob( + "*.additional_includes*" + ) + for potentially_wrongly_named_file in potentially_wrongly_named_files: + # determine the expected location of the spec + theoretical_component_path = ( + str(potentially_wrongly_named_file)[:-20] + ".yaml" + ) + # check if spec exists + if os.path.isfile(theoretical_component_path): + continue + else: + # if not, we have a problem + self.register_error( + f"Component folder {component} contains a lonely additional includes file at {potentially_wrongly_named_file}, missing component spec {theoretical_component_path}" + ) + return True + return False + + def is_in_subfolder(self, modified_file, component) -> bool: + """ + This function returns True if 'modified_file' is in a subfolder of 'component' ('component' can be either the path to a file, or a directory). If the component has been deleted, returns False. 
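The naming rule spelled out in these messages is mechanical: the additional_includes (or dependency_hints) file lives next to the spec and reuses its base name. A short sketch of the expected path using pathlib; the spec path is a made-up example:

    from pathlib import Path

    spec = Path("steps/train/component_spec.yaml")
    expected = spec.with_suffix(".additional_includes")
    print(expected)  # steps/train/component_spec.additional_includes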
+ """ + # Let's first take care of the case where the component has been deleted + if not (Path(component).exists()): + log.debug("'" + component + "' does not exist, returning False.") + return False + # Case where the component has not been deleted + for parent in Path(modified_file).parents: + if parent.exists(): + if Path(component).is_dir(): + if parent.samefile(Path(component)): + log.info( + "'" + + modified_file + + " is in a subfolder of '" + + component + + "'." + ) + return True + else: + if parent.samefile(Path(component).parent): + log.info( + "'" + + modified_file + + " is in a subfolder of '" + + component + + "'." + ) + return True + log.debug( + "'" + modified_file + " is NOT in a subfolder of '" + component + "'." + ) + return False + + def is_in_target_list( + self, modified_file, target_type, target_list_contents + ) -> bool: + """ + This function returns True if 'modified_file' is covered by the file + 'target_list_contents'. The 'target_type' can be either + additional_includes or dependency_hints + """ + # first tackle the trivial case of no target file + if target_list_contents is None: + log.debug( + f"The component's target file ({target_type}) is empty, returning False." + ) + return False + # now the regular scenario + for line in target_list_contents: + # when the line from the target list is a file, we directly check its path against that of modified_file + if Path(line).is_file(): + if str(Path(modified_file).resolve()) == str( + Path(line).resolve() + ): # can't use 'samefile' here because modified_file is not guaranteed to exist, we resolve the path and do basic == test + log.info( + "'" + + modified_file + + f" is directly listed in the {target_type} file." + ) + return True + # slightly more complicated case: when the line + # in the target list is a directory, we can just + # call the is_in_subfolder function + # but first, we take care of the zipped folders + if target_type == TargetType.ADDITIONAL_INCLUDES: + if self.config.detect_changes_in_unzipped_folder: + split_line = os.path.splitext(line) + if split_line[1] == ".zip": + line = split_line[0] + if Path(line).is_dir(): + if self.is_in_subfolder(modified_file, line): + log.info( + "'" + + modified_file + + f" is in one of the directories listed in the {target_type} file." + ) + return True + log.debug( + "'" + + modified_file + + f" is NOT referenced by the {target_type} file (neither directly nor indirectly)." + ) + return False + + + def find_component_specification_files(self) -> List[str]: + """ + Find the list of "active" component specification files using the + configured method ("all" or "smart"). 
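With activation_method set to "all", discovery is a recursive glob rooted at the working directory, as in find_component_specification_files_using_all earlier in this file. A minimal sketch; the directory and glob pattern below are placeholders, not values from any real configuration:

    from pathlib import Path

    working_directory = "."                        # stand-in for config.working_directory
    component_specification_glob = "**/spec.yaml"  # stand-in for the configured glob

    spec_files = [
        str(p.absolute())
        for p in Path(working_directory).glob(component_specification_glob)
    ]
    print(spec_files)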
+ """ + activation_method = self.config.activation_method + + if activation_method == "all": + rv = self.find_component_specification_files_using_all() + elif activation_method == "smart": + rv = self.find_component_specification_files_using_smart() + else: + raise ValueError( + f"Invalid activation_method provided: '{activation_method}'" + ) + + return rv + + def _create_dependencies_files(self, component_files) -> str: + id = str(uuid.uuid4()) + path_to_dependencies_files = os.path.join( + self.config.working_directory, "component_dependencies_" + id + ) + log.info( + f"Writing Python package dependencies to path {path_to_dependencies_files}" + ) + os.makedirs(path_to_dependencies_files) + for component in component_files: + self._create_dependencies_files_for_single_component( + component, path_to_dependencies_files + ) + return id + + def _create_dependencies_files_for_single_component( + self, component, path_to_dependencies_files + ) -> None: + component_repo = Path(component).parent + with open(component, "r") as spec_file: + spec = YAML(typ="safe").load(spec_file) + ( + pip_dependencies, + conda_dependencies, + _, + ) = self._extract_dependencies_and_channels(component) + + if pip_dependencies or conda_dependencies: + component_name = spec.get("name") + cur_path = os.path.join(path_to_dependencies_files, component_name) + try: + os.makedirs(cur_path) + except FileExistsError: + suffix = ( + component_name + + "_" + + os.path.splitext(os.path.basename(component))[0] + ) + cur_path = os.path.join(path_to_dependencies_files, suffix) + os.makedirs(cur_path) + if pip_dependencies: + log.info( + f"Found pip dependencies for component {component_name} in {component_repo}. Writing to requirements.txt." + ) + with open(os.path.join(cur_path, "requirements.txt"), "w") as file: + for req in pip_dependencies: + file.write(req) + if not req.endswith("\n"): + file.write("\n") + if conda_dependencies: + log.info( + f"Found conda dependencies for component {component_name} in {component_repo}. Writing to environment.yml." 
+ ) + with open(os.path.join(cur_path, "environment.yml"), "w") as file: + yaml.dump(conda_dependencies, file) + + def _extract_dependencies_and_channels(self, component) -> List[list]: + component_repo = Path(component).parent + build_folder = os.path.join(component_repo, ".build") + if os.path.exists(build_folder): + component_repo = build_folder + with open(component, "r") as spec_file: + spec = YAML(typ="safe").load(spec_file) + pip_dependencies = [] + conda_dependencies = [] + conda_channels = [] + if "environment" in spec: + spec_environment = spec.get("environment") + if "conda" in spec_environment: + spec_conda = spec_environment["conda"] + if "conda_dependencies" in spec_conda: + conda_dependencies = spec_conda["conda_dependencies"] + pip_dependencies += self._extract_python_package_dependencies( + conda_dependencies + ) + if "channels" in conda_dependencies: + conda_channels += conda_dependencies["channels"] + if "conda_dependencies_file" in spec_conda: + conda_dependencies_file = spec_conda["conda_dependencies_file"] + try: + with open( + os.path.join( + component_repo, spec_conda["conda_dependencies_file"] + ) + ) as file: + conda_dependencies = YAML(typ="safe").load(file) + pip_dependencies += self._extract_python_package_dependencies( + conda_dependencies + ) + if "channels" in conda_dependencies: + conda_channels += conda_dependencies["channels"] + except FileNotFoundError: + self.register_error( + f"The required conda_dependencies_file {conda_dependencies_file} does not exist in {component_repo}." + ) + if "pip_requirements_file" in spec_conda: + pip_requirements_file = spec_conda["pip_requirements_file"] + try: + with open( + os.path.join( + component_repo, spec_conda["pip_requirements_file"] + ) + ) as file: + pip_dependencies += file.readlines() + except FileNotFoundError: + self.register_error( + f"The required pip_requirements_file {pip_requirements_file} does not exist in {component_repo}." + ) + return [pip_dependencies, conda_dependencies, conda_channels] + + def _extract_python_package_dependencies(self, conda_dependencies) -> List[str]: + pip_dependencies = [] + if "dependencies" in conda_dependencies: + dependencies = conda_dependencies.get("dependencies") + for dependencies_item in dependencies: + if isinstance(dependencies_item, dict) and "pip" in dependencies_item: + pip_dependencies = dependencies_item["pip"] + return pip_dependencies + + def create_catalog_files(self, files: List[str]) -> None: + """ + Create AML-friendly catalog.json and catalog.json.sig files, using + SHA-256 hash. + """ + + # For each component spec file in the input list, we'll do the following... 
+ for f in files: + log.info(f"Processing file {f}") + component_folder_path = self.folder_path(f) + + # remove catalog files if already present + log.info("Deleting old catalog files if present") + delete_two_catalog_files(component_folder_path) + + files_for_catalog = self.all_files_in_snapshot(f) + log.info("The following list of files will be added to the catalog.") + log.info(files_for_catalog) + + # Prepare the catlog stub: {'HashAlgorithm': 'SHA256', 'CatalogItems': {}} + catalog = create_catalog_stub() + + # Add an entry to the catalog for each file + for file_for_catalog in files_for_catalog: + catalog = add_file_to_catalog( + file_for_catalog, catalog, component_folder_path + ) + + # order the CatalogItems dictionary + catalog["CatalogItems"] = collections.OrderedDict( + sorted(catalog["CatalogItems"].items()) + ) + + # Write the 2 catalog files + log.info(catalog) + write_two_catalog_files(catalog, component_folder_path) + log.info("Finished creating catalog files.") + + def run_with_config(self): + log.info("Running component preparation logic.") + + # self.telemetry_logging(command="prepare") + + component_files = self.find_component_specification_files() + # if not self.config.suppress_adding_repo_pr_tags: # TODO + # try: + # component_files = self.add_repo_and_last_pr_to_tags(component_files) + # except StopIteration: + # log.warning( + # "`add_repo_and_last_pr_to_tags` not successful. Please make sure your component files are in Git. Otherwise, please set `suppress_adding_repo_pr_tags` to True." + # )# TODO + + import pdb;pdb.set_trace() + # section + # print("run catalog creation", component_files) + # self.create_catalog_files(component_files) + # over + + + self.ensure_component_cli_installed() + self.attach_workspace() + self.process_all_components(component_files, ActionType.VALIDATE) + built_component_files = self.process_all_components( + component_files, ActionType.BUILD + ) + + # only call self.create_catalog_files if built_component_files not None + if built_component_files: + self.create_catalog_files(built_component_files) + + self._create_dependencies_files(component_files) + + # self.add_conda_to_system_path() # TODO + + def validate_each_components(self, component) -> None: + """ + For one of component specification file, run `az ml component validate`, + run compliance and customized validation if enabled, + and register the status (+ register error if validation failed). 
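For reference, the extraction done by _extract_dependencies_and_channels and _extract_python_package_dependencies above boils down to loading the conda environment section and pulling out the nested pip list. A self-contained sketch with a made-up environment, assuming ruamel.yaml is installed as the code above already requires:

    from ruamel.yaml import YAML

    conda_yaml = """
    name: demo
    channels:
      - defaults
    dependencies:
      - python=3.9
      - pip:
          - numpy==1.24.0
          - --index-url https://o365exchange.pkgs.visualstudio.com/_packaging/PolymerPythonPackages/pypi/simple/
    """

    conda_dependencies = YAML(typ="safe").load(conda_yaml)
    pip_dependencies = []
    for item in conda_dependencies.get("dependencies", []):
        if isinstance(item, dict) and "pip" in item:
            pip_dependencies = item["pip"]

    print(pip_dependencies)
    print(conda_dependencies.get("channels", []))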
+ """ + validate_component_success = self.execute_azure_cli_command( + f"ml component validate --file {component}" + ) + compliance_validation_success = True + customized_validation_success = True + if self.config.enable_component_validation: + log.info(f"Running compliance validation on {component}") + compliance_validation_success = self.compliance_validation(component) + if len(self.config.component_validation) > 0: + log.info(f"Running customized validation on {component}") + for jsonpath, regex in self.config.component_validation.items(): + customized_validation_success = ( + customized_validation_success + if self.customized_validation( + jsonpath, + regex, + component, + self.config.fail_if_pattern_not_found_in_component_validation, + ) + else False + ) + + if ( + validate_component_success + and compliance_validation_success + and customized_validation_success + ): + # If the az ml validation succeeds, we continue to check whether + # the "code" snapshot parameter is specified in the spec file + # https://componentsdk.z22.web.core.windows.net/components/component-spec-topics/code-snapshot.html + with open(component, "r") as spec_file: + spec = YAML(typ="safe").load(spec_file) + spec_code = spec.get("code") + if spec_code and spec_code not in [".", "./"]: + self.register_component_status(component, "validate", "failed") + self.register_error( + "Code snapshot parameter is not supported. Please use .additional_includes for your component." + ) + else: + log.info(f"Component {component} is valid.") + self.register_component_status(component, "validate", "succeeded") + else: + self.register_component_status(component, "validate", "failed") + self.register_error(f"Error when validating component {component}.") + + def compliance_validation(self, component: str) -> bool: + """ + This function checks whether a given component spec YAML file + meets all the requirements for running in the compliant AML. + Specifically, it checks (1) whether the image URL is compliant; + (2)whether the pip index-url is compliant; (3) whether + "default" is only Conda channel + """ + with open(component, "r") as spec_file: + spec = YAML(typ="safe").load(spec_file) + + # Check whether the docker image URL is compliant + image_url = jsonpath_ng.parse("$.environment.docker.image").find(spec) + if len(image_url) > 0: + if ( + urlparse(image_url[0].value).path.split("/")[0] + not in ALLOWED_CONTAINER_REGISTRIES + ): + log.error( + f"The container base image in {component} is not allowed for compliant run." + ) + return False + + # check whether the package feed is compliant + ( + package_dependencies, + conda_dependencies, + conda_channels, + ) = self._extract_dependencies_and_channels(component=component) + if len(package_dependencies) > 0: + for dependency in package_dependencies: + if re.match("^--index-url", dependency) or re.match( + "^--extra-index-url", dependency + ): + if dependency.split(" ")[1] not in ALLOWED_PACKAGE_FEEDS: + log.error( + f"The package feed in {component} is not allowed for compliant run." + ) + return False + if ( + f"--index-url {ALLOWED_PACKAGE_FEEDS[0]}" not in package_dependencies + and f"--extra-index-url {ALLOWED_PACKAGE_FEEDS[0]}" + not in package_dependencies + ): + log.error( + f"The Polymer package feed is not found in environment of {component}" + ) + return False + + # Check whether "default" is only Conda channel + if len(conda_channels) > 1 or ( + len(conda_channels) == 1 and conda_channels[0] != "." 
+ ): + log.error("Only the default conda channel is allowed for compliant run.") + return False + + return True + + @staticmethod + def customized_validation( + jsonpath: str, + regex: str, + component: str, + fail_if_pattern_not_found_in_component_validation: bool, + ) -> bool: + """ + This function leverages regular expressionm atching and + JSONPath expression to enforce user-provided "strict" + validation on Azure ML components + """ + with open(component, "r") as spec_file: + spec = YAML(typ="safe").load(spec_file) + + parsed_patterns = jsonpath_ng.parse(jsonpath).find(spec) + validation_success = True + if not parsed_patterns: + log.warning(f"The pattern {jsonpath} is not found in {component}") + if fail_if_pattern_not_found_in_component_validation: + validation_success = False + if len(parsed_patterns) > 0: + for parsed_pattern in parsed_patterns: + if not re.match(regex, parsed_pattern.value): + log.error( + f"The parsed pattern {parsed_pattern} in {component} doesn't match the regular expression {regex}" + ) + validation_success = False + return validation_success + + +if __name__ == "__main__": + Prepare().run() diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/2.yaml b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/2.yaml new file mode 100644 index 000000000000..43d5612fb27c --- /dev/null +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/2.yaml @@ -0,0 +1,3 @@ +config: + field: + default: "**/spec.yaml" diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/3.yaml b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/3.yaml new file mode 100644 index 000000000000..d99479a26589 --- /dev/null +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/3.yaml @@ -0,0 +1,5 @@ +fields: + - name: field1 + default: "**/spec.yaml" + - name: field2 + default: "**/spec.yaml" diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/4.yaml b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/4.yaml new file mode 100644 index 000000000000..f06733655efa --- /dev/null +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/4.yaml @@ -0,0 +1,6 @@ +application: + settings: + field: + default: "**/spec.yaml" + anotherField: + default: "path/to/another/spec.yaml" diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/catalog.json b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/catalog.json new file mode 100644 index 000000000000..ddf460aa5eea --- /dev/null +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/catalog.json @@ -0,0 +1 @@ +{"HashAlgorithm": "SHA256", "CatalogItems": {"2.yaml": "639293FA87F59DD6DA3FA42C77F9F76709505046AB4C9BCECF28852628E37928", "3.yaml": "4EEFC75625C82B0CB8583579D489DA852D71CBF0576CEFDC052827139C5ADB6E", "4.yaml": "66AFAAE719F59CE5A94FB259BA173B872F594C3DF00CBB9BBF2A762DE155A69C", "spec.yaml": "42387F635D6CCE6B39ED17838EC8E0023ED1D071623115BC8CDF219B4EA99A7A"}} \ No newline at end of file diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/catalog.json.sig b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/catalog.json.sig new file mode 100644 index 000000000000..ddf460aa5eea --- /dev/null +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/catalog.json.sig @@ -0,0 +1 @@ +{"HashAlgorithm": "SHA256", "CatalogItems": {"2.yaml": "639293FA87F59DD6DA3FA42C77F9F76709505046AB4C9BCECF28852628E37928", "3.yaml": "4EEFC75625C82B0CB8583579D489DA852D71CBF0576CEFDC052827139C5ADB6E", "4.yaml": 
"66AFAAE719F59CE5A94FB259BA173B872F594C3DF00CBB9BBF2A762DE155A69C", "spec.yaml": "42387F635D6CCE6B39ED17838EC8E0023ED1D071623115BC8CDF219B4EA99A7A"}} \ No newline at end of file diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/spec.yaml b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/spec.yaml new file mode 100644 index 000000000000..993205063017 --- /dev/null +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/spec.yaml @@ -0,0 +1,2 @@ +field: + default: "**/spec.yaml" From a1d670cb753c48288edb13b8677a27232a23eeb7 Mon Sep 17 00:00:00 2001 From: kshitij-microsoft Date: Fri, 13 Dec 2024 00:09:17 +0530 Subject: [PATCH 11/14] YAML signing --- .../azure/ai/ml/YAMLsigning/command_line.py | 20 +---- .../azure/ai/ml/YAMLsigning/config.yml | 35 -------- .../azure/ai/ml/YAMLsigning/utils.py | 56 ------------ .../azure/ai/ml/YAMLsigning/yamlSign.py | 86 ++----------------- .../ai/ml/YAMLsigning/yamlSignTest/2.yaml | 3 - .../ai/ml/YAMLsigning/yamlSignTest/3.yaml | 5 -- .../ai/ml/YAMLsigning/yamlSignTest/4.yaml | 6 -- .../ml/YAMLsigning/yamlSignTest/catalog.json | 1 - .../YAMLsigning/yamlSignTest/catalog.json.sig | 1 - .../ai/ml/YAMLsigning/yamlSignTest/spec.yaml | 2 - 10 files changed, 6 insertions(+), 209 deletions(-) delete mode 100644 sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/config.yml delete mode 100644 sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/2.yaml delete mode 100644 sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/3.yaml delete mode 100644 sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/4.yaml delete mode 100644 sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/catalog.json delete mode 100644 sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/catalog.json.sig delete mode 100644 sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/spec.yaml diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/command_line.py b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/command_line.py index 9f6b00ab2607..8857e313e400 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/command_line.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/command_line.py @@ -72,11 +72,8 @@ def attach_workspace(self, workspace_id: str = None) -> None: # type: ignore success = self.execute_azure_cli_command( f"account set --subscription {subscription_id}" ) - dir = "C:\Projects\\azure-sdk-for-python\sdk\ml\\azure-ai-ml\\azure\\ai\ml\YAMLsigning" - print(working_direcotry, dir) success = success and self.execute_azure_cli_command( - f"ml data create --name dataSource --path {dir} --type uri_folder -w {workspace} -g {resource_group}" - # f"ml folder attach --workspace-name {workspace} --resource-group {resource_group} --debug" # TODO: command modified for v2 + f"ml data create --name dataSource --path {working_direcotry} --type uri_folder -w {workspace} -g {resource_group}" ) if not success: self.register_error(f"Error!! Failed to attach to {workspace_id}!") @@ -116,7 +113,6 @@ def ensure_component_cli_installed(self) -> bool: f"installing component CLI version {self.config.component_cli_version}." 
) cli_install_command = f"extension add --name ml" - # cli_install_command = f"extension add --source https://azuremlsdktestpypi.blob.core.windows.net/wheels/componentsdk/azure_cli_ml-{self.config.component_cli_version}-py3-none-any.whl --pip-extra-index-urls https://azuremlsdktestpypi.azureedge.net/componentsdk/{self.config.component_cli_version} --yes" # TODO: command modified for v2 if self.config.verbose: cli_install_command += " --verbose" @@ -302,20 +298,6 @@ def register_error(self, error: str) -> None: log.error(error) self._errors.append(error) - # def telemetry_logging(self, command: str) -> None: - # """ - # Log the telemetry information in the Azure Application Insights - # """ - # telemetry_logger = TelemetryLogger( - # enable_telemetry=not self.config.disable_telemetry - # ) - # telemetry_logger.log_trace( - # message=f"shrike.build=={__version__}: {command}", - # properties={ - # "custom_dimensions": {"configuration": str(asdict(self.config))} - # }, - # ) - def run(self) -> None: """ Call this to load the configuration object, initialize the logging tree, diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/config.yml b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/config.yml deleted file mode 100644 index 8883f0ed053c..000000000000 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/config.yml +++ /dev/null @@ -1,35 +0,0 @@ -# signing_mode: aml -# activation_method: all -# compliant_branch: ^refs/heads/main$ -# component_specification_glob: 'steps/**/module_spec.yaml' -# log_format: '[%(name)s] [%(levelname)s] - %(message)s' -# workspaces: -# - /subscriptions/2d385bf4-0756-4a76-aa95-28bf9ed3b625/resourceGroups/sdkv2-20240925-rg/providers/Microsoft.MachineLearningServices/workspaces/sdkv2-20240925-ws -# fail_if_version_exists: False -# use_build_number: False -# APPLICATIONINSIGHTS_INSTRUMENTATIONKEY: 'fb916da6-f377-4116-b81b-b103b1e4fb26' - -# Define environment variables -# environment: -# BUILD_SOURCEBRANCH: $(Build.SourceBranch) -# BUILD_BUILDID: $(Build.BuildId) -# BUILD_DEFINITIONNAME: $(Build.DefinitionName) - - -activation_method: all -compliant_branch: ^refs/heads/develop$ -component_specification_glob: '**/*.yaml' -log_format: '[%(name)s][%(levelname)s] - %(message)s' -signing_mode: aml -workspaces: - - /subscriptions/2d385bf4-0756-4a76-aa95-28bf9ed3b625/resourceGroups/sdkv2-20240925-rg/providers/Microsoft.MachineLearningServices/workspaces/sdkv2-20240925-ws -allow_duplicate_versions: True -use_build_number: False -working_directory: 'C:\Projects\azure-sdk-for-python\sdk\ml\azure-ai-ml\azure\ai\ml\YAMLsigning\yamlSignTest' -suppress_adding_repo_pr_tags: True -# strict component validation -enable_component_validation: True -component_validation: - '$.name': '^office.smartcompose.[A-Za-z0-9-_.]+$' - '$.environment.docker.image': '^$|^polymerprod.azurecr.io*$' - '$.inputs..description': '^[A-Z].*' \ No newline at end of file diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/utils.py b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/utils.py index a0b1e7445e25..1c1bbcddaddf 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/utils.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/utils.py @@ -71,59 +71,3 @@ def delete_two_catalog_files(path): if os.path.exists(file_path_json_sig): log.warning(f"{file_path_json_sig} already exists. 
Deleting it") os.remove(file_path_json_sig) - - -# class TelemetryLogger: -# """Utils class for opencensus azure monitor""" - -# def __init__( -# self, enable_telemetry=True, instrumentation_key=None, level=logging.INFO -# ): -# self.logger = logging.getLogger("telemetry_logger") -# self.logger.setLevel(level) -# self.enable_telemetry = enable_telemetry -# # Why is it okay to include this key directly in the source code? -# # For any client-side tool, there is a fundamental problem with protecting instrumentation -# # keys. You want the published tool to be able to collect telemetry, but the only way -# # it can do this is if it has some kind of instrumentation key. -# # -# # For an authoritative example, the dotnet CLI contains their telemetry key in a -# # public GitHub repository: -# # https://github.com/dotnet/cli/blob/master/src/dotnet/Telemetry/Telemetry.cs -# # -# # The underlying Azure resource is called `aml1p-ml-tooling`. -# self.instrumentation_key = ( -# "aaefce9e-d109-4fac-bb9f-8277c68e91ac" -# if instrumentation_key is None -# else instrumentation_key -# ) -# handler = AzureLogHandler( -# connection_string=f"InstrumentationKey={self.instrumentation_key}" -# ) -# handler.add_telemetry_processor(self.scrubber_function) -# self.logger.addHandler(handler) - -# def log_trace(self, message, properties={}, level=logging.INFO): -# if self.enable_telemetry: -# try: -# if level == logging.INFO: -# self.logger.info(message, extra=properties) -# elif level == logging.WARNING: -# self.logger.warning(message, extra=properties) -# elif level == logging.ERROR: -# self.logger.error(message, extra=properties) -# elif level == logging.CRITICAL: -# self.logger.critical(message, extra=properties) -# else: -# log.error("The logging level is not expected!") -# except Exception as ex: -# log.warning("Send telemetry exception: %s", str(ex)) -# else: -# log.info( -# "Sending trace log messages to application insight has been disabled." -# ) - -# # Callback function to scrub some columns -# def scrubber_function(self, envelope): -# envelope.tags['ai.cloud.roleInstance'] = 'cloud_RoleInstance_Scrubbed' -# envelope.tags["ai.location.ip"] = "IP_Scrubbed" diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSign.py b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSign.py index 6c247885e6c4..01001f4ff5ad 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSign.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSign.py @@ -93,68 +93,15 @@ def all_files_in_snapshot(self, manifest: str) -> List[str]: rv.append(normalized_path) return rv - - def process_all_components(self, files: List[str], action_type) -> List[str]: # type: ignore - """ - Depending on the 'action_type' ('build' or 'validate'), run - build_each_components() or validate_each_component() - in parallel with nb_cores threads. - Returns the list of "built" component files, or an empty list - in the validate case. - """ - rv = [] - if files: - nb_cores = self.nb_cores - - if nb_cores == 1: - for component in files: - if action_type == ActionType.BUILD: - rv.append(self.build_each_components(component)) - elif action_type == ActionType.VALIDATE: - self.validate_each_components(component) - else: - log.error( - f"Unknown value for action_type: '{action_type}'. It should be either '{ActionType.BUILD}' or '{ActionType.VALIDATE}'" - ) - else: - log.info( - f"Batch component processing with {nb_cores} threads. 
Action: '{action_type}'" - ) - pool = multiprocessing.Pool(processes=nb_cores) - if action_type == ActionType.BUILD: - rv = pool.map(self.build_each_components, files) - elif action_type == ActionType.VALIDATE: - pool.map(self.validate_each_components, files) - pool.close() - pool.join() - return rv # type: ignore - - def build_each_components(self, component) -> List[str]: - """ - For one of component specification file, run `az ml component build`, - and register the status (+ register error if build failed). - """ - path = Path(component) - rv = str(path.parent / ".build" / path.name) - build_component_success = self.execute_azure_cli_command( - f"ml component build --file {component}" # TODO - ) - if build_component_success: - log.info(f"Component {component} is built.") - else: - self.register_error(f"Error when building component {component}.") - return rv # type: ignore - + def find_component_specification_files_using_all(self, dir=None) -> List[str]: """ Find all component specification files in the configured working directory matching the configured glob. Return the absolute paths of these files in the format of a list of string. """ - import pdb; pdb.set_trace(); if dir is None: - # dir = self.config.working_directory - dir = 'C:\Projects\\azure-sdk-for-python\sdk\ml\\azure-ai-ml\\azure\\ai\ml\YAMLsigning\yamlSignTest' + dir = self.config.working_directory all_spec_yaml_files_absolute_paths = [ str(p.absolute()) for p in Path(dir).glob(self.config.component_specification_glob) @@ -852,39 +799,16 @@ def create_catalog_files(self, files: List[str]) -> None: def run_with_config(self): log.info("Running component preparation logic.") - # self.telemetry_logging(command="prepare") component_files = self.find_component_specification_files() - # if not self.config.suppress_adding_repo_pr_tags: # TODO - # try: - # component_files = self.add_repo_and_last_pr_to_tags(component_files) - # except StopIteration: - # log.warning( - # "`add_repo_and_last_pr_to_tags` not successful. Please make sure your component files are in Git. Otherwise, please set `suppress_adding_repo_pr_tags` to True." 
- # )# TODO - - import pdb;pdb.set_trace() - # section - # print("run catalog creation", component_files) - # self.create_catalog_files(component_files) - # over - self.ensure_component_cli_installed() self.attach_workspace() - self.process_all_components(component_files, ActionType.VALIDATE) - built_component_files = self.process_all_components( - component_files, ActionType.BUILD - ) - - # only call self.create_catalog_files if built_component_files not None - if built_component_files: - self.create_catalog_files(built_component_files) - + + self.create_catalog_files(component_files) + self._create_dependencies_files(component_files) - # self.add_conda_to_system_path() # TODO - def validate_each_components(self, component) -> None: """ For one of component specification file, run `az ml component validate`, diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/2.yaml b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/2.yaml deleted file mode 100644 index 43d5612fb27c..000000000000 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/2.yaml +++ /dev/null @@ -1,3 +0,0 @@ -config: - field: - default: "**/spec.yaml" diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/3.yaml b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/3.yaml deleted file mode 100644 index d99479a26589..000000000000 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/3.yaml +++ /dev/null @@ -1,5 +0,0 @@ -fields: - - name: field1 - default: "**/spec.yaml" - - name: field2 - default: "**/spec.yaml" diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/4.yaml b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/4.yaml deleted file mode 100644 index f06733655efa..000000000000 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/4.yaml +++ /dev/null @@ -1,6 +0,0 @@ -application: - settings: - field: - default: "**/spec.yaml" - anotherField: - default: "path/to/another/spec.yaml" diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/catalog.json b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/catalog.json deleted file mode 100644 index ddf460aa5eea..000000000000 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/catalog.json +++ /dev/null @@ -1 +0,0 @@ -{"HashAlgorithm": "SHA256", "CatalogItems": {"2.yaml": "639293FA87F59DD6DA3FA42C77F9F76709505046AB4C9BCECF28852628E37928", "3.yaml": "4EEFC75625C82B0CB8583579D489DA852D71CBF0576CEFDC052827139C5ADB6E", "4.yaml": "66AFAAE719F59CE5A94FB259BA173B872F594C3DF00CBB9BBF2A762DE155A69C", "spec.yaml": "42387F635D6CCE6B39ED17838EC8E0023ED1D071623115BC8CDF219B4EA99A7A"}} \ No newline at end of file diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/catalog.json.sig b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/catalog.json.sig deleted file mode 100644 index ddf460aa5eea..000000000000 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/catalog.json.sig +++ /dev/null @@ -1 +0,0 @@ -{"HashAlgorithm": "SHA256", "CatalogItems": {"2.yaml": "639293FA87F59DD6DA3FA42C77F9F76709505046AB4C9BCECF28852628E37928", "3.yaml": "4EEFC75625C82B0CB8583579D489DA852D71CBF0576CEFDC052827139C5ADB6E", "4.yaml": "66AFAAE719F59CE5A94FB259BA173B872F594C3DF00CBB9BBF2A762DE155A69C", "spec.yaml": "42387F635D6CCE6B39ED17838EC8E0023ED1D071623115BC8CDF219B4EA99A7A"}} \ No newline at end of file diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/spec.yaml b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/spec.yaml deleted file mode 
100644 index 993205063017..000000000000 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSignTest/spec.yaml +++ /dev/null @@ -1,2 +0,0 @@ -field: - default: "**/spec.yaml" From 88b7d7d16b2d9b76cd7f82d064ec30b1966f79f9 Mon Sep 17 00:00:00 2001 From: kshitij-microsoft Date: Fri, 27 Dec 2024 11:25:13 +0530 Subject: [PATCH 12/14] component ops adding prepare for sign --- .../azure/ai/ml/YAMLsigning/command_line.py | 348 ------- .../azure/ai/ml/YAMLsigning/configuration.py | 214 ---- .../azure/ai/ml/YAMLsigning/utils.py | 73 -- .../azure/ai/ml/YAMLsigning/yamlSign.py | 953 ------------------ .../azure/ai/ml/_utils/_asset_utils.py | 18 +- .../ai/ml/operations/_component_operations.py | 36 + sdk/ml/azure-ai-ml/samples/hello.py | 34 + sdk/ml/azure-ai-ml/samples/job.yaml | 20 + .../samples/ml_samples_test_prepForSign.py | 25 + .../unittests/test_workspace_operations.py | 1 - .../test_workspace_operations_base.py | 1 - 11 files changed, 131 insertions(+), 1592 deletions(-) delete mode 100644 sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/command_line.py delete mode 100644 sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/configuration.py delete mode 100644 sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/utils.py delete mode 100644 sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSign.py create mode 100644 sdk/ml/azure-ai-ml/samples/hello.py create mode 100644 sdk/ml/azure-ai-ml/samples/job.yaml create mode 100644 sdk/ml/azure-ai-ml/samples/ml_samples_test_prepForSign.py diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/command_line.py b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/command_line.py deleted file mode 100644 index 8857e313e400..000000000000 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/command_line.py +++ /dev/null @@ -1,348 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from abc import ABC, abstractmethod -import base64 -import logging -from omegaconf import OmegaConf -from pathlib import Path -import subprocess -import sys -import os -from typing import Any, Dict, List, Optional, Tuple, Union -from dataclasses import asdict - -# from shrike import __version__ -from configuration import Configuration, load_configuration -# from utils import TelemetryLogger - -log = logging.getLogger(__name__) - - -class _LogEmphasize: - def __init__(self, line: str): - self.line = line - - def __enter__(self): - log.info(self.line) - - def __exit__(self, exc_type, exc_value, traceback): - log.info(self.line) - - -class Command(ABC): - """ - Commands exposed by this package should subclass this class and implement - the `run_with_config` method. They should be invoked by calling - `Subclass().run()` inside their module's `__main__` logic. - """ - - @abstractmethod - def __init__(self) -> None: - self.config: Configuration = None # type: ignore - self._component_statuses: Dict[str, Dict[str, str]] = {} - self._errors: List[str] = [] - self.nb_cores = 1 - - def attach_workspace(self, workspace_id: str = None) -> None: # type: ignore - """ - Run `az ml folder attach` to the configured workspace ID. Default to the - first configured workspace if none is provided. - """ - # self.config.working_directory = "C:\Projects\\azure-sdk-for-python\sdk\ml\\azure-ai-ml\\azure\\ai\ml\YAMLsigning" - working_direcotry = self.config.working_directory - if workspace_id is None: - if not self.config.workspaces and self.config.registries: - workspace_id = self.config.validation_workspace - if not workspace_id: - self.register_error("No workspaces are configured. 
If you want to publish to registries only, please specify one workspace string in `validation_workspace` for validating components.") - return - else: - try: - workspace_id = self.config.workspaces[0] - except IndexError: - self.register_error( - f"No workspaces are configured. Please include them in your configuration file and ensure the path to your configuration file is correct relative to the working directory {working_direcotry} using `--configuration-file PATH/TO/CONFIGURATION_FILE`." - ) - return - - (subscription_id, resource_group, workspace) = self.parse_workspace_arm_id( - workspace_id - ) - success = self.execute_azure_cli_command( - f"account set --subscription {subscription_id}" - ) - success = success and self.execute_azure_cli_command( - f"ml data create --name dataSource --path {working_direcotry} --type uri_folder -w {workspace} -g {resource_group}" - ) - if not success: - self.register_error(f"Error!! Failed to attach to {workspace_id}!") - - def display_all_statuses(self) -> None: - """ - Display all component statuses in an easily readable format. - """ - pass - - def emphasize(self, line: str = "#" * 80) -> _LogEmphasize: - """ - Use this to initialize a `with` block for emphasizing any logs inside - that block. - """ - return _LogEmphasize(line) - - def ensure_component_cli_installed(self) -> bool: - """ - Check if the component CLI is installed; - install it if not. - # TODO get cli version as config. - """ - - # Check whether the component CLI is installed - component_cli_exists = self.execute_azure_cli_command( - "extension show -n ml", - stderr_is_failure=False, - log_error=False, - ) - - if component_cli_exists: - log.info("component CLI exists. Skipping installation.") - return True - else: - log.info( - f"installing component CLI version {self.config.component_cli_version}." - ) - cli_install_command = f"extension add --name ml" - if self.config.verbose: - cli_install_command += " --verbose" - - is_installed = self.execute_azure_cli_command( - command=cli_install_command, - # installation may show time to install - stderr_is_failure=False, - ) - - if is_installed: - log.info("component CLI is installed.") - else: - log.error("component CLI installation failed.") - - return is_installed - - def execute_azure_cli_command( - self, - command: str, - working_dir: Optional[str] = None, - stderr_is_failure: bool = True, - fail_if_version_exists: bool = False, - log_error: bool = True, - ) -> bool: - """ - Use this method, NOT `execute_command`, for running Azure CLI commands. - The `command` string should contain everything AFTER the `az`. - - This does NOT use the `azure-cli-core` Python package - ( https://stackoverflow.com/a/55960725 ) because it takes a long time - to install, and does not work in Windows. - - This method is necessary for subtle reasons around the way Azure CLI - exposes commands. The "naive approach" doesn't work. 
- """ - log.debug(f"Executing: az {command}") - az_command_bytes = bytes(f"az {command}", "utf-16le") - az_command_b64 = base64.b64encode(az_command_bytes).decode("ascii") - pwsh_command = ["pwsh", "-EncodedCommand", az_command_b64] - success = self.execute_command( - pwsh_command, working_dir, stderr_is_failure, fail_if_version_exists, log_error - ) - return success - - def execute_command( - self, - command: List[str], - working_dir: Optional[str] = None, - stderr_is_failure: bool = True, - fail_if_version_exists: bool = False, - log_error: bool = True, - ) -> bool: - """ - Execute the provided shell command using the configured timeout. Working - directory defaults to the configured one. If `stderr_is_failure` is - set to false, stderr from the command will be converted to "vanilla" - logs and will not affect success; - - Logs are NOT streamed realtime - they are "bundled together" after the - command executes or times out. - - Warning: running `az *` naively via this function will not work, since - the Azure CLI is not, by default, discoverable via `subprocess.run`. - """ - if working_dir is None: - working_dir = self.config.working_directory - - if len(command) > 0 and command[0] == "az": - raise ValueError( - "Do not run Azure CLI commands with this function. Use execute_azure_cli_command instead." - ) - - kwargs = {} - - if stderr_is_failure or fail_if_version_exists: - kwargs["stderr"] = subprocess.PIPE - - log.debug(f"Executing {command} in {working_dir}") - - timeout = self.config.shell_command_timeout_in_seconds - - try: - res = subprocess.run( - args=command, - cwd=working_dir, - stdout=subprocess.PIPE, - timeout=timeout, - **kwargs, - ) - - success = res.returncode == 0 - - if not success: - if log_error: - log.error(f"Command failed with exit code {res.returncode}") - else: - log.info(f"Command failed with exit code {res.returncode}") - - stdout = res.stdout - stderr = res.stderr - except subprocess.TimeoutExpired as e: - log.error(f"Command timed out after {timeout} seconds.") - success = False - stdout = e.stdout - stderr = e.stderr - - if stdout: - for line in stdout.splitlines(): - try: - line = str(line, encoding="utf-8", errors="ignore") # type: ignore - except: - log.debug( - "Failed to convert the following stdout line into String (utf-8)" - ) - log.info(line) - if stderr: - for line in stderr.splitlines(): - try: - line = str(line, encoding="utf-8", errors="ignore") # type: ignore - except: - log.debug( - "Failed to convert the following stdout line into String (utf-8)" - ) - if stderr_is_failure: - log.error(line) - success = False - elif fail_if_version_exists and "Error" in line and "already exists in" in line: # type: ignore - log.error(line) - success = False - else: - log.info(line) - - return success - - def normalize_path(self, path: Union[str, Path], directory=False) -> str: - """ - Normalize the provided path (file or directory) to the following format: - - Absolute (not relative) - - Linux-style (forward slash separating directories) - - If `directory=True`, ending in a forward slash. 
- """ - if isinstance(path, str): - path = Path(path) - - path = str(path.absolute()) - rv = path.replace("\\", "/") - - if directory and not rv[-1] == "/": - rv += "/" - - return rv - - def parse_workspace_arm_id(self, id: str) -> Tuple[str, str, str]: - """ - Parse a workspace ARM ID like - `/subscriptions/48bbc269-ce89-4f6f-9a12-c6f91fcb772d/resourceGroups/aml1p-rg/providers/Microsoft.MachineLearningServices/workspaces/aml1p-ml-wus2` - and return (subscription ID, resource group, workspace name). - """ - split = id.split("/") - subscription = split[2] - resource_group = split[4] - workspace = split[8] - return (subscription, resource_group, workspace) - - def register_component_status( - self, component_name: str, status_name: str, status: str - ) -> None: - """ - Register a status (e.g., build = failed) for a specified component. All - statuses will be displayed in a friendly manner before exiting. - """ - if component_name not in self._component_statuses: - self._component_statuses[component_name] = {} - - status_dict = self._component_statuses[component_name] - status_dict[status_name] = status - - def register_error(self, error: str) -> None: - """ - Register that an error has occured (also, log it). If any errors have - been registered, the `run` method will return with non-zero exit code. - """ - log.error(error) - self._errors.append(error) - - def run(self) -> None: - """ - Call this to load the configuration object, initialize the logging tree, - then invoke the subclass' `run_with_config` method and return the - appropriate exit code. This should be the entrypoint inside a command's - `if __name__ == "__main__"` block. - """ - config = load_configuration() - - log_level = "DEBUG" if config.verbose else "INFO" - logging.basicConfig(level=log_level, format=config.log_format) - - max_nb_cores = max(os.cpu_count() - 1, 1) # type: ignore - if config.number_of_cores_parallel <= 0 or config.number_of_cores_parallel > max_nb_cores: - self.nb_cores = max_nb_cores - else: - self.nb_cores = config.number_of_cores_parallel - - with self.emphasize(): - config_yaml = OmegaConf.to_yaml(config) - log.info("Final configuration being used:\n") - log.info(config_yaml) - - self.config = config - self.run_with_config() - - self.display_all_statuses() - - failed = bool(self._errors) - - if failed: - log.error(f"Encountered {len(self._errors)} errors!") - - sys.exit(bool(self._errors)) - - @abstractmethod - def run_with_config(self) -> None: - """ - Run the subclasses command with the specified configuration object. - Before this method is invoked, there is no guarantee that `self.config` - will be populated; after it is invoked, that is guaranteed. - Implementations of this method should NOT mutate the logging tree in - any way. They should also NOT raise any exceptions; rather they should - call the `register_error` method, which will ensure non-zero exit code. - Implementations can raise specific "status information" (e.g., a - component is not "active") by calling `register_component_status`. - """ - pass diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/configuration.py b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/configuration.py deleted file mode 100644 index 183e3e061161..000000000000 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/configuration.py +++ /dev/null @@ -1,214 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -from dataclasses import asdict, dataclass, field, replace -import logging -import os -from omegaconf.omegaconf import OmegaConf -import sys -from typing import Any, Dict, List -import warnings - - -log = logging.getLogger(__name__) - - -# Freeze single "empty list" so by-reference comparison of default values works. -_EMPTY_LIST = [] - - -@dataclass(frozen=True) -class Configuration: - # TODO: should this be handled via enum? - activation_method: str = field(default="all") - compliant_branch: str = field(default="^refs/heads/main$") - source_branch: str = field(default="") - component_cli_version: str = field(default="0.9.13") - component_specification_glob: str = field(default="**/spec.yaml") - # TODO: consider a way of supporting both this and `*.yaml` as defaults. - configuration_file: str = field(default="aml-build-configuration.yml") - log_format: str = field(default="%(message)s") - # Registration in registries is surprisingly slow. - shell_command_timeout_in_seconds: int = field(default=1000) - number_of_cores_parallel: int = field(default=0) - # TODO: should this be handled via enum? - signing_mode: str = field(default="aml") - verbose: bool = field(default=False) - working_directory: str = field(default_factory=lambda: os.getcwd()) - workspaces: List[str] = field( - default_factory=lambda: _EMPTY_LIST, metadata={"nargs": "*"} - ) - makecat_directory: str = field(default=r"C:\Program Files (x86)\Windows Kits") - makecat_default: str = field(default=r"10\bin\x64\makecat.exe") - # allow_duplicate_versions is on path to deprecation. Please avoid using it - allow_duplicate_versions: bool = field(default=False) - fail_if_version_exists: bool = field(default=False) - use_build_number: bool = field(default=False) - all_component_version: str = field(default="") - disable_telemetry: bool = field(default=False) - suppress_adding_repo_pr_tags: bool = field(default=False) - enable_component_validation: bool = field(default=False) - fail_if_pattern_not_found_in_component_validation: bool = field(default=False) - component_validation: dict = field(default_factory=dict) - dependency_hints: dict = field(default_factory=dict) - registries: List[str] = field( - default_factory=lambda: _EMPTY_LIST, metadata={"nargs": "*"} - ) - detect_changes_in_unzipped_folder: bool = field(default=False) - validation_workspace: str = field(default="") - - -def load_configuration() -> Configuration: - """ - Create configuration object from "implicit" command line arguments and - environment variables. - """ - # Strip away the first argument, which is the name of the file being - # executed. - args = sys.argv[1:] - env = os.environ - rv = load_configuration_from_args_and_env(args, dict(env)) - return rv - - -def load_configuration_from_args(args) -> dict: - """ - Load a "minimal" configuration dictionary from command line arguments. This - strips away any values which are default, so that merging with the default - and file-based configuration objects works properly. - """ - from argparse_dataclass import ArgumentParser - - default_config = Configuration() - parser = ArgumentParser(Configuration) - - cli_config = parser.parse_args(args) - # Strangely, calling `asdict` changes the object reference for the value - # if it is an empty array. - cli_config_vars = asdict(cli_config) - - for key in list(cli_config_vars.keys()): - - # Compare by reference so that you can override with default values like - # the empty list: https://stackoverflow.com/a/14080980. 
- if getattr(cli_config, key) is getattr(default_config, key): - del cli_config_vars[key] - - return cli_config_vars - - -def load_configuration_from_args_and_env( - args: List[str], env: Dict[str, Any] -) -> Configuration: - """ - Load configuration file from provided command line arguments and environment - variables. - - Priority is documented at https://aka.ms/aml/amlbuild , from lowest to - highest: - - default value - - configuration file - - environment variables - - command line arguments (highest priority) - """ - # Create default config - default_config = Configuration() - - # Load config from command line - cli_config = load_configuration_from_args(args) - - # Load config parameters specified in environment variables - env_config = { - key.lower(): value - for key, value in env.items() - if key.lower() in asdict(default_config).keys() - } - print(f"Load the config in the environment variables: {env_config}") - - # Merge cli config and env config - # Priority: cli > env - if env_config: - print( - "Merge the config in the environment variables with the config in the command line." - ) - cli_config = OmegaConf.merge(env_config, cli_config) - - working_directory = ( - cli_config.get("working_directory") or default_config.working_directory # type: ignore - ) - - cli_config_path = cli_config.get("configuration_file") # type: ignore - file_config = None - if cli_config_path is not None: - try: - print("Loading user provided configuration file") - file_config = OmegaConf.load(cli_config_path) - except FileNotFoundError: - print( - f"***ERROR: the configuration file path provided {cli_config_path} does not exist in your working directory {working_directory}, so both preparation and registration will fail." - ) - elif os.path.isfile(default_config.configuration_file): - print( - "Configuration file does not exist. Loading default configuration file aml-build-configuration.yml.", - ) - file_config = OmegaConf.load(default_config.configuration_file) - else: - warnings.warn( - "User provided/default configuration file does not exist. Using default configuration.", - UserWarning, - ) - - if file_config is None: - print("Configuration file is empty. Using default configuration.") - cli_and_file_config = cli_config - else: - print("Overriding default configuration by configuration file.") - cli_and_file_config = OmegaConf.merge(file_config, cli_config) - - if cli_and_file_config.get("workspaces") is None: # type: ignore - log.error( - "Workspace is not configured. Please update in your configuration file." - ) - - if cli_and_file_config.get("allow_duplicate_versions") is not None: # type: ignore - if cli_and_file_config.get("fail_if_version_exists") is None: # type: ignore - cli_and_file_config.update( - { - "fail_if_version_exists": not cli_and_file_config.get( - "allow_duplicate_versions" # type: ignore - ) - } - ) - warnings.warn( - "We recommend against using the parameter allow_duplicate_versions. Please specify fail_if_version_exists instead.", - UserWarning, - ) - else: - raise ValueError( - "Please don't specify both allow_duplicate_versions and fail_if_version_exists. Check out https://aka.ms/aml/amlbuild for more information." 
- ) - print("Please refer to https://aka.ms/aml/amlbuild for more information.") - - config = OmegaConf.merge(default_config, cli_and_file_config) - config = Configuration(**config) # type: ignore - - # Load the environment variable of source branch into config - if "BUILD_SOURCEBRANCH" in env.keys(): - config = replace(config, source_branch=env["BUILD_SOURCEBRANCH"]) - else: - warnings.warn("BUILD_SOURCEBRANCH is not in the environment variable list.") - - # Load the environment variable of build number into config, if user_build_number=True - if config.use_build_number: - if "BUILD_BUILDNUMBER" in env.keys(): - if config.all_component_version: - log.warning( - f"The build number {env['BUILD_BUILDNUMBER']} overwrites the value of all_component_version {config.all_component_version}" - ) - config = replace(config, all_component_version=env["BUILD_BUILDNUMBER"]) - else: - raise ValueError( - "BUILD_BUILDNUMBER is not in the environment variable list." - ) - - return config diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/utils.py b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/utils.py deleted file mode 100644 index 1c1bbcddaddf..000000000000 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/utils.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import json -import os -import hashlib -import logging -# from opencensus.ext.azure.log_exporter import AzureLogHandler - -log = logging.getLogger(__name__) - - -def create_catalog_stub(): - """ - Function that creates a json stub of the form: {'HashAlgorithm': 'SHA256', 'CatalogItems': {}}. - """ - json_stub = {} - json_stub["HashAlgorithm"] = "SHA256" - json_stub["CatalogItems"] = {} - return json_stub - - -def create_SHA_256_hash_of_file(file): - """ - Function that returns the SHA 256 hash of 'file'.\n - Logic taken from https://www.quickprogrammingtips.com/python/how-to-calculate-sha256-hash-of-a-file-in-python.html - """ - sha256_hash = hashlib.sha256() - with open(file, "rb") as f: - # Read and update hash string value in blocks of 4K - for byte_block in iter(lambda: f.read(4096), b""): - sha256_hash.update(byte_block) - # Converting to upper case because that's what is required by the policy - # service. See their code: - # https://dev.azure.com/msasg/Bing_and_IPG/_git/Aether?path=/src/aether/platform/backendV2/BlueBox/PolicyService/Microsoft.MachineLearning.PolicyService/Workers/CatalogValidation.cs - return sha256_hash.hexdigest().upper() - - -def add_file_to_catalog(file_for_catalog, catalog, absolute_path_to_remove): - """ - Function that adds an entry for 'file_for_catalog' to the 'catalog'.\n - Specifically, {: } will be added to the "CatalogItems" dictionary of the 'catalog' json, where is computed with the create_SHA_256_hash_of_file() function, and is obtained by removing 'absolute_path_to_remove' from the full 'file_for_catalog' path - """ - hash_of_file = create_SHA_256_hash_of_file(file_for_catalog) - relative_path = file_for_catalog.split(absolute_path_to_remove)[1] - catalog["CatalogItems"][relative_path] = hash_of_file - return catalog - - -def write_two_catalog_files(catalog, path): - """ - Function that writes 'catalog' into 2 duplicate files: "path/config.json" and "path/config.json.sig". 
- """ - with open(os.path.join(path, "catalog.json"), "w") as jsonFile1: - json.dump(catalog, jsonFile1) - with open(os.path.join(path, "catalog.json.sig"), "w") as jsonFile2: - json.dump(catalog, jsonFile2) - - -def delete_two_catalog_files(path): - """ - Function that deletes the "catalog.json" and "catalog.json.sig" files located at 'path', if they exist - """ - # catalog.json - file_path_json = os.path.join(path, "catalog.json") - if os.path.exists(file_path_json): - log.warning(f"{file_path_json} already exists. Deleting it") - os.remove(file_path_json) - # catalog.json.sig - file_path_json_sig = os.path.join(path, "catalog.json.sig") - if os.path.exists(file_path_json_sig): - log.warning(f"{file_path_json_sig} already exists. Deleting it") - os.remove(file_path_json_sig) diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSign.py b/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSign.py deleted file mode 100644 index 01001f4ff5ad..000000000000 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/YAMLsigning/yamlSign.py +++ /dev/null @@ -1,953 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import logging -import os -import sys -import multiprocessing -import collections -import jsonpath_ng -import chardet -import re -from typing import List, Set, Union -import shutil -from ruamel.yaml import YAML -from git import Repo, InvalidGitRepositoryError, NoSuchPathError - -from command_line import Command -from utils import ( - create_catalog_stub, - add_file_to_catalog, - write_two_catalog_files, - delete_two_catalog_files, -) -from pathlib import Path -import yaml -import urllib.parse -import uuid -from urllib.parse import urlparse -from enum import Enum - -log = logging.getLogger(__name__) - -ALLOWED_CONTAINER_REGISTRIES = ["polymerprod.azurecr.io"] -ALLOWED_PACKAGE_FEEDS = [ - "https://o365exchange.pkgs.visualstudio.com/_packaging/PolymerPythonPackages/pypi/simple/" -] - - -class RuntimeEnvironment(Enum): - AZURE_DEVOPS_BUILD = "Azure DevOps Build" - GITHUB_ACTION = "GitHub Action" - OTHER = "Other" - - -class OperatingSystem(Enum): - WINDOWS = "Windows" - LINUX = "Linux" - OTHER = "Other" - - -class TargetType(Enum): - ADDITIONAL_INCLUDES = "additional_includes" - DEPENDENCY_HINTS = "dependency_hints" - - -class ActionType(Enum): - VALIDATE = "validate" - BUILD = "build" - - -class Prepare(Command): - def __init__(self): - super().__init__() - self._component_statuses = {} - - def folder_path(self, file: str) -> str: - """ - Return the normalized path of the directory containing a file. - """ - return self.normalize_path(Path(file).parent, directory=True) - - def all_files_in_snapshot(self, manifest: str) -> List[str]: - """ - Return a list of all normalized files in the snapshot. The input - (`manifest`) is assumed to be some file, whether AML-style component - spec or Aether-style auto-approval manifest, in the "root" of the - snapshot. - """ - folder_path = self.folder_path(manifest) - log.info("Absolute path for current component is: " + folder_path) - - # Generate a list of all files in this components folder (including subdirectories) - rv = [] - - # Make sure we pick up Linux-style "hidden" files like .amlignore and - # hidden "directories", as well as hidden files in hidden directories. 
- # https://stackoverflow.com/a/65205404 - # https://stackoverflow.com/a/41447012 - for root, _, file_paths in os.walk(folder_path): - for file in file_paths: - file_path = os.path.join(root, file) - normalized_path = self.normalize_path(file_path) - rv.append(normalized_path) - - return rv - - def find_component_specification_files_using_all(self, dir=None) -> List[str]: - """ - Find all component specification files in the configured working - directory matching the configured glob. Return the absolute paths - of these files in the format of a list of string. - """ - if dir is None: - dir = self.config.working_directory - all_spec_yaml_files_absolute_paths = [ - str(p.absolute()) - for p in Path(dir).glob(self.config.component_specification_glob) - ] - - return all_spec_yaml_files_absolute_paths - - def find_component_specification_files_using_smart(self) -> List[str]: - """ - This function returns the list of components (as a list of absolute paths) potentially affected by the latest commit. - """ - log.info( - "Determining which components are potentially affected by the current change." - ) - [repo, current_branch, compliant_branch] = self.identify_repo_and_branches() - modified_files = self.get_modified_files(repo, current_branch, compliant_branch) - active_components = self.infer_active_components_from_modified_files( - modified_files - ) - return active_components - - def identify_repo_and_branches(self): - """ - This function returns the current repository, along with the name of the current and compliant branches [repo, current_branch, compliant_branch]. Throws if no repo can be found. - """ - # identify the repository - curr_path = Path(self.config.working_directory).resolve() - try: - repo = Repo(curr_path, search_parent_directories=True) - log.info("Found a valid repository in " + repo.git_dir) - except (InvalidGitRepositoryError, NoSuchPathError): - message = ( - str(curr_path) - + " or its parents do not contain a valid repo path or cannot be accessed." - ) - raise Exception(message) - try: - current_branch = str( - repo.head.ref - ) # when running from our build the repo head is detached so this will throw an exception - except TypeError: - current_branch = os.environ.get("BUILD_SOURCEBRANCH") or os.environ.get( - "GITHUB_REF" - ) - log.info("The current branch is: '" + str(current_branch) + "'.") - # Identify the compliant branch - if not (self.config.compliant_branch.startswith("^refs/heads/")) or not ( - self.config.compliant_branch.endswith("$") - ): - raise Exception( - "The name of the compliant branch found in the config file should start with '^refs/heads/' and end with '$'. Currently it is: '" - + self.config.compliant_branch - + "'." - ) - else: - compliant_branch = self.config.compliant_branch.replace("^refs/heads/", "")[ - 0:-1 - ] - log.info("The compliant branch is: '" + compliant_branch + "'.") - return [repo, current_branch, compliant_branch] - - def get_modified_files(self, repo, current_branch, compliant_branch) -> Set[str]: - """ - This function returns the paths of files that have been modified. 
3 scenarios are supported.\n - 1/ 'Build - before Merge'; when the 'prepare' command is run as part of a build, but before the actual merge (in this case, the name of the current branch starts with 'refs/pull/' - this is the default Azure DevOps behavior).\n - 2/ 'Build - after Merge'; when the 'prepare' command is run as part of a build, after the actual merge (in this case, the name of the current branch is the same as the name of the compliant branch).\n - 3/ 'Manual'; when the prepare command is run manually (typically before publishing the PR). - """ - # identify the 2 relevant commits based on the use case - current_commit, previous_commit = self.get_relevant_commits( - repo, current_branch, compliant_branch - ) - - # take the actual diff - diff = self.get_diff_between_commits(current_commit, previous_commit) - - # process the diff object to obtain a list of paths - res = self.extract_paths_from_diff( - diff, - repo_working_dir=repo.working_dir, - repo_working_tree_dir=repo.working_tree_dir, - repo_git_dir=repo.git_dir, - ) - return res - - def extract_paths_from_diff( - self, diff, repo_working_dir, repo_working_tree_dir, repo_git_dir - ): - """Function that extracts the paths of the modified files from the diff between 2 commits.""" - res = set() - # let's build a set with the paths of modified files found in the diff object - log.debug("Working directory: " + self.config.working_directory) - log.debug("repo.working_dir: " + repo_working_dir) - log.debug("repo.working_tree_dir: " + repo_working_tree_dir) - log.debug("repo.git_dir: " + repo_git_dir) - for d in diff: - log.debug("d.a_path: " + d.a_path) - log.debug("Path(d.a_path).absolute(): " + str(Path(d.a_path).absolute())) - log.debug("Path(d.a_path).resolve(): " + str(Path(d.a_path).resolve())) - r_a = str(Path(repo_git_dir).parent / Path(d.a_path)) - res.add(r_a) - r_b = str(Path(repo_git_dir).parent / Path(d.b_path)) - res.add(r_b) - log.info("The list of modified files is:") - log.info(res) - return res - - def get_relevant_commits(self, repo, current_branch, compliant_branch): - """ - This function returns the commits required to compute the list of files that have been modified. 3 scenarios are supported.\n - 1/ 'Build - before Merge'; when the 'prepare' command is run as part of a build, but before the actual merge (in this case, the name of the current branch starts with 'refs/pull/' - this is the default Azure DevOps behavior).\n - 2/ 'Build - after Merge'; when the 'prepare' command is run as part of a build, after the actual merge (in this case, the name of the current branch is the same as the name of the compliant branch).\n - 3/ 'Manual'; when the prepare command is run manually (typically before publishing the PR). - """ - # Grab the diff differently depending on the scenario - if current_branch.replace("refs/heads/", "") == compliant_branch: - # 'Build - after Merge' case: we will take the diff between the - # tree of the latest commit to the compliant branch, and the tree - # of the previous commit to the compliant branch corresponding to a - # PR (we assume the commit summary starts with 'Merged PR') - log.info( - "We are in the 'Build - after Merge' case (the current branch is the compliant branch)." 
- ) - current_commit = self.get_compliant_commit_corresponding_to_pull_request( - repo, compliant_branch - ) - self.log_commit_info(current_commit, "Current commit to compliant branch") - previous_commit = ( - self.get_previous_compliant_commit_corresponding_to_pull_request( - current_commit, - consider_current_commit=False, - ) - ) - self.log_commit_info( - previous_commit, "Previous PR commit to compliant branch" - ) - elif current_branch.startswith("refs/pull/"): - # 'Build - before Merge': we will take the diff between the tree of - # the current commit, and the tree of the previous commit to the - # compliant branch corresponding to a PR (we assume the commit - # summary starts with 'Merged PR') - log.info( - "We are in the 'Build - before Merge' case (the current branch is not the compliant branch and its name starts with 'refs/pull/')." - ) - current_commit = repo.commit() - self.log_commit_info(current_commit, "Current commit to current branch") - latest_commit_to_compliant_branch = repo.remotes.origin.refs[ - compliant_branch - ].commit - previous_commit = ( - self.get_previous_compliant_commit_corresponding_to_pull_request( - latest_commit_to_compliant_branch, - consider_current_commit=True, - ) - ) - self.log_commit_info( - previous_commit, "Previous PR commit to compliant branch" - ) - else: - # 'Manual' Case: we will take the diff between the current branch - # and the compliant branch (we're assuming the compliant branch is - # locally up to date here) - log.info( - "We are in the 'Manual' case (the current branch is NOT the compliant branch and its name does not start with 'refs/pull/')." - ) - try: - current_commit = repo.heads[ - current_branch - ].commit # this won't work when running the Manual case from the DevOps portal, but the below will - except (IndexError, AttributeError): - current_commit = repo.commit() - self.log_commit_info(current_commit, "Current commit to current branch") - try: - previous_commit = repo.heads[ - compliant_branch - ].commit # this won't work when running the Manual case from the DevOps portal, but the below will - except (IndexError, AttributeError): - latest_commit_to_compliant_branch = repo.remotes.origin.refs[ - compliant_branch - ].commit - previous_commit = ( - self.get_previous_compliant_commit_corresponding_to_pull_request( - latest_commit_to_compliant_branch, - consider_current_commit=True, - ) - ) - self.log_commit_info(previous_commit, "Previous commit to compliant branch") - - return current_commit, previous_commit - - def get_diff_between_commits(self, current_commit, previous_commit): - """Function that gets the diff between 2 commits.""" - # just use the 'diff' function from gitpython - return current_commit.tree.diff(previous_commit.tree) - - def log_commit_info(self, commit, title) -> None: - log.info(title + ":") - log.info("Summary: " + commit.summary) - log.info("Author: " + str(commit.author)) - log.info("Authored Date: " + str(commit.authored_date)) - - def get_previous_compliant_commit_corresponding_to_pull_request( - self, latest_commit, consider_current_commit - ): - """ - This function will return the previous commit in the repo corresponding to a PR (i.e. that starts with "Merged PR"). - If `consider_current_commit` is set to True, the `latest_commit` will be considered. If set to false, only previous commits will be considered. 
- """ - target_string = "Merged PR" - if consider_current_commit and latest_commit.summary.startswith(target_string): - return latest_commit - previous_commit = latest_commit - for c in previous_commit.iter_parents(): - if c.summary.startswith(target_string): - previous_commit = c - break - return previous_commit - - def get_compliant_commit_corresponding_to_pull_request( - self, repo, compliant_branch - ): - """ - This function will return the most recent commit in the repo that truly corresponds to the triggered build. It is identified thanks to the 'Build.SourceVersionMessage' DevOps environment variable (see https://docs.microsoft.com/en-us/azure/devops/pipelines/build/variables?view=azure-devops&tabs=yaml) that contains the true commit message. This is used to address the race condition occurring when a commit sneaks in before the "prepare" step was run on the previous commit. - """ - # this is the true commit message corresponding to the PR that triggered the build - true_commit_message = self.get_true_commit_message() - # this is the most recent commit - current_commit = repo.remotes.origin.refs[compliant_branch].commit - # if the most recent commit corresponds to the true commit message, then return it - if true_commit_message.startswith(current_commit.summary): - return current_commit - # otherwise, let's iterate through the parents until we find it - candidate_commit = current_commit - for c in candidate_commit.iter_parents(): - if true_commit_message.startswith(c.summary): - return c - # if the corresponding commit cannot be found, return the most recent one and log a warning - log.warning( - "Could not find in the git repo the commit that triggered this PR. Returning the most recent but beware, the 'smart' mode likely will not work properly." - ) - return current_commit - - def get_true_commit_message(self): - return str(os.environ.get("BUILD_SOURCEVERSIONMESSAGE") or "NA") - - def infer_active_components_from_modified_files(self, modified_files) -> List[str]: - """ - This function returns the list of components (as a list of directories paths) potentially affected by changes in the `modified_files`. - """ - rv = [] - # We will go over components one by one - all_components_in_repo = self.find_component_specification_files_using_all() - log.info("List of all components in repo:") - log.info(all_components_in_repo) - for component in all_components_in_repo: - if self.component_is_active(component, modified_files): - rv.append(component) - # No need to dedup rv since we are only considering components once - log.info("The active components are:") - log.info(rv) - return rv - - def component_is_active(self, component, modified_files) -> bool: - """ - This function returns True if any of the 'modified_files' potentially affects the 'component' (i.e. if it is directly in one of the 'component' subfolders, or if it is covered by the additional_includes files). If the component has been deleted, returns False. - """ - log.info("Assessing whether component '" + component + "' is active...") - # Let's first take care of the case where the component has been deleted - if not (Path(component).exists()): - return False - # Let's grab the contents of the additional_includes file if it exists. - component_additional_includes_contents = self.get_target_file_contents( - component, - TargetType.ADDITIONAL_INCLUDES, - ) - # Let's grab the contents of the additional_includes file if it exists. 
- component_dependency_hints_contents = self.get_target_file_contents( - component, - TargetType.DEPENDENCY_HINTS, - ) - # loop over all modified files; if current file is in subfolder of component or covered by - # additional includes or dependency hints, return True - for modified_file in modified_files: - if ( - self.is_in_subfolder(modified_file, component) - or self.is_in_target_list( - modified_file, - TargetType.ADDITIONAL_INCLUDES, - component_additional_includes_contents, - ) - or self.is_in_target_list( - modified_file, - TargetType.DEPENDENCY_HINTS, - component_dependency_hints_contents, - ) - ): - return True - return False - - def get_target_file_contents( - self, component, target_type - ) -> Union[List[str], None]: - component_target_file_contents = None - # for depependency hints, we look globally first - if target_type == TargetType.DEPENDENCY_HINTS: - component_target_file_contents = self.get_global_dependency_hints_contents( - component - ) - # First we figure out the expected path of the additional_includes file - component_target_file_path = self.get_theoretical_target_file_path( - component, target_type - ) - # And we load it if it exists. - if Path(component_target_file_path).exists(): - rbfile = open(component_target_file_path, "rb").read() - if chardet.detect(rbfile).get("encoding").lower() not in ["utf-8", "ascii"]: #type: ignore - raise ValueError( - f"Encoding of a file: '{{spec_file_name}}.{target_type.value}' not supported, use UTF-8." - ) - - with open(component_target_file_path, "r") as component_target_file: - if (target_type == TargetType.DEPENDENCY_HINTS) and ( - component_target_file_contents is not None - ): - component_target_file_contents += component_target_file.readlines() - else: - component_target_file_contents = component_target_file.readlines() - else: - # If additional_includes doesn't exist we log a message explaining the expected name format - if target_type == TargetType.ADDITIONAL_INCLUDES: - log.info( - "No additional_includes file could be found for the component '" - + component - + "'. If you tried to create such a file, remember it should live next to the component spec file and should be named '{spec_file_name}.additional_includes'. " - + "For example, if the component spec file is named 'component_spec.yaml', the additional_includes file should be named 'component_spec.additional_includes'. In this specific case, the expected additional_includes file name is: '" - + component_target_file_path - + "'." - ) - # Then we check whether there is an improperly named additional_includes file in the component folder, and if so we throw - if self.check_for_wrongly_named_additional_includes(component): - raise ValueError( - "An additional_includes file which does not respect the naming pattern was found. Please rename this file. Remember it should live next to the component spec file and should be named '{spec_file_name}.additional_includes'." - + "For example, if the component spec file is named 'component_spec.yaml', the additional_includes file should be named 'component_spec.additional_includes'." 
- ) - # Before returning, we make the paths in the additional_includes file absolute - if component_target_file_contents: - for line_number in range(0, len(component_target_file_contents)): - component_target_file_contents[line_number] = str( - Path( - os.path.join( - Path(component).parent, - component_target_file_contents[line_number].rstrip("\n"), - ) - ).resolve() - ) - return component_target_file_contents - - def get_global_dependency_hints_contents(self, component) -> Union[List[str], None]: - if len(self.config.dependency_hints) > 0: - global_dependency_hints_contents = [] - for ( - component_folder_paths, - dependency_hints_paths, - ) in self.config.dependency_hints.items(): - component_folder_absolute_paths = [ - str(p.absolute().resolve()) - for p in Path(self.config.working_directory).glob( - component_folder_paths - ) - ] - if ( - str(Path(component).parent.resolve()) - in component_folder_absolute_paths - ): - if not isinstance(dependency_hints_paths, list): - dependency_hints_paths = [dependency_hints_paths] - for dependency_hints_path in list(dependency_hints_paths): - global_dependency_hints_contents += [ - str(p.absolute().resolve()) - for p in Path(self.config.working_directory).glob( - dependency_hints_path - ) - ] - if len(global_dependency_hints_contents) > 0: - return global_dependency_hints_contents - else: - return None - else: - return None - - def get_theoretical_target_file_path(self, component, target_type) -> str: - """ - Returns the expected path of the 'target_type' file - associated with the 'component'. - """ - # First, we figure out the name of the target file, based on the component name - component_name_without_extension = Path(component).name.split(".yaml")[0] - # Then, we construct the expected path of the target file (see - # https://componentsdk.azurewebsites.net/components/component-spec-topics/additional-includes.html - # for the 'additional_includes' case) - component_target_file_path = os.path.join( - Path(component).parent, - component_name_without_extension + "." + str(target_type.value), - ) - return component_target_file_path - - def check_for_wrongly_named_additional_includes(self, component) -> bool: - """ - Returns True if the component folder contains an improperly named additional_includes file - i.e. a lonely additional_includes file without a corresponding spec.yaml - """ - # grab all potential additional_includes files in the component folder - potentially_wrongly_named_files = Path(component).parent.glob( - "*.additional_includes*" - ) - for potentially_wrongly_named_file in potentially_wrongly_named_files: - # determine the expected location of the spec - theoretical_component_path = ( - str(potentially_wrongly_named_file)[:-20] + ".yaml" - ) - # check if spec exists - if os.path.isfile(theoretical_component_path): - continue - else: - # if not, we have a problem - self.register_error( - f"Component folder {component} contains a lonely additional includes file at {potentially_wrongly_named_file}, missing component spec {theoretical_component_path}" - ) - return True - return False - - def is_in_subfolder(self, modified_file, component) -> bool: - """ - This function returns True if 'modified_file' is in a subfolder of 'component' ('component' can be either the path to a file, or a directory). If the component has been deleted, returns False. 
- """ - # Let's first take care of the case where the component has been deleted - if not (Path(component).exists()): - log.debug("'" + component + "' does not exist, returning False.") - return False - # Case where the component has not been deleted - for parent in Path(modified_file).parents: - if parent.exists(): - if Path(component).is_dir(): - if parent.samefile(Path(component)): - log.info( - "'" - + modified_file - + " is in a subfolder of '" - + component - + "'." - ) - return True - else: - if parent.samefile(Path(component).parent): - log.info( - "'" - + modified_file - + " is in a subfolder of '" - + component - + "'." - ) - return True - log.debug( - "'" + modified_file + " is NOT in a subfolder of '" + component + "'." - ) - return False - - def is_in_target_list( - self, modified_file, target_type, target_list_contents - ) -> bool: - """ - This function returns True if 'modified_file' is covered by the file - 'target_list_contents'. The 'target_type' can be either - additional_includes or dependency_hints - """ - # first tackle the trivial case of no target file - if target_list_contents is None: - log.debug( - f"The component's target file ({target_type}) is empty, returning False." - ) - return False - # now the regular scenario - for line in target_list_contents: - # when the line from the target list is a file, we directly check its path against that of modified_file - if Path(line).is_file(): - if str(Path(modified_file).resolve()) == str( - Path(line).resolve() - ): # can't use 'samefile' here because modified_file is not guaranteed to exist, we resolve the path and do basic == test - log.info( - "'" - + modified_file - + f" is directly listed in the {target_type} file." - ) - return True - # slightly more complicated case: when the line - # in the target list is a directory, we can just - # call the is_in_subfolder function - # but first, we take care of the zipped folders - if target_type == TargetType.ADDITIONAL_INCLUDES: - if self.config.detect_changes_in_unzipped_folder: - split_line = os.path.splitext(line) - if split_line[1] == ".zip": - line = split_line[0] - if Path(line).is_dir(): - if self.is_in_subfolder(modified_file, line): - log.info( - "'" - + modified_file - + f" is in one of the directories listed in the {target_type} file." - ) - return True - log.debug( - "'" - + modified_file - + f" is NOT referenced by the {target_type} file (neither directly nor indirectly)." - ) - return False - - - def find_component_specification_files(self) -> List[str]: - """ - Find the list of "active" component specification files using the - configured method ("all" or "smart"). 
- """ - activation_method = self.config.activation_method - - if activation_method == "all": - rv = self.find_component_specification_files_using_all() - elif activation_method == "smart": - rv = self.find_component_specification_files_using_smart() - else: - raise ValueError( - f"Invalid activation_method provided: '{activation_method}'" - ) - - return rv - - def _create_dependencies_files(self, component_files) -> str: - id = str(uuid.uuid4()) - path_to_dependencies_files = os.path.join( - self.config.working_directory, "component_dependencies_" + id - ) - log.info( - f"Writing Python package dependencies to path {path_to_dependencies_files}" - ) - os.makedirs(path_to_dependencies_files) - for component in component_files: - self._create_dependencies_files_for_single_component( - component, path_to_dependencies_files - ) - return id - - def _create_dependencies_files_for_single_component( - self, component, path_to_dependencies_files - ) -> None: - component_repo = Path(component).parent - with open(component, "r") as spec_file: - spec = YAML(typ="safe").load(spec_file) - ( - pip_dependencies, - conda_dependencies, - _, - ) = self._extract_dependencies_and_channels(component) - - if pip_dependencies or conda_dependencies: - component_name = spec.get("name") - cur_path = os.path.join(path_to_dependencies_files, component_name) - try: - os.makedirs(cur_path) - except FileExistsError: - suffix = ( - component_name - + "_" - + os.path.splitext(os.path.basename(component))[0] - ) - cur_path = os.path.join(path_to_dependencies_files, suffix) - os.makedirs(cur_path) - if pip_dependencies: - log.info( - f"Found pip dependencies for component {component_name} in {component_repo}. Writing to requirements.txt." - ) - with open(os.path.join(cur_path, "requirements.txt"), "w") as file: - for req in pip_dependencies: - file.write(req) - if not req.endswith("\n"): - file.write("\n") - if conda_dependencies: - log.info( - f"Found conda dependencies for component {component_name} in {component_repo}. Writing to environment.yml." 
- ) - with open(os.path.join(cur_path, "environment.yml"), "w") as file: - yaml.dump(conda_dependencies, file) - - def _extract_dependencies_and_channels(self, component) -> List[list]: - component_repo = Path(component).parent - build_folder = os.path.join(component_repo, ".build") - if os.path.exists(build_folder): - component_repo = build_folder - with open(component, "r") as spec_file: - spec = YAML(typ="safe").load(spec_file) - pip_dependencies = [] - conda_dependencies = [] - conda_channels = [] - if "environment" in spec: - spec_environment = spec.get("environment") - if "conda" in spec_environment: - spec_conda = spec_environment["conda"] - if "conda_dependencies" in spec_conda: - conda_dependencies = spec_conda["conda_dependencies"] - pip_dependencies += self._extract_python_package_dependencies( - conda_dependencies - ) - if "channels" in conda_dependencies: - conda_channels += conda_dependencies["channels"] - if "conda_dependencies_file" in spec_conda: - conda_dependencies_file = spec_conda["conda_dependencies_file"] - try: - with open( - os.path.join( - component_repo, spec_conda["conda_dependencies_file"] - ) - ) as file: - conda_dependencies = YAML(typ="safe").load(file) - pip_dependencies += self._extract_python_package_dependencies( - conda_dependencies - ) - if "channels" in conda_dependencies: - conda_channels += conda_dependencies["channels"] - except FileNotFoundError: - self.register_error( - f"The required conda_dependencies_file {conda_dependencies_file} does not exist in {component_repo}." - ) - if "pip_requirements_file" in spec_conda: - pip_requirements_file = spec_conda["pip_requirements_file"] - try: - with open( - os.path.join( - component_repo, spec_conda["pip_requirements_file"] - ) - ) as file: - pip_dependencies += file.readlines() - except FileNotFoundError: - self.register_error( - f"The required pip_requirements_file {pip_requirements_file} does not exist in {component_repo}." - ) - return [pip_dependencies, conda_dependencies, conda_channels] - - def _extract_python_package_dependencies(self, conda_dependencies) -> List[str]: - pip_dependencies = [] - if "dependencies" in conda_dependencies: - dependencies = conda_dependencies.get("dependencies") - for dependencies_item in dependencies: - if isinstance(dependencies_item, dict) and "pip" in dependencies_item: - pip_dependencies = dependencies_item["pip"] - return pip_dependencies - - def create_catalog_files(self, files: List[str]) -> None: - """ - Create AML-friendly catalog.json and catalog.json.sig files, using - SHA-256 hash. - """ - - # For each component spec file in the input list, we'll do the following... 
- for f in files: - log.info(f"Processing file {f}") - component_folder_path = self.folder_path(f) - - # remove catalog files if already present - log.info("Deleting old catalog files if present") - delete_two_catalog_files(component_folder_path) - - files_for_catalog = self.all_files_in_snapshot(f) - log.info("The following list of files will be added to the catalog.") - log.info(files_for_catalog) - - # Prepare the catlog stub: {'HashAlgorithm': 'SHA256', 'CatalogItems': {}} - catalog = create_catalog_stub() - - # Add an entry to the catalog for each file - for file_for_catalog in files_for_catalog: - catalog = add_file_to_catalog( - file_for_catalog, catalog, component_folder_path - ) - - # order the CatalogItems dictionary - catalog["CatalogItems"] = collections.OrderedDict( - sorted(catalog["CatalogItems"].items()) - ) - - # Write the 2 catalog files - log.info(catalog) - write_two_catalog_files(catalog, component_folder_path) - log.info("Finished creating catalog files.") - - def run_with_config(self): - log.info("Running component preparation logic.") - - - component_files = self.find_component_specification_files() - - self.ensure_component_cli_installed() - self.attach_workspace() - - self.create_catalog_files(component_files) - - self._create_dependencies_files(component_files) - - def validate_each_components(self, component) -> None: - """ - For one of component specification file, run `az ml component validate`, - run compliance and customized validation if enabled, - and register the status (+ register error if validation failed). - """ - validate_component_success = self.execute_azure_cli_command( - f"ml component validate --file {component}" - ) - compliance_validation_success = True - customized_validation_success = True - if self.config.enable_component_validation: - log.info(f"Running compliance validation on {component}") - compliance_validation_success = self.compliance_validation(component) - if len(self.config.component_validation) > 0: - log.info(f"Running customized validation on {component}") - for jsonpath, regex in self.config.component_validation.items(): - customized_validation_success = ( - customized_validation_success - if self.customized_validation( - jsonpath, - regex, - component, - self.config.fail_if_pattern_not_found_in_component_validation, - ) - else False - ) - - if ( - validate_component_success - and compliance_validation_success - and customized_validation_success - ): - # If the az ml validation succeeds, we continue to check whether - # the "code" snapshot parameter is specified in the spec file - # https://componentsdk.z22.web.core.windows.net/components/component-spec-topics/code-snapshot.html - with open(component, "r") as spec_file: - spec = YAML(typ="safe").load(spec_file) - spec_code = spec.get("code") - if spec_code and spec_code not in [".", "./"]: - self.register_component_status(component, "validate", "failed") - self.register_error( - "Code snapshot parameter is not supported. Please use .additional_includes for your component." - ) - else: - log.info(f"Component {component} is valid.") - self.register_component_status(component, "validate", "succeeded") - else: - self.register_component_status(component, "validate", "failed") - self.register_error(f"Error when validating component {component}.") - - def compliance_validation(self, component: str) -> bool: - """ - This function checks whether a given component spec YAML file - meets all the requirements for running in the compliant AML. 
- Specifically, it checks (1) whether the image URL is compliant; - (2)whether the pip index-url is compliant; (3) whether - "default" is only Conda channel - """ - with open(component, "r") as spec_file: - spec = YAML(typ="safe").load(spec_file) - - # Check whether the docker image URL is compliant - image_url = jsonpath_ng.parse("$.environment.docker.image").find(spec) - if len(image_url) > 0: - if ( - urlparse(image_url[0].value).path.split("/")[0] - not in ALLOWED_CONTAINER_REGISTRIES - ): - log.error( - f"The container base image in {component} is not allowed for compliant run." - ) - return False - - # check whether the package feed is compliant - ( - package_dependencies, - conda_dependencies, - conda_channels, - ) = self._extract_dependencies_and_channels(component=component) - if len(package_dependencies) > 0: - for dependency in package_dependencies: - if re.match("^--index-url", dependency) or re.match( - "^--extra-index-url", dependency - ): - if dependency.split(" ")[1] not in ALLOWED_PACKAGE_FEEDS: - log.error( - f"The package feed in {component} is not allowed for compliant run." - ) - return False - if ( - f"--index-url {ALLOWED_PACKAGE_FEEDS[0]}" not in package_dependencies - and f"--extra-index-url {ALLOWED_PACKAGE_FEEDS[0]}" - not in package_dependencies - ): - log.error( - f"The Polymer package feed is not found in environment of {component}" - ) - return False - - # Check whether "default" is only Conda channel - if len(conda_channels) > 1 or ( - len(conda_channels) == 1 and conda_channels[0] != "." - ): - log.error("Only the default conda channel is allowed for compliant run.") - return False - - return True - - @staticmethod - def customized_validation( - jsonpath: str, - regex: str, - component: str, - fail_if_pattern_not_found_in_component_validation: bool, - ) -> bool: - """ - This function leverages regular expressionm atching and - JSONPath expression to enforce user-provided "strict" - validation on Azure ML components - """ - with open(component, "r") as spec_file: - spec = YAML(typ="safe").load(spec_file) - - parsed_patterns = jsonpath_ng.parse(jsonpath).find(spec) - validation_success = True - if not parsed_patterns: - log.warning(f"The pattern {jsonpath} is not found in {component}") - if fail_if_pattern_not_found_in_component_validation: - validation_success = False - if len(parsed_patterns) > 0: - for parsed_pattern in parsed_patterns: - if not re.match(regex, parsed_pattern.value): - log.error( - f"The parsed pattern {parsed_pattern} in {component} doesn't match the regular expression {regex}" - ) - validation_success = False - return validation_success - - -if __name__ == "__main__": - Prepare().run() diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/_utils/_asset_utils.py b/sdk/ml/azure-ai-ml/azure/ai/ml/_utils/_asset_utils.py index 0bcfc95adbd6..9e2e5023b416 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/_utils/_asset_utils.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/_utils/_asset_utils.py @@ -240,12 +240,26 @@ def _parse_name_version( return ":".join(name), version -def _get_file_hash(filename: Union[str, os.PathLike], _hash: hash_type) -> hash_type: +def _get_file_hash(filename: Union[str, os.PathLike], _hash: hash_type, chunk_size: int = 1024) -> hash_type: with open(str(filename), "rb") as f: - for chunk in iter(lambda: f.read(CHUNK_SIZE), b""): + for chunk in iter(lambda: f.read(chunk_size), b""): _hash.update(chunk) return _hash +def delete_two_catalog_files(path): + """ + Function that deletes the "catalog.json" and "catalog.json.sig" files located at 
'path', if they exist + """ + # catalog.json + file_path_json = os.path.join(path, "catalog.json") + if os.path.exists(file_path_json): + module_logger.warning(f"{file_path_json} already exists. Deleting it") + os.remove(file_path_json) + # catalog.json.sig + file_path_json_sig = os.path.join(path, "catalog.json.sig") + if os.path.exists(file_path_json_sig): + module_logger.warning(f"{file_path_json_sig} already exists. Deleting it") + os.remove(file_path_json_sig) def _get_dir_hash(directory: Union[str, os.PathLike], _hash: hash_type, ignore_file: IgnoreFile) -> hash_type: dir_contents = Path(directory).iterdir() diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_component_operations.py b/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_component_operations.py index bcd4fa4fa99d..c9133f6859ae 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_component_operations.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_component_operations.py @@ -4,12 +4,16 @@ # pylint: disable=protected-access,too-many-lines import time +import collections +import json +import os import types from functools import partial from inspect import Parameter, signature from os import PathLike from pathlib import Path from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, cast +import hashlib from azure.ai.ml._restclient.v2021_10_01_dataplanepreview import ( AzureMachineLearningWorkspaces as ServiceClient102021Dataplane, @@ -26,9 +30,14 @@ from azure.ai.ml._utils._asset_utils import ( _archive_or_restore, _create_or_update_autoincrement, + _get_file_hash, _get_latest, _get_next_version_from_container, _resolve_label_to_asset, + get_ignore_file, + get_upload_files_from_folder, + IgnoreFile, + delete_two_catalog_files ) from azure.ai.ml._utils._azureml_polling import AzureMLPolling from azure.ai.ml._utils._endpoint_utils import polling_wait @@ -652,6 +661,33 @@ def create_or_update( ) return component + @experimental + def prepare_for_sign(self, component: Component): + ignore_file = IgnoreFile() + + if isinstance(component, ComponentCodeMixin): + with component._build_code() as code: + delete_two_catalog_files(code.path) + ignore_file = get_ignore_file(code.path) if code._ignore_file is None else ignore_file + file_list = get_upload_files_from_folder(code.path, ignore_file=ignore_file) + json_stub = {} + json_stub["HashAlgorithm"] = "SHA256" + json_stub["CatalogItems"] = {} + + for file_path, file_name in sorted(file_list, key=lambda x: str(x[1]).lower()): + file_hash = _get_file_hash(file_path, hashlib.sha256(), 4096).hexdigest().upper() + json_stub["CatalogItems"][file_name] = file_hash + + json_stub["CatalogItems"] = collections.OrderedDict( + sorted(json_stub["CatalogItems"].items()) + ) + + print(type(json_stub), type (json_stub["CatalogItems"])) + with open(os.path.join(code.path, "catalog.json"), "w") as jsonFile1: + json.dump(json_stub, jsonFile1) + with open(os.path.join(code.path, "catalog.json.sig"), "w") as jsonFile2: + json.dump(json_stub, jsonFile2) + @monitor_with_telemetry_mixin(ops_logger, "Component.Archive", ActivityType.PUBLICAPI) def archive( self, diff --git a/sdk/ml/azure-ai-ml/samples/hello.py b/sdk/ml/azure-ai-ml/samples/hello.py new file mode 100644 index 000000000000..547ff11c3f95 --- /dev/null +++ b/sdk/ml/azure-ai-ml/samples/hello.py @@ -0,0 +1,34 @@ +import argparse +import os +from datetime import datetime + +parser = argparse.ArgumentParser() +parser.add_argument("--componentB_input", type=str) +parser.add_argument("--componentB_output", type=str) + 
+print("Hello Python World...\nI'm componentB :-)") + +args = parser.parse_args() + +print("componentB_input path: %s" % args.componentB_input) +print("componentB_output path: %s" % args.componentB_output) + +print("files in input path: ") +arr = os.listdir(args.componentB_input) +print(arr) + +for filename in arr: + print("reading file: %s ..." % filename) + with open(os.path.join(args.componentB_input, filename), "r") as handle: + print(handle.read()) + +cur_time_str = datetime.now().strftime("%b-%d-%Y-%H-%M-%S") + +print( + "Writing file: %s" + % os.path.join(args.componentB_output, "file-" + cur_time_str + ".txt") +) +with open( + os.path.join(args.componentB_output, "file-" + cur_time_str + ".txt"), "wt" +) as text_file: + print(f"Logging date time: {cur_time_str}", file=text_file) \ No newline at end of file diff --git a/sdk/ml/azure-ai-ml/samples/job.yaml b/sdk/ml/azure-ai-ml/samples/job.yaml new file mode 100644 index 000000000000..b8fb5ea3c8a7 --- /dev/null +++ b/sdk/ml/azure-ai-ml/samples/job.yaml @@ -0,0 +1,20 @@ +$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json +code: ../src +command: >- + python main.py train_check --config ${{inputs.data}}/model.yaml --train ${{inputs.data}}/train.csv --sanity-check ${{inputs.data}}/sanity_check.csv --min-accuracy 0.99 --min-precision 0.95 --min-recall 0.95 --model-dir ${{outputs.model}} +inputs: + data: + path: . + mode: download +outputs: + model: + type: uri_folder +environment: + image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04 + conda_file: ../src/environment.yml +environment_variables: + AZUREML_COMMON_RUNTIME_USE_SBOM_CAPABILITY: "true" +compute: azureml:gpu-t4-spot-vpn +display_name: Compete +experiment_name: sensei-compete +description: Sensei Compete Model \ No newline at end of file diff --git a/sdk/ml/azure-ai-ml/samples/ml_samples_test_prepForSign.py b/sdk/ml/azure-ai-ml/samples/ml_samples_test_prepForSign.py new file mode 100644 index 000000000000..c8472aa1233a --- /dev/null +++ b/sdk/ml/azure-ai-ml/samples/ml_samples_test_prepForSign.py @@ -0,0 +1,25 @@ +from azure.identity import DefaultAzureCredential, AzureCliCredential, InteractiveBrowserCredential +from azure.ai.ml import MLClient, load_job +from azure.ai.ml.entities import Data, ManagedOnlineEndpoint, Job, CommandComponent +from azure.ai.ml.sweep import SweepJob, GridSamplingAlgorithm, Choice, Objective +from azure.ai.ml import command +from azure.ai.ml.constants import AssetTypes +from azure.ai.ml.entities._load_functions import load_component + +subscription_id = "2d385bf4-0756-4a76-aa95-28bf9ed3b625" +resource_group = "sdkv2-20240925-rg" +workspace_name = "sdkv2-20240925-ws" + + +credential = DefaultAzureCredential() + +print(credential) +ml_client = MLClient( + credential=credential, + subscription_id=subscription_id, + resource_group_name=resource_group, + workspace_name=workspace_name, +) + +component = load_component("C:\\Projects\\azure-sdk-for-python\\sdk\\ml\\azure-ai-ml\\azure\\ai\\ml\\YAMLsigning\\sum1.yaml") +ml_client.components.prepare_for_sign(component) \ No newline at end of file diff --git a/sdk/ml/azure-ai-ml/tests/workspace/unittests/test_workspace_operations.py b/sdk/ml/azure-ai-ml/tests/workspace/unittests/test_workspace_operations.py index 4c812cc31c66..9470ebf82cc9 100644 --- a/sdk/ml/azure-ai-ml/tests/workspace/unittests/test_workspace_operations.py +++ b/sdk/ml/azure-ai-ml/tests/workspace/unittests/test_workspace_operations.py @@ -21,7 +21,6 @@ ) from azure.ai.ml.operations import WorkspaceOperations from 
azure.core.polling import LROPoller -import urllib.parse @pytest.fixture diff --git a/sdk/ml/azure-ai-ml/tests/workspace/unittests/test_workspace_operations_base.py b/sdk/ml/azure-ai-ml/tests/workspace/unittests/test_workspace_operations_base.py index 0744655d9b5c..07254188f348 100644 --- a/sdk/ml/azure-ai-ml/tests/workspace/unittests/test_workspace_operations_base.py +++ b/sdk/ml/azure-ai-ml/tests/workspace/unittests/test_workspace_operations_base.py @@ -32,7 +32,6 @@ ) from azure.ai.ml.operations._workspace_operations_base import WorkspaceOperationsBase from azure.core.polling import LROPoller -import urllib.parse @pytest.fixture From d69b9e59fad3ea07c96ccccf592e08663941b2e7 Mon Sep 17 00:00:00 2001 From: kshitij-microsoft Date: Tue, 31 Dec 2024 21:55:06 +0530 Subject: [PATCH 13/14] resolving comments --- .../azure-ai-ml/azure/ai/ml/_utils/_asset_utils.py | 12 ++++++++++-- .../ai/ml/operations/_component_operations.py | 14 ++++---------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/_utils/_asset_utils.py b/sdk/ml/azure-ai-ml/azure/ai/ml/_utils/_asset_utils.py index 9e2e5023b416..367caa54610e 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/_utils/_asset_utils.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/_utils/_asset_utils.py @@ -7,6 +7,7 @@ import hashlib import logging import os +import json import uuid import warnings from concurrent.futures import ThreadPoolExecutor, as_completed @@ -240,9 +241,9 @@ def _parse_name_version( return ":".join(name), version -def _get_file_hash(filename: Union[str, os.PathLike], _hash: hash_type, chunk_size: int = 1024) -> hash_type: +def _get_file_hash(filename: Union[str, os.PathLike], _hash: hash_type) -> hash_type: with open(str(filename), "rb") as f: - for chunk in iter(lambda: f.read(chunk_size), b""): + for chunk in iter(lambda: f.read(CHUNK_SIZE), b""): _hash.update(chunk) return _hash @@ -261,6 +262,13 @@ def delete_two_catalog_files(path): module_logger.warning(f"{file_path_json_sig} already exists. 
Deleting it") os.remove(file_path_json_sig) +def create_catalog_files(path, json_stub): + with open(os.path.join(path, "catalog.json"), "w") as jsonFile1: + json.dump(json_stub, jsonFile1) + with open(os.path.join(path, "catalog.json.sig"), "w") as jsonFile2: + json.dump(json_stub, jsonFile2) + + def _get_dir_hash(directory: Union[str, os.PathLike], _hash: hash_type, ignore_file: IgnoreFile) -> hash_type: dir_contents = Path(directory).iterdir() sorted_contents = sorted(dir_contents, key=lambda path: str(path).lower()) diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_component_operations.py b/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_component_operations.py index c9133f6859ae..959a82179674 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_component_operations.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_component_operations.py @@ -5,8 +5,6 @@ # pylint: disable=protected-access,too-many-lines import time import collections -import json -import os import types from functools import partial from inspect import Parameter, signature @@ -37,7 +35,8 @@ get_ignore_file, get_upload_files_from_folder, IgnoreFile, - delete_two_catalog_files + delete_two_catalog_files, + create_catalog_files ) from azure.ai.ml._utils._azureml_polling import AzureMLPolling from azure.ai.ml._utils._endpoint_utils import polling_wait @@ -675,19 +674,14 @@ def prepare_for_sign(self, component: Component): json_stub["CatalogItems"] = {} for file_path, file_name in sorted(file_list, key=lambda x: str(x[1]).lower()): - file_hash = _get_file_hash(file_path, hashlib.sha256(), 4096).hexdigest().upper() + file_hash = _get_file_hash(file_path, hashlib.sha256()).hexdigest().upper() json_stub["CatalogItems"][file_name] = file_hash json_stub["CatalogItems"] = collections.OrderedDict( sorted(json_stub["CatalogItems"].items()) ) + create_catalog_files(code.path, json_stub) - print(type(json_stub), type (json_stub["CatalogItems"])) - with open(os.path.join(code.path, "catalog.json"), "w") as jsonFile1: - json.dump(json_stub, jsonFile1) - with open(os.path.join(code.path, "catalog.json.sig"), "w") as jsonFile2: - json.dump(json_stub, jsonFile2) - @monitor_with_telemetry_mixin(ops_logger, "Component.Archive", ActivityType.PUBLICAPI) def archive( self, From 997a7eda5529af9297cadda4ec00ac430adc6671 Mon Sep 17 00:00:00 2001 From: kshitij-microsoft Date: Thu, 2 Jan 2025 15:35:33 +0530 Subject: [PATCH 14/14] fixing pylint and black --- .../azure/ai/ml/_utils/_asset_utils.py | 44 +++++++++-- .../ai/ml/operations/_component_operations.py | 73 ++++++++++++++----- sdk/ml/azure-ai-ml/samples/hello.py | 11 +-- .../samples/ml_samples_test_prepForSign.py | 12 ++- 4 files changed, 101 insertions(+), 39 deletions(-) diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/_utils/_asset_utils.py b/sdk/ml/azure-ai-ml/azure/ai/ml/_utils/_asset_utils.py index 367caa54610e..22478c0ae52b 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/_utils/_asset_utils.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/_utils/_asset_utils.py @@ -16,7 +16,17 @@ from os import PathLike from pathlib import Path from platform import system -from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union, cast +from typing import ( + TYPE_CHECKING, + Any, + Dict, + Iterable, + List, + Optional, + Tuple, + Union, + cast, +) from colorama import Fore from tqdm import TqdmWarning, tqdm @@ -57,7 +67,11 @@ from azure.ai.ml._restclient.v2023_04_01.models import PendingUploadRequestDto from azure.ai.ml._utils._pathspec import GitWildMatchPattern, normalize_file from 
azure.ai.ml._utils.utils import convert_windows_path_to_unix, retry, snake_to_camel -from azure.ai.ml.constants._common import MAX_AUTOINCREMENT_ATTEMPTS, DefaultOpenEncoding, OrderString +from azure.ai.ml.constants._common import ( + MAX_AUTOINCREMENT_ATTEMPTS, + DefaultOpenEncoding, + OrderString, +) from azure.ai.ml.entities._assets.asset import Asset from azure.ai.ml.exceptions import ( AssetPathException, @@ -247,25 +261,31 @@ def _get_file_hash(filename: Union[str, os.PathLike], _hash: hash_type) -> hash_ _hash.update(chunk) return _hash + def delete_two_catalog_files(path): """ Function that deletes the "catalog.json" and "catalog.json.sig" files located at 'path', if they exist + + :param path: Path to the folder for signing + :type path: Union[Path, str] + :return: None """ # catalog.json file_path_json = os.path.join(path, "catalog.json") if os.path.exists(file_path_json): - module_logger.warning(f"{file_path_json} already exists. Deleting it") + module_logger.warning("%s already exists. Deleting it", file_path_json) os.remove(file_path_json) # catalog.json.sig file_path_json_sig = os.path.join(path, "catalog.json.sig") if os.path.exists(file_path_json_sig): - module_logger.warning(f"{file_path_json_sig} already exists. Deleting it") + module_logger.warning("%s already exists. Deleting it", file_path_json_sig) os.remove(file_path_json_sig) + def create_catalog_files(path, json_stub): - with open(os.path.join(path, "catalog.json"), "w") as jsonFile1: + with open(os.path.join(path, "catalog.json"), "w", encoding=DefaultOpenEncoding.WRITE) as jsonFile1: json.dump(json_stub, jsonFile1) - with open(os.path.join(path, "catalog.json.sig"), "w") as jsonFile2: + with open(os.path.join(path, "catalog.json.sig"), "w", encoding=DefaultOpenEncoding.WRITE) as jsonFile2: json.dump(json_stub, jsonFile2) @@ -371,7 +391,10 @@ def get_content_hash(path: Union[str, os.PathLike], ignore_file: IgnoreFile = Ig def get_upload_files_from_folder( - path: Union[str, os.PathLike], *, prefix: str = "", ignore_file: IgnoreFile = IgnoreFile() + path: Union[str, os.PathLike], + *, + prefix: str = "", + ignore_file: IgnoreFile = IgnoreFile(), ) -> List[str]: path = Path(path) upload_paths = [] @@ -454,7 +477,12 @@ def traverse_directory( # pylint: disable=unused-argument result = [] for origin_file_path in origin_file_paths: relative_path = origin_file_path.relative_to(root) - result.append((_resolve_path(origin_file_path).as_posix(), Path(prefix).joinpath(relative_path).as_posix())) + result.append( + ( + _resolve_path(origin_file_path).as_posix(), + Path(prefix).joinpath(relative_path).as_posix(), + ) + ) return result diff --git a/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_component_operations.py b/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_component_operations.py index 959a82179674..79530a092fc1 100644 --- a/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_component_operations.py +++ b/sdk/ml/azure-ai-ml/azure/ai/ml/operations/_component_operations.py @@ -16,15 +16,24 @@ from azure.ai.ml._restclient.v2021_10_01_dataplanepreview import ( AzureMachineLearningWorkspaces as ServiceClient102021Dataplane, ) -from azure.ai.ml._restclient.v2024_01_01_preview import AzureMachineLearningWorkspaces as ServiceClient012024 -from azure.ai.ml._restclient.v2024_01_01_preview.models import ComponentVersion, ListViewType +from azure.ai.ml._restclient.v2024_01_01_preview import ( + AzureMachineLearningWorkspaces as ServiceClient012024, +) +from azure.ai.ml._restclient.v2024_01_01_preview.models import ( + ComponentVersion, + 
ListViewType, +) from azure.ai.ml._scope_dependent_operations import ( OperationConfig, OperationsContainer, OperationScope, _ScopeDependentOperations, ) -from azure.ai.ml._telemetry import ActivityType, monitor_with_activity, monitor_with_telemetry_mixin +from azure.ai.ml._telemetry import ( + ActivityType, + monitor_with_activity, + monitor_with_telemetry_mixin, +) from azure.ai.ml._utils._asset_utils import ( _archive_or_restore, _create_or_update_autoincrement, @@ -36,7 +45,7 @@ get_upload_files_from_folder, IgnoreFile, delete_two_catalog_files, - create_catalog_files + create_catalog_files, ) from azure.ai.ml._utils._azureml_polling import AzureMLPolling from azure.ai.ml._utils._endpoint_utils import polling_wait @@ -50,7 +59,12 @@ LROConfigurations, ) from azure.ai.ml.entities import Component, ValidationResult -from azure.ai.ml.exceptions import ComponentException, ErrorCategory, ErrorTarget, ValidationException +from azure.ai.ml.exceptions import ( + ComponentException, + ErrorCategory, + ErrorTarget, + ValidationException, +) from azure.core.exceptions import HttpResponseError, ResourceNotFoundError from .._utils._cache_utils import CachedNodeResolver @@ -290,7 +304,8 @@ def _localize_code(self, component: Component, base_dir: Path) -> None: target_code_value = "./code" self._code_operations.download( - **extract_name_and_version(code), download_path=base_dir.joinpath(target_code_value) + **extract_name_and_version(code), + download_path=base_dir.joinpath(target_code_value), ) setattr(component, component._get_code_field_name(), target_code_value) @@ -319,7 +334,13 @@ def _localize_environment(self, component: Component, base_dir: Path) -> None: @experimental @monitor_with_telemetry_mixin(ops_logger, "Component.Download", ActivityType.PUBLICAPI) - def download(self, name: str, download_path: Union[PathLike, str] = ".", *, version: Optional[str] = None) -> None: + def download( + self, + name: str, + download_path: Union[PathLike, str] = ".", + *, + version: Optional[str] = None, + ) -> None: """Download the specified component and its dependencies to local. Local component can be used to create the component in another workspace or for offline development. 
@@ -499,7 +520,11 @@ def _reset_version_if_no_change(self, component: Component, current_name: str, c return current_version, rest_component_resource def _create_or_update_component_version( - self, component: Component, name: str, version: Optional[str], rest_component_resource: Any + self, + component: Component, + name: str, + version: Optional[str], + rest_component_resource: Any, ) -> Any: try: if self._registry_name: @@ -663,7 +688,7 @@ def create_or_update( @experimental def prepare_for_sign(self, component: Component): ignore_file = IgnoreFile() - + if isinstance(component, ComponentCodeMixin): with component._build_code() as code: delete_two_catalog_files(code.path) @@ -671,17 +696,17 @@ def prepare_for_sign(self, component: Component): file_list = get_upload_files_from_folder(code.path, ignore_file=ignore_file) json_stub = {} json_stub["HashAlgorithm"] = "SHA256" - json_stub["CatalogItems"] = {} - + json_stub["CatalogItems"] = {} # type: ignore + for file_path, file_name in sorted(file_list, key=lambda x: str(x[1]).lower()): file_hash = _get_file_hash(file_path, hashlib.sha256()).hexdigest().upper() - json_stub["CatalogItems"][file_name] = file_hash - - json_stub["CatalogItems"] = collections.OrderedDict( - sorted(json_stub["CatalogItems"].items()) + json_stub["CatalogItems"][file_name] = file_hash # type: ignore + + json_stub["CatalogItems"] = collections.OrderedDict( # type: ignore + sorted(json_stub["CatalogItems"].items()) # type: ignore ) create_catalog_files(code.path, json_stub) - + @monitor_with_telemetry_mixin(ops_logger, "Component.Archive", ActivityType.PUBLICAPI) def archive( self, @@ -890,7 +915,9 @@ def _resolve_binding_on_supported_fields_for_node(cls, node: BaseNode) -> None: :param node: The node :type node: BaseNode """ - from azure.ai.ml.entities._job.pipeline._attr_dict import try_get_non_arbitrary_attr + from azure.ai.ml.entities._job.pipeline._attr_dict import ( + try_get_non_arbitrary_attr, + ) from azure.ai.ml.entities._job.pipeline._io import PipelineInput # compute binding to pipeline input is supported on node. @@ -998,7 +1025,9 @@ def _try_resolve_compute_for_node(cls, node: BaseNode, _: str, resolver: _AssetR @classmethod def _divide_nodes_to_resolve_into_layers( - cls, component: PipelineComponent, extra_operations: List[Callable[[BaseNode, str], Any]] + cls, + component: PipelineComponent, + extra_operations: List[Callable[[BaseNode, str], Any]], ) -> List: """Traverse the pipeline component and divide nodes to resolve into layers. Note that all leaf nodes will be put in the last layer. 
@@ -1059,7 +1088,8 @@ def _divide_nodes_to_resolve_into_layers( def _get_workspace_key(self) -> str: try: workspace_rest = self._workspace_operations._operation.get( - resource_group_name=self._resource_group_name, workspace_name=self._workspace_name + resource_group_name=self._resource_group_name, + workspace_name=self._workspace_name, ) return str(workspace_rest.workspace_id) except HttpResponseError: @@ -1129,7 +1159,10 @@ def _resolve_dependencies_for_pipeline_component_jobs( extra_operations=[ # no need to do this as we now keep the original component name for anonymous components # self._set_default_display_name_for_anonymous_component_in_node, - partial(self._try_resolve_node_level_task_for_parallel_node, resolver=resolver), + partial( + self._try_resolve_node_level_task_for_parallel_node, + resolver=resolver, + ), partial(self._try_resolve_environment_for_component, resolver=resolver), partial(self._try_resolve_compute_for_node, resolver=resolver), # should we resolve code here after we do extra operations concurrently? diff --git a/sdk/ml/azure-ai-ml/samples/hello.py b/sdk/ml/azure-ai-ml/samples/hello.py index 547ff11c3f95..85011c4fe0bc 100644 --- a/sdk/ml/azure-ai-ml/samples/hello.py +++ b/sdk/ml/azure-ai-ml/samples/hello.py @@ -24,11 +24,6 @@ cur_time_str = datetime.now().strftime("%b-%d-%Y-%H-%M-%S") -print( - "Writing file: %s" - % os.path.join(args.componentB_output, "file-" + cur_time_str + ".txt") -) -with open( - os.path.join(args.componentB_output, "file-" + cur_time_str + ".txt"), "wt" -) as text_file: - print(f"Logging date time: {cur_time_str}", file=text_file) \ No newline at end of file +print("Writing file: %s" % os.path.join(args.componentB_output, "file-" + cur_time_str + ".txt")) +with open(os.path.join(args.componentB_output, "file-" + cur_time_str + ".txt"), "wt") as text_file: + print(f"Logging date time: {cur_time_str}", file=text_file) diff --git a/sdk/ml/azure-ai-ml/samples/ml_samples_test_prepForSign.py b/sdk/ml/azure-ai-ml/samples/ml_samples_test_prepForSign.py index c8472aa1233a..3aef29d15cf6 100644 --- a/sdk/ml/azure-ai-ml/samples/ml_samples_test_prepForSign.py +++ b/sdk/ml/azure-ai-ml/samples/ml_samples_test_prepForSign.py @@ -1,4 +1,8 @@ -from azure.identity import DefaultAzureCredential, AzureCliCredential, InteractiveBrowserCredential +from azure.identity import ( + DefaultAzureCredential, + AzureCliCredential, + InteractiveBrowserCredential, +) from azure.ai.ml import MLClient, load_job from azure.ai.ml.entities import Data, ManagedOnlineEndpoint, Job, CommandComponent from azure.ai.ml.sweep import SweepJob, GridSamplingAlgorithm, Choice, Objective @@ -21,5 +25,7 @@ workspace_name=workspace_name, ) -component = load_component("C:\\Projects\\azure-sdk-for-python\\sdk\\ml\\azure-ai-ml\\azure\\ai\\ml\\YAMLsigning\\sum1.yaml") -ml_client.components.prepare_for_sign(component) \ No newline at end of file +component = load_component( + "C:\\Projects\\azure-sdk-for-python\\sdk\\ml\\azure-ai-ml\\azure\\ai\\ml\\YAMLsigning\\sum1.yaml" +) +ml_client.components.prepare_for_sign(component)