diff --git a/.github/workflows/pipeline-e2e-test.yaml b/.github/workflows/pipeline-e2e-test.yaml index 93de8401..8fbf55da 100644 --- a/.github/workflows/pipeline-e2e-test.yaml +++ b/.github/workflows/pipeline-e2e-test.yaml @@ -190,6 +190,24 @@ jobs: source-file-path: "mlops/bicep/modules/fl_pairs/open_aks_with_confcomp_storage_pair.bicep" target-file-path: "mlops/arm/open_aks_with_confcomp_storage_pair.json" + - name: Test open aks storage pair ARM template + uses: ./.github/actions/bicep-to-arm-template-diff + with: + source-file-path: "mlops/bicep/modules/fl_pairs/open_aks_storage_pair.bicep" + target-file-path: "mlops/arm/open_aks_storage_pair.json" + + - name: Test vnet aks confcomp storage pair ARM template + uses: ./.github/actions/bicep-to-arm-template-diff + with: + source-file-path: "mlops/bicep/modules/fl_pairs/vnet_aks_with_confcomp_storage_pair.bicep" + target-file-path: "mlops/arm/vnet_aks_with_confcomp_storage_pair.json" + + - name: Test vnet aks storage pair ARM template + uses: ./.github/actions/bicep-to-arm-template-diff + with: + source-file-path: "mlops/bicep/modules/fl_pairs/vnet_aks_storage_pair.bicep" + target-file-path: "mlops/arm/vnet_aks_storage_pair.json" + - name: Test jumpbox ARM template uses: ./.github/actions/bicep-to-arm-template-diff with: diff --git a/mlops/arm/open_aks_storage_pair.json b/mlops/arm/open_aks_storage_pair.json new file mode 100644 index 00000000..057f1fc9 --- /dev/null +++ b/mlops/arm/open_aks_storage_pair.json @@ -0,0 +1,983 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "metadata": { + "_generator": { + "name": "bicep", + "version": "0.14.85.62628", + "templateHash": "16393347625341881573" + } + }, + "parameters": { + "machineLearningName": { + "type": "string", + "metadata": { + "description": "Name of AzureML workspace to attach compute+storage to." 
+ } + }, + "machineLearningRegion": { + "type": "string", + "defaultValue": "[resourceGroup().location]", + "metadata": { + "description": "The region of the machine learning workspace" + } + }, + "pairRegion": { + "type": "string", + "defaultValue": "[resourceGroup().location]", + "metadata": { + "description": "Specifies the location of the pair resources." + } + }, + "pairBaseName": { + "type": "string", + "metadata": { + "description": "Base name used for creating all pair resources." + } + }, + "storageAccountName": { + "type": "string", + "defaultValue": "[replace(format('st{0}', parameters('pairBaseName')), '-', '')]", + "metadata": { + "description": "Name of the storage account resource to create for the pair" + } + }, + "datastoreName": { + "type": "string", + "defaultValue": "[replace(format('datastore_{0}', parameters('pairBaseName')), '-', '_')]", + "metadata": { + "description": "Name of the datastore for attaching the storage to the AzureML workspace." + } + }, + "aksClusterName": { + "type": "string", + "defaultValue": "[format('aks-{0}', parameters('pairBaseName'))]", + "metadata": { + "description": "The name of the Managed Cluster resource." + } + }, + "computeSKU": { + "type": "string", + "defaultValue": "Standard_DS3_v2", + "metadata": { + "description": "VM size for the compute cluster." 
+ } + }, + "computeIsGPU": { + "type": "bool", + "defaultValue": false, + "metadata": { + "description": "Boolean to indicate if the compute cluster should be a GPU cluster" + } + }, + "computeNodes": { + "type": "int", + "defaultValue": 4, + "maxValue": 50, + "minValue": 1, + "metadata": { + "description": "VM nodes for the compute cluster" + } + }, + "uaiName": { + "type": "string", + "defaultValue": "[format('uai-{0}', parameters('aksClusterName'))]", + "metadata": { + "description": "Name of the UAI for the pair compute cluster" + } + }, + "applyDefaultPermissions": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "Allow compute cluster to access storage account with R/W permissions (using UAI)" + } + }, + "tags": { + "type": "object", + "defaultValue": {}, + "metadata": { + "description": "Tags to curate the resources in Azure." + } + } + }, + "variables": { + "aksClusterNameClean": "[substring(parameters('aksClusterName'), 0, min(length(parameters('aksClusterName')), 16))]" + }, + "resources": [ + { + "type": "Microsoft.Resources/deployments", + "apiVersion": "2020-10-01", + "name": "[format('{0}-open-storage', parameters('pairBaseName'))]", + "properties": { + "expressionEvaluationOptions": { + "scope": "inner" + }, + "mode": "Incremental", + "parameters": { + "machineLearningName": { + "value": "[parameters('machineLearningName')]" + }, + "machineLearningRegion": { + "value": "[parameters('machineLearningRegion')]" + }, + "storageName": { + "value": "[parameters('storageAccountName')]" + }, + "storageRegion": { + "value": "[parameters('pairRegion')]" + }, + "datastoreName": { + "value": "[parameters('datastoreName')]" + }, + "publicNetworkAccess": { + "value": "Enabled" + }, + "tags": { + "value": "[parameters('tags')]" + } + }, + "template": { + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "metadata": { + "_generator": { + "name": "bicep", + 
"version": "0.14.85.62628", + "templateHash": "8073065165220131475" + } + }, + "parameters": { + "machineLearningName": { + "type": "string", + "metadata": { + "description": "Name of AzureML workspace to attach compute+storage to." + } + }, + "machineLearningRegion": { + "type": "string", + "defaultValue": "[resourceGroup().location]", + "metadata": { + "description": "The region of the machine learning workspace" + } + }, + "storageName": { + "type": "string", + "metadata": { + "description": "Name of the storage account" + } + }, + "storageRegion": { + "type": "string", + "metadata": { + "description": "Azure region of the storage to create" + } + }, + "storageSKU": { + "type": "string", + "defaultValue": "Standard_LRS", + "metadata": { + "description": "Storage SKU" + }, + "allowedValues": [ + "Standard_LRS", + "Standard_ZRS", + "Standard_GRS", + "Standard_GZRS", + "Standard_RAGRS", + "Standard_RAGZRS", + "Premium_LRS", + "Premium_ZRS" + ] + }, + "containerName": { + "type": "string", + "defaultValue": "private", + "metadata": { + "description": "Name of the storage container resource to create for the pair" + } + }, + "datastoreName": { + "type": "string", + "defaultValue": "[replace(format('datastore_{0}', parameters('storageName')), '-', '_')]", + "metadata": { + "description": "Name of the datastore for attaching the storage to the AzureML workspace." + } + }, + "subnetIds": { + "type": "array", + "defaultValue": [], + "metadata": { + "description": "Resource ID of the subnets allowed into this storage" + } + }, + "publicNetworkAccess": { + "type": "string", + "defaultValue": "Disabled", + "metadata": { + "description": "Allow or disallow public network access to Storage Account." 
+ }, + "allowedValues": [ + "Enabled", + "vNetOnly", + "Disabled" + ] + }, + "tags": { + "type": "object", + "defaultValue": {}, + "metadata": { + "description": "Tags to add to the resources" + } + } + }, + "variables": { + "storageNameCleaned": "[replace(parameters('storageName'), '-', '')]", + "storageAccountCleanName": "[substring(variables('storageNameCleaned'), 0, min(length(variables('storageNameCleaned')), 24))]", + "storageAllowedSubnetIds": "[if(equals(parameters('publicNetworkAccess'), 'Enabled'), createArray(), parameters('subnetIds'))]", + "storagedefaultAction": "[if(equals(parameters('publicNetworkAccess'), 'Enabled'), 'Allow', 'Deny')]", + "storagepublicNetworkAccess": "[if(equals(parameters('publicNetworkAccess'), 'Disabled'), 'Disabled', 'Enabled')]" + }, + "resources": [ + { + "type": "Microsoft.Storage/storageAccounts", + "apiVersion": "2022-05-01", + "name": "[variables('storageAccountCleanName')]", + "location": "[parameters('storageRegion')]", + "tags": "[parameters('tags')]", + "sku": { + "name": "[parameters('storageSKU')]" + }, + "kind": "StorageV2", + "properties": { + "accessTier": "Hot", + "allowBlobPublicAccess": false, + "allowCrossTenantReplication": false, + "allowedCopyScope": "PrivateLink", + "allowSharedKeyAccess": true, + "networkAcls": { + "copy": [ + { + "name": "virtualNetworkRules", + "count": "[length(variables('storageAllowedSubnetIds'))]", + "input": { + "id": "[variables('storageAllowedSubnetIds')[copyIndex('virtualNetworkRules')]]", + "action": "Allow" + } + } + ], + "bypass": "AzureServices", + "defaultAction": "[variables('storagedefaultAction')]", + "resourceAccessRules": [] + }, + "publicNetworkAccess": "[variables('storagepublicNetworkAccess')]", + "routingPreference": { + "routingChoice": "MicrosoftRouting", + "publishMicrosoftEndpoints": true + }, + "encryption": { + "keySource": "Microsoft.Storage", + "requireInfrastructureEncryption": false, + "services": { + "blob": { + "enabled": true, + "keyType": "Account" 
+ }, + "file": { + "enabled": true, + "keyType": "Account" + }, + "queue": { + "enabled": true, + "keyType": "Service" + }, + "table": { + "enabled": true, + "keyType": "Service" + } + } + }, + "isHnsEnabled": false, + "isNfsV3Enabled": false, + "isLocalUserEnabled": false, + "isSftpEnabled": false, + "keyPolicy": { + "keyExpirationPeriodInDays": 7 + }, + "largeFileSharesState": "Disabled", + "minimumTlsVersion": "TLS1_2", + "supportsHttpsTrafficOnly": true + } + }, + { + "type": "Microsoft.Storage/storageAccounts/blobServices/containers", + "apiVersion": "2022-05-01", + "name": "[format('{0}/default/{1}', variables('storageAccountCleanName'), parameters('containerName'))]", + "properties": { + "metadata": {}, + "publicAccess": "None" + }, + "dependsOn": [ + "[resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountCleanName'))]" + ] + }, + { + "type": "Microsoft.MachineLearningServices/workspaces/datastores", + "apiVersion": "2022-06-01-preview", + "name": "[format('{0}/{1}', parameters('machineLearningName'), parameters('datastoreName'))]", + "properties": { + "tags": "[parameters('tags')]", + "credentials": { + "credentialsType": "None" + }, + "description": "[format('Private storage in region {0}', parameters('storageRegion'))]", + "properties": {}, + "datastoreType": "AzureBlob", + "accountName": "[variables('storageAccountCleanName')]", + "containerName": "[parameters('containerName')]", + "resourceGroup": "[resourceGroup().name]", + "subscriptionId": "[subscription().subscriptionId]" + }, + "dependsOn": [ + "[resourceId('Microsoft.Storage/storageAccounts/blobServices/containers', split(format('{0}/default/{1}', variables('storageAccountCleanName'), parameters('containerName')), '/')[0], split(format('{0}/default/{1}', variables('storageAccountCleanName'), parameters('containerName')), '/')[1], split(format('{0}/default/{1}', variables('storageAccountCleanName'), parameters('containerName')), '/')[2])]", + 
"[resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountCleanName'))]" + ] + } + ], + "outputs": { + "storageId": { + "type": "string", + "value": "[resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountCleanName'))]" + }, + "storageName": { + "type": "string", + "value": "[variables('storageAccountCleanName')]" + }, + "containerName": { + "type": "string", + "value": "[format('{0}/default/{1}', variables('storageAccountCleanName'), parameters('containerName'))]" + }, + "datastoreName": { + "type": "string", + "value": "[format('{0}/{1}', parameters('machineLearningName'), parameters('datastoreName'))]" + } + } + } + } + }, + { + "type": "Microsoft.Resources/deployments", + "apiVersion": "2020-10-01", + "name": "[format('{0}-open-aks-confcomp', parameters('pairBaseName'))]", + "properties": { + "expressionEvaluationOptions": { + "scope": "inner" + }, + "mode": "Incremental", + "parameters": { + "machineLearningName": { + "value": "[parameters('machineLearningName')]" + }, + "machineLearningRegion": { + "value": "[parameters('machineLearningRegion')]" + }, + "aksClusterName": { + "value": "[variables('aksClusterNameClean')]" + }, + "amlComputeName": { + "value": "[variables('aksClusterNameClean')]" + }, + "computeRegion": { + "value": "[parameters('pairRegion')]" + }, + "agentVMSize": { + "value": "[parameters('computeSKU')]" + }, + "computeIsGPU": { + "value": "[parameters('computeIsGPU')]" + }, + "agentCount": { + "value": "[parameters('computeNodes')]" + }, + "computeUaiName": { + "value": "[parameters('uaiName')]" + }, + "tags": { + "value": "[parameters('tags')]" + } + }, + "template": { + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "metadata": { + "_generator": { + "name": "bicep", + "version": "0.14.85.62628", + "templateHash": "7058483199152929415" + } + }, + "parameters": { + "machineLearningName": { + "type": "string", + "metadata": { + 
"description": "Name of AzureML workspace to attach compute+storage to." + } + }, + "machineLearningRegion": { + "type": "string", + "defaultValue": "[resourceGroup().location]", + "metadata": { + "description": "The region of the machine learning workspace" + } + }, + "aksClusterName": { + "type": "string", + "metadata": { + "description": "The name of the Managed Cluster resource." + } + }, + "amlComputeName": { + "type": "string", + "defaultValue": "[parameters('aksClusterName')]", + "metadata": { + "description": "How to name this compute in Azure ML" + } + }, + "computeRegion": { + "type": "string", + "metadata": { + "description": "Specifies the location of the compute resources." + } + }, + "dnsPrefix": { + "type": "string", + "defaultValue": "[replace(format('dnxprefix-{0}', parameters('aksClusterName')), '-', '')]", + "maxLength": 54, + "metadata": { + "description": "Optional DNS prefix to use with hosted Kubernetes API server FQDN." + } + }, + "agentCount": { + "type": "int", + "defaultValue": 4, + "maxValue": 50, + "minValue": 1, + "metadata": { + "description": "The number of nodes for the cluster pool." + } + }, + "agentVMSize": { + "type": "string", + "defaultValue": "Standard_DS3_v2", + "metadata": { + "description": "The size of the Virtual Machine." + } + }, + "computeIsGPU": { + "type": "bool", + "defaultValue": false, + "metadata": { + "description": "Boolean to indicate if the compute cluster should be a GPU cluster" + } + }, + "osDiskSizeGB": { + "type": "int", + "defaultValue": 0, + "maxValue": 1023, + "minValue": 0, + "metadata": { + "description": "Disk size (in GB) to provision for each of the agent pool nodes. This value ranges from 0 to 1023. Specifying 0 will apply the default disk size for that agentVMSize." + } + }, + "computeUaiName": { + "type": "string", + "defaultValue": "[format('uai-{0}', parameters('aksClusterName'))]", + "metadata": { + "description": "Name of the UAI for the compute cluster." 
+ } + }, + "tags": { + "type": "object", + "defaultValue": {}, + "metadata": { + "description": "Tags to curate the resources in Azure." + } + } + }, + "variables": { + "userAssignedIdentities": { + "[format('/subscriptions/{0}/resourceGroups/{1}/providers/Microsoft.ManagedIdentity/userAssignedIdentities/{2}', subscription().subscriptionId, resourceGroup().name, parameters('computeUaiName'))]": {} + } + }, + "resources": [ + { + "type": "Microsoft.ManagedIdentity/userAssignedIdentities", + "apiVersion": "2022-01-31-preview", + "name": "[parameters('computeUaiName')]", + "location": "[parameters('computeRegion')]", + "tags": "[parameters('tags')]" + }, + { + "type": "Microsoft.ContainerService/managedClusters", + "apiVersion": "2022-05-02-preview", + "name": "[parameters('aksClusterName')]", + "location": "[parameters('computeRegion')]", + "identity": { + "type": "UserAssigned", + "userAssignedIdentities": "[variables('userAssignedIdentities')]" + }, + "properties": { + "dnsPrefix": "[parameters('dnsPrefix')]", + "agentPoolProfiles": [ + { + "name": "compool", + "count": "[parameters('agentCount')]", + "vmSize": "[parameters('agentVMSize')]", + "osType": "Linux", + "mode": "System", + "osDiskSizeGB": "[parameters('osDiskSizeGB')]" + } + ], + "apiServerAccessProfile": { + "authorizedIPRanges": [], + "enablePrivateCluster": false, + "enablePrivateClusterPublicFQDN": false, + "enableVnetIntegration": false + } + }, + "dependsOn": [ + "[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', parameters('computeUaiName'))]" + ] + }, + { + "type": "Microsoft.Resources/deployments", + "apiVersion": "2020-10-01", + "name": "[format('deploy-aml-extension-{0}', parameters('aksClusterName'))]", + "properties": { + "expressionEvaluationOptions": { + "scope": "inner" + }, + "mode": "Incremental", + "parameters": { + "clusterName": { + "value": "[parameters('aksClusterName')]" + }, + "installNvidiaDevicePlugin": { + "value": "[parameters('computeIsGPU')]" + }, + 
"installDcgmExporter": { + "value": "[parameters('computeIsGPU')]" + } + }, + "template": { + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "metadata": { + "_generator": { + "name": "bicep", + "version": "0.14.85.62628", + "templateHash": "5119649280313860161" + } + }, + "parameters": { + "clusterName": { + "type": "string", + "metadata": { + "description": "Name of the AKS cluster in the resource group." + } + }, + "clusterAdminUAIName": { + "type": "string", + "defaultValue": "[format('uai-admin-{0}', parameters('clusterName'))]", + "metadata": { + "description": "DeploymentScript needs a UAI with permissions to install extension on AKS." + } + }, + "createNewAdminIdentity": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "Create or reuse an existing UAI (see clusterAdminUAIName)." + } + }, + "extensionDeploymentName": { + "type": "string", + "defaultValue": "azmlext" + }, + "enableTraining": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "Must be set to True for AzureML extension deployment with Machine Learning model training and batch scoring support." + } + }, + "enableInference": { + "type": "bool", + "defaultValue": false, + "metadata": { + "description": "Must be set to True for AzureML extension deployment with Machine Learning inference support." + } + }, + "inferenceRouterServiceType": { + "type": "string", + "defaultValue": "loadBalancer", + "allowedValues": [ + "loadBalancer", + "nodePort", + "clusterIP" + ], + "metadata": { + "description": "Required if enableInference=True." + } + }, + "allowInsecureConnections": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "Can be set to True to use inference HTTP endpoints for development or test purposes." 
+ } + }, + "internalLoadBalancerProvider": { + "type": "string", + "defaultValue": "azure", + "metadata": { + "description": "Set to azure to allow the inference router using internal load balancer. This config is only applicable for Azure Kubernetes Service(AKS) cluster now." + } + }, + "inferenceLoadBalancerHA": { + "type": "bool", + "defaultValue": false, + "metadata": { + "description": "To ensure high availability of azureml-fe routing service (for clusters with 3 nodes or more)." + } + }, + "installNvidiaDevicePlugin": { + "type": "bool", + "defaultValue": false, + "metadata": { + "description": "To enable ML workloads on NVIDIA GPU hardware." + } + }, + "installPromOp": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "AzureML extension needs prometheus operator to manage prometheus. Set to False to reuse the existing prometheus operator." + } + }, + "installVolcano": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "AzureML extension needs volcano scheduler to schedule the job. Set to False to reuse existing volcano scheduler." + } + }, + "installDcgmExporter": { + "type": "bool", + "defaultValue": false, + "metadata": { + "description": "Dcgm-exporter can expose GPU metrics for AzureML workloads, which can be monitored in Azure portal. Set installDcgmExporter to True to install dcgm-exporter." 
+ } + } + }, + "variables": { + "aksExtensionInstallRoleId": "b24988ac-6180-42a0-ab88-20f7382dd24c" + }, + "resources": [ + { + "condition": "[parameters('createNewAdminIdentity')]", + "type": "Microsoft.ManagedIdentity/userAssignedIdentities", + "apiVersion": "2022-01-31-preview", + "name": "[parameters('clusterAdminUAIName')]", + "location": "[resourceGroup().location]" + }, + { + "type": "Microsoft.Authorization/roleAssignments", + "apiVersion": "2022-04-01", + "scope": "[format('Microsoft.ContainerService/managedClusters/{0}', parameters('clusterName'))]", + "name": "[guid(resourceGroup().id, resourceId('Microsoft.ContainerService/managedClusters', parameters('clusterName')), variables('aksExtensionInstallRoleId'))]", + "properties": { + "roleDefinitionId": "[format('/subscriptions/{0}/providers/Microsoft.Authorization/roleDefinitions/{1}', subscription().subscriptionId, variables('aksExtensionInstallRoleId'))]", + "principalId": "[if(parameters('createNewAdminIdentity'), reference(resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', parameters('clusterAdminUAIName')), '2022-01-31-preview').principalId, reference(resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', parameters('clusterAdminUAIName')), '2022-01-31-preview').principalId)]", + "principalType": "ServicePrincipal" + }, + "dependsOn": [ + "[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', parameters('clusterAdminUAIName'))]" + ] + }, + { + "type": "Microsoft.Resources/deploymentScripts", + "apiVersion": "2020-10-01", + "name": "[format('deploy-aks-azureml-extensions-to-{0}', parameters('clusterName'))]", + "location": "[resourceGroup().location]", + "kind": "AzureCLI", + "identity": { + "type": "UserAssigned", + "userAssignedIdentities": { + "[format('/subscriptions/{0}/resourceGroups/{1}/providers/Microsoft.ManagedIdentity/userAssignedIdentities/{2}', subscription().subscriptionId, resourceGroup().name, parameters('clusterAdminUAIName'))]": {} + } + }, + 
"properties": { + "azCliVersion": "2.40.0", + "cleanupPreference": "OnSuccess", + "retentionInterval": "P1D", + "scriptContent": "[format('az extension add --name k8s-extension; az k8s-extension create --name {0} --extension-type Microsoft.AzureML.Kubernetes --config enableTraining={1} enableInference={2} inferenceRouterServiceType={3} allowInsecureConnections={4} internalLoadBalancerProvider={5} inferenceLoadBalancerHA={6} installNvidiaDevicePlugin={7} installPromOp={8} installVolcano={9} installDcgmExporter={10} --cluster-type managedClusters --cluster-name {11} --scope cluster --resource-group {12}', parameters('extensionDeploymentName'), parameters('enableTraining'), parameters('enableInference'), parameters('inferenceRouterServiceType'), parameters('allowInsecureConnections'), parameters('internalLoadBalancerProvider'), parameters('inferenceLoadBalancerHA'), parameters('installNvidiaDevicePlugin'), parameters('installPromOp'), parameters('installVolcano'), parameters('installDcgmExporter'), parameters('clusterName'), resourceGroup().name)]", + "timeout": "P1D" + }, + "dependsOn": [ + "[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', parameters('clusterAdminUAIName'))]" + ] + } + ] + } + }, + "dependsOn": [ + "[resourceId('Microsoft.ContainerService/managedClusters', parameters('aksClusterName'))]" + ] + }, + { + "type": "Microsoft.Resources/deployments", + "apiVersion": "2020-10-01", + "name": "[format('attach-{0}-to-aml-{1}', parameters('aksClusterName'), parameters('machineLearningName'))]", + "properties": { + "expressionEvaluationOptions": { + "scope": "inner" + }, + "mode": "Incremental", + "parameters": { + "machineLearningName": { + "value": "[parameters('machineLearningName')]" + }, + "machineLearningRegion": { + "value": "[parameters('machineLearningRegion')]" + }, + "aksResourceId": { + "value": "[resourceId('Microsoft.ContainerService/managedClusters', parameters('aksClusterName'))]" + }, + "aksRegion": { + "value": 
"[reference(resourceId('Microsoft.ContainerService/managedClusters', parameters('aksClusterName')), '2022-05-02-preview', 'full').location]" + }, + "amlComputeName": { + "value": "[parameters('amlComputeName')]" + }, + "computeUaiName": { + "value": "[parameters('computeUaiName')]" + } + }, + "template": { + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "metadata": { + "_generator": { + "name": "bicep", + "version": "0.14.85.62628", + "templateHash": "13309880359552726590" + } + }, + "parameters": { + "machineLearningName": { + "type": "string", + "metadata": { + "description": "Name of AzureML workspace to attach compute+storage to." + } + }, + "machineLearningRegion": { + "type": "string", + "defaultValue": "[resourceGroup().location]", + "metadata": { + "description": "The region of the machine learning workspace" + } + }, + "aksResourceId": { + "type": "string", + "metadata": { + "description": "Resource ID of the AKS cluster." + } + }, + "aksRegion": { + "type": "string", + "metadata": { + "description": "Region of the AKS cluster." + } + }, + "amlComputeName": { + "type": "string", + "metadata": { + "description": "How to name this compute in Azure ML" + } + }, + "computeUaiName": { + "type": "string", + "metadata": { + "description": "Name of the existing UAI for the compute cluster." 
+ } + } + }, + "variables": { + "userAssignedIdentities": { + "[format('/subscriptions/{0}/resourceGroups/{1}/providers/Microsoft.ManagedIdentity/userAssignedIdentities/{2}', subscription().subscriptionId, resourceGroup().name, parameters('computeUaiName'))]": {} + } + }, + "resources": [ + { + "type": "Microsoft.MachineLearningServices/workspaces/computes", + "apiVersion": "2021-01-01", + "name": "[format('{0}/{1}', parameters('machineLearningName'), parameters('amlComputeName'))]", + "location": "[parameters('machineLearningRegion')]", + "identity": { + "type": "UserAssigned", + "userAssignedIdentities": "[variables('userAssignedIdentities')]" + }, + "properties": { + "computeType": "Kubernetes", + "computeLocation": "[parameters('aksRegion')]", + "resourceId": "[parameters('aksResourceId')]", + "description": "AKS cluster attached to AzureML workspace", + "properties": {} + } + } + ], + "outputs": { + "identityPrincipalId": { + "type": "string", + "value": "[reference(resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', parameters('computeUaiName')), '2022-01-31-preview').principalId]" + }, + "compute": { + "type": "string", + "value": "[parameters('amlComputeName')]" + } + } + } + }, + "dependsOn": [ + "[resourceId('Microsoft.ContainerService/managedClusters', parameters('aksClusterName'))]", + "[resourceId('Microsoft.Resources/deployments', format('deploy-aml-extension-{0}', parameters('aksClusterName')))]" + ] + } + ], + "outputs": { + "identityPrincipalId": { + "type": "string", + "value": "[reference(resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', parameters('computeUaiName')), '2022-01-31-preview').principalId]" + }, + "compute": { + "type": "string", + "value": "[parameters('amlComputeName')]" + }, + "region": { + "type": "string", + "value": "[parameters('computeRegion')]" + }, + "aksControlPlaneFQDN": { + "type": "string", + "value": "[reference(resourceId('Microsoft.ContainerService/managedClusters', 
parameters('aksClusterName')), '2022-05-02-preview').fqdn]" + }, + "aksId": { + "type": "string", + "value": "[resourceId('Microsoft.ContainerService/managedClusters', parameters('aksClusterName'))]" + } + } + } + } + }, + { + "condition": "[parameters('applyDefaultPermissions')]", + "type": "Microsoft.Resources/deployments", + "apiVersion": "2020-10-01", + "name": "[format('{0}-internal-rw-perms', parameters('pairBaseName'))]", + "properties": { + "expressionEvaluationOptions": { + "scope": "inner" + }, + "mode": "Incremental", + "parameters": { + "storageAccountName": { + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-open-storage', parameters('pairBaseName'))), '2020-10-01').outputs.storageName.value]" + }, + "identityPrincipalId": { + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-open-aks-confcomp', parameters('pairBaseName'))), '2020-10-01').outputs.identityPrincipalId.value]" + } + }, + "template": { + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "metadata": { + "_generator": { + "name": "bicep", + "version": "0.14.85.62628", + "templateHash": "7765934047439622108" + } + }, + "parameters": { + "storageAccountName": { + "type": "string", + "metadata": { + "description": "Full path to storage" + } + }, + "identityPrincipalId": { + "type": "string", + "metadata": { + "description": "PrincipalId of the managed identity" + } + }, + "computeToStorageRoles": { + "type": "array", + "defaultValue": [ + "ba92f5b4-2d11-453d-a403-e96b0029c9fe", + "81a9662b-bebf-436f-a333-f67b29880f12", + "c12c1c16-33a1-487b-954d-41c89c60f349" + ], + "metadata": { + "description": "Role definition IDs for the compute towards the internal storage" + } + } + }, + "resources": [ + { + "copy": { + "name": "roleAssignments", + "count": "[length(parameters('computeToStorageRoles'))]" + }, + "type": "Microsoft.Authorization/roleAssignments", + 
"apiVersion": "2022-04-01", + "scope": "[format('Microsoft.Storage/storageAccounts/{0}', parameters('storageAccountName'))]", + "name": "[guid(resourceGroup().id, resourceId('Microsoft.Storage/storageAccounts', parameters('storageAccountName')), parameters('identityPrincipalId'), parameters('computeToStorageRoles')[copyIndex()])]", + "properties": { + "roleDefinitionId": "[format('/subscriptions/{0}/providers/Microsoft.Authorization/roleDefinitions/{1}', subscription().subscriptionId, parameters('computeToStorageRoles')[copyIndex()])]", + "principalId": "[parameters('identityPrincipalId')]", + "principalType": "ServicePrincipal" + } + } + ] + } + }, + "dependsOn": [ + "[resourceId('Microsoft.Resources/deployments', format('{0}-open-aks-confcomp', parameters('pairBaseName')))]", + "[resourceId('Microsoft.Resources/deployments', format('{0}-open-storage', parameters('pairBaseName')))]" + ] + } + ], + "outputs": { + "identityPrincipalId": { + "type": "string", + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-open-aks-confcomp', parameters('pairBaseName'))), '2020-10-01').outputs.identityPrincipalId.value]" + }, + "storageName": { + "type": "string", + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-open-storage', parameters('pairBaseName'))), '2020-10-01').outputs.storageName.value]" + }, + "storageServiceId": { + "type": "string", + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-open-storage', parameters('pairBaseName'))), '2020-10-01').outputs.storageId.value]" + }, + "computeName": { + "type": "string", + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-open-aks-confcomp', parameters('pairBaseName'))), '2020-10-01').outputs.compute.value]" + }, + "region": { + "type": "string", + "value": "[parameters('pairRegion')]" + } + } +} \ No newline at end of file diff --git a/mlops/arm/vnet_aks_storage_pair.json b/mlops/arm/vnet_aks_storage_pair.json 
new file mode 100644 index 00000000..da689fad --- /dev/null +++ b/mlops/arm/vnet_aks_storage_pair.json @@ -0,0 +1,1664 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "metadata": { + "_generator": { + "name": "bicep", + "version": "0.14.85.62628", + "templateHash": "4692535087462189406" + } + }, + "parameters": { + "machineLearningName": { + "type": "string", + "metadata": { + "description": "Name of AzureML workspace to attach compute+storage to." + } + }, + "machineLearningRegion": { + "type": "string", + "defaultValue": "[resourceGroup().location]", + "metadata": { + "description": "The region of the machine learning workspace" + } + }, + "pairRegion": { + "type": "string", + "defaultValue": "[resourceGroup().location]", + "metadata": { + "description": "Specifies the location of the pair resources." + } + }, + "pairBaseName": { + "type": "string", + "metadata": { + "description": "Base name used for creating all pair resources." + } + }, + "storageAccountName": { + "type": "string", + "defaultValue": "[replace(format('st{0}', parameters('pairBaseName')), '-', '')]", + "metadata": { + "description": "Name of the storage account resource to create for the pair" + } + }, + "datastoreName": { + "type": "string", + "defaultValue": "[replace(format('datastore_{0}', parameters('pairBaseName')), '-', '_')]", + "metadata": { + "description": "Name of the datastore for attaching the storage to the AzureML workspace." 
+ } + }, + "computeName": { + "type": "string", + "defaultValue": "[format('{0}-01', parameters('pairBaseName'))]", + "metadata": { + "description": "Name of the default compute cluster for the pair" + } + }, + "computeSKU": { + "type": "string", + "defaultValue": "Standard_DS3_v2", + "metadata": { + "description": "VM size for the compute cluster" + } + }, + "computeIsGPU": { + "type": "bool", + "defaultValue": false, + "metadata": { + "description": "Boolean to indicate if the compute cluster should be a GPU cluster" + } + }, + "computeNodes": { + "type": "int", + "defaultValue": 4, + "metadata": { + "description": "VM nodes for the default compute cluster" + } + }, + "uaiName": { + "type": "string", + "defaultValue": "[format('uai-{0}', parameters('pairBaseName'))]", + "metadata": { + "description": "Name of the UAI for the pair compute cluster" + } + }, + "nsgResourceName": { + "type": "string", + "defaultValue": "[format('nsg-{0}', parameters('pairBaseName'))]", + "metadata": { + "description": "Name of the Network Security Group resource" + } + }, + "vnetResourceName": { + "type": "string", + "defaultValue": "[format('vnet-{0}', parameters('pairBaseName'))]", + "metadata": { + "description": "Name of the vNET resource" + } + }, + "vnetAddressPrefix": { + "type": "string", + "metadata": { + "description": "Virtual network address prefix" + } + }, + "subnetPrefix": { + "type": "string", + "metadata": { + "description": "Subnet address prefix" + } + }, + "useStorageStaticIP": { + "type": "bool", + "defaultValue": false, + "metadata": { + "description": "Use a static ip for storage PLE" + } + }, + "storagePLEStaticIP": { + "type": "string", + "defaultValue": "172.19.0.50", + "metadata": { + "description": "Which static IP to use for storage PLE (if useStorageStaticIP is true)" + } + }, + "subnetName": { + "type": "string", + "defaultValue": "snet-training", + "metadata": { + "description": "Subnet name" + } + }, + "allowedSubnetIds": { + "type": "array", + 
"defaultValue": [], + "metadata": { + "description": "Allow other subnets into the storage (need to be in the same region)" + } + }, + "enableNodePublicIp": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "Enable compute node public IP" + } + }, + "storagePublicNetworkAccess": { + "type": "string", + "defaultValue": "Disabled", + "metadata": { + "description": "Allow or disallow public network access to Storage Account." + }, + "allowedValues": [ + "Enabled", + "vNetOnly", + "Disabled" + ] + }, + "applyDefaultPermissions": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "Allow compute cluster to access storage account with R/W permissions (using UAI)" + } + }, + "blobPrivateDNSZoneName": { + "type": "string", + "defaultValue": "[format('privatelink.blob.{0}', environment().suffixes.storage)]", + "metadata": { + "description": "Name of the private DNS zone for blob" + } + }, + "blobPrivateDNSZoneLocation": { + "type": "string", + "defaultValue": "global", + "metadata": { + "description": "Location of the private DNS zone for blob" + } + }, + "tags": { + "type": "object", + "defaultValue": {}, + "metadata": { + "description": "Tags to curate the resources in Azure." 
+ } + } + }, + "resources": [ + { + "type": "Microsoft.ManagedIdentity/userAssignedIdentities", + "apiVersion": "2022-01-31-preview", + "name": "[parameters('uaiName')]", + "location": "[parameters('pairRegion')]", + "tags": "[parameters('tags')]" + }, + { + "type": "Microsoft.Resources/deployments", + "apiVersion": "2020-10-01", + "name": "[format('{0}-deployment', parameters('nsgResourceName'))]", + "properties": { + "expressionEvaluationOptions": { + "scope": "inner" + }, + "mode": "Incremental", + "parameters": { + "location": { + "value": "[parameters('pairRegion')]" + }, + "nsgName": { + "value": "[parameters('nsgResourceName')]" + }, + "tags": { + "value": "[parameters('tags')]" + }, + "workspaceRegion": { + "value": "[parameters('machineLearningRegion')]" + }, + "enableNodePublicIp": { + "value": "[parameters('enableNodePublicIp')]" + } + }, + "template": { + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "metadata": { + "_generator": { + "name": "bicep", + "version": "0.14.85.62628", + "templateHash": "17553148586584182485" + } + }, + "parameters": { + "location": { + "type": "string", + "metadata": { + "description": "Azure region of the deployment" + } + }, + "workspaceRegion": { + "type": "string", + "metadata": { + "description": "Region of the AzureML workspace" + } + }, + "tags": { + "type": "object", + "metadata": { + "description": "Tags to add to the resources" + } + }, + "nsgName": { + "type": "string", + "metadata": { + "description": "Name of the network security group" + } + }, + "enableNodePublicIp": { + "type": "bool", + "defaultValue": false, + "metadata": { + "description": "Set rules to allow for compute with public IP" + } + } + }, + "resources": [ + { + "type": "Microsoft.Network/networkSecurityGroups", + "apiVersion": "2022-07-01", + "name": "[parameters('nsgName')]", + "location": "[parameters('location')]", + "tags": "[parameters('tags')]" + }, + { + 
"condition": "[parameters('enableNodePublicIp')]", + "type": "Microsoft.Network/networkSecurityGroups/securityRules", + "apiVersion": "2022-07-01", + "name": "[format('{0}/{1}', parameters('nsgName'), 'AzureMLPublicIPInbound')]", + "properties": { + "protocol": "Tcp", + "sourcePortRange": "*", + "destinationPortRange": "44224", + "sourceAddressPrefix": "AzureMachineLearning", + "destinationAddressPrefix": "*", + "access": "Allow", + "priority": 130, + "direction": "Inbound" + }, + "dependsOn": [ + "[resourceId('Microsoft.Network/networkSecurityGroups', parameters('nsgName'))]" + ] + }, + { + "type": "Microsoft.Network/networkSecurityGroups/securityRules", + "apiVersion": "2022-07-01", + "name": "[format('{0}/{1}', parameters('nsgName'), 'AzureMLOutboundTcp')]", + "properties": { + "protocol": "Tcp", + "sourcePortRange": "*", + "destinationPortRanges": [ + "443", + "8787", + "18881" + ], + "sourceAddressPrefix": "*", + "destinationAddressPrefix": "AzureMachineLearning", + "access": "Allow", + "priority": 150, + "direction": "Outbound" + }, + "dependsOn": [ + "[resourceId('Microsoft.Network/networkSecurityGroups', parameters('nsgName'))]" + ] + }, + { + "type": "Microsoft.Network/networkSecurityGroups/securityRules", + "apiVersion": "2022-07-01", + "name": "[format('{0}/{1}', parameters('nsgName'), 'AzureMLOutboundUdp')]", + "properties": { + "protocol": "Udp", + "sourcePortRange": "*", + "destinationPortRange": "5831", + "sourceAddressPrefix": "*", + "destinationAddressPrefix": "AzureMachineLearning", + "access": "Allow", + "priority": 151, + "direction": "Outbound" + }, + "dependsOn": [ + "[resourceId('Microsoft.Network/networkSecurityGroups', parameters('nsgName'))]" + ] + }, + { + "type": "Microsoft.Network/networkSecurityGroups/securityRules", + "apiVersion": "2022-07-01", + "name": "[format('{0}/{1}', parameters('nsgName'), 'BatchNodeManagementOutbound')]", + "properties": { + "protocol": "*", + "sourcePortRange": "*", + "destinationPortRange": "443", + 
"sourceAddressPrefix": "*", + "destinationAddressPrefix": "[format('BatchNodeManagement.{0}', parameters('workspaceRegion'))]", + "access": "Allow", + "priority": 152, + "direction": "Outbound" + }, + "dependsOn": [ + "[resourceId('Microsoft.Network/networkSecurityGroups', parameters('nsgName'))]" + ] + }, + { + "type": "Microsoft.Network/networkSecurityGroups/securityRules", + "apiVersion": "2022-07-01", + "name": "[format('{0}/{1}', parameters('nsgName'), 'AzureStorageAccount')]", + "properties": { + "protocol": "Tcp", + "sourcePortRange": "*", + "destinationPortRange": "443", + "sourceAddressPrefix": "*", + "destinationAddressPrefix": "[format('Storage.{0}', parameters('workspaceRegion'))]", + "access": "Allow", + "priority": 143, + "direction": "Outbound" + }, + "dependsOn": [ + "[resourceId('Microsoft.Network/networkSecurityGroups', parameters('nsgName'))]" + ] + } + ], + "outputs": { + "id": { + "type": "string", + "value": "[resourceId('Microsoft.Network/networkSecurityGroups', parameters('nsgName'))]" + } + } + } + } + }, + { + "type": "Microsoft.Resources/deployments", + "apiVersion": "2020-10-01", + "name": "[format('{0}-deployment', parameters('vnetResourceName'))]", + "properties": { + "expressionEvaluationOptions": { + "scope": "inner" + }, + "mode": "Incremental", + "parameters": { + "location": { + "value": "[parameters('pairRegion')]" + }, + "virtualNetworkName": { + "value": "[parameters('vnetResourceName')]" + }, + "networkSecurityGroupId": { + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-deployment', parameters('nsgResourceName'))), '2020-10-01').outputs.id.value]" + }, + "vnetAddressPrefix": { + "value": "[parameters('vnetAddressPrefix')]" + }, + "subnets": { + "value": [ + { + "name": "[parameters('subnetName')]", + "addressPrefix": "[parameters('subnetPrefix')]" + } + ] + }, + "tags": { + "value": "[parameters('tags')]" + } + }, + "template": { + "$schema": 
"https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "metadata": { + "_generator": { + "name": "bicep", + "version": "0.14.85.62628", + "templateHash": "6565408118333802825" + } + }, + "parameters": { + "location": { + "type": "string", + "defaultValue": "[resourceGroup().location]", + "metadata": { + "description": "Azure region of the deployment" + } + }, + "virtualNetworkName": { + "type": "string", + "metadata": { + "description": "Name of the virtual network resource" + } + }, + "networkSecurityGroupId": { + "type": "string", + "metadata": { + "description": "Group ID of the network security group" + } + }, + "vnetAddressPrefix": { + "type": "string", + "defaultValue": "192.168.0.0/16", + "metadata": { + "description": "Virtual network address prefix" + } + }, + "subnets": { + "type": "array", + "defaultValue": [ + { + "name": "snet-training", + "addressPrefix": "192.168.0.0/24" + } + ], + "metadata": { + "description": "Training subnets names and address prefix" + } + }, + "serviceEndpoints": { + "type": "array", + "defaultValue": [ + "Microsoft.KeyVault", + "Microsoft.ContainerRegistry", + "Microsoft.Storage" + ], + "metadata": { + "description": "List of service endpoints expected on this vnet" + } + }, + "tags": { + "type": "object", + "defaultValue": {}, + "metadata": { + "description": "Tags to add to the resources" + } + } + }, + "variables": { + "copy": [ + { + "name": "serviceEndpointsDefinition", + "count": "[length(parameters('serviceEndpoints'))]", + "input": { + "service": "[parameters('serviceEndpoints')[copyIndex('serviceEndpointsDefinition')]]" + } + }, + { + "name": "subnetsDefinition", + "count": "[length(parameters('subnets'))]", + "input": { + "name": "[parameters('subnets')[copyIndex('subnetsDefinition')].name]", + "properties": { + "addressPrefix": "[parameters('subnets')[copyIndex('subnetsDefinition')].addressPrefix]", + "privateEndpointNetworkPolicies": "Disabled", + 
"privateLinkServiceNetworkPolicies": "Disabled", + "serviceEndpoints": "[variables('serviceEndpointsDefinition')]", + "networkSecurityGroup": { + "id": "[parameters('networkSecurityGroupId')]" + } + } + } + } + ] + }, + "resources": [ + { + "type": "Microsoft.Network/virtualNetworks", + "apiVersion": "2022-01-01", + "name": "[parameters('virtualNetworkName')]", + "location": "[parameters('location')]", + "tags": "[parameters('tags')]", + "properties": { + "addressSpace": { + "addressPrefixes": [ + "[parameters('vnetAddressPrefix')]" + ] + }, + "subnets": "[variables('subnetsDefinition')]" + } + } + ], + "outputs": { + "id": { + "type": "string", + "value": "[resourceId('Microsoft.Network/virtualNetworks', parameters('virtualNetworkName'))]" + }, + "name": { + "type": "string", + "value": "[parameters('virtualNetworkName')]" + } + } + } + }, + "dependsOn": [ + "[resourceId('Microsoft.Resources/deployments', format('{0}-deployment', parameters('nsgResourceName')))]" + ] + }, + { + "type": "Microsoft.Resources/deployments", + "apiVersion": "2020-10-01", + "name": "[format('{0}-vnet-aml-compute', parameters('pairBaseName'))]", + "properties": { + "expressionEvaluationOptions": { + "scope": "inner" + }, + "mode": "Incremental", + "parameters": { + "machineLearningName": { + "value": "[parameters('machineLearningName')]" + }, + "machineLearningRegion": { + "value": "[parameters('machineLearningRegion')]" + }, + "computeName": { + "value": "[parameters('computeName')]" + }, + "computeRegion": { + "value": "[parameters('pairRegion')]" + }, + "agentVMSize": { + "value": "[parameters('computeSKU')]" + }, + "computeIsGPU": { + "value": "[parameters('computeIsGPU')]" + }, + "agentCount": { + "value": "[parameters('computeNodes')]" + }, + "computeUaiName": { + "value": "[parameters('uaiName')]" + }, + "subnetName": { + "value": "[parameters('subnetName')]" + }, + "subnetId": { + "value": "[format('{0}/subnets/{1}', reference(resourceId('Microsoft.Resources/deployments', 
format('{0}-deployment', parameters('vnetResourceName'))), '2020-10-01').outputs.id.value, parameters('subnetName'))]" + }, + "tags": { + "value": "[parameters('tags')]" + } + }, + "template": { + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "metadata": { + "_generator": { + "name": "bicep", + "version": "0.14.85.62628", + "templateHash": "10258756419234445375" + } + }, + "parameters": { + "machineLearningName": { + "type": "string", + "metadata": { + "description": "Name of AzureML workspace to attach compute+storage to." + } + }, + "machineLearningRegion": { + "type": "string", + "defaultValue": "[resourceGroup().location]", + "metadata": { + "description": "The region of the machine learning workspace" + } + }, + "computeName": { + "type": "string", + "metadata": { + "description": "The name of the Managed Cluster resource." + } + }, + "computeRegion": { + "type": "string", + "metadata": { + "description": "Specifies the location of the compute resources." + } + }, + "dnsPrefix": { + "type": "string", + "defaultValue": "[replace(format('dnxprefix-{0}', parameters('computeName')), '-', '')]", + "maxLength": 54, + "metadata": { + "description": "Optional DNS prefix to use with hosted Kubernetes API server FQDN." + } + }, + "agentVMSize": { + "type": "string", + "defaultValue": "Standard_DS3_v2", + "metadata": { + "description": "The size of the Virtual Machine." + } + }, + "computeIsGPU": { + "type": "bool", + "defaultValue": false, + "metadata": { + "description": "Boolean to indicate if the compute cluster should be a GPU cluster" + } + }, + "agentCount": { + "type": "int", + "defaultValue": 2, + "maxValue": 50, + "minValue": 1, + "metadata": { + "description": "The number of nodes for the cluster pool." 
+ } + }, + "osDiskSizeGB": { + "type": "int", + "defaultValue": 0, + "maxValue": 1023, + "minValue": 0, + "metadata": { + "description": "Disk size (in GB) to provision for each of the agent pool nodes. This value ranges from 0 to 1023. Specifying 0 will apply the default disk size for that agentVMSize." + } + }, + "computeUaiName": { + "type": "string", + "metadata": { + "description": "Name of the UAI for the compute cluster" + } + }, + "subnetId": { + "type": "string", + "metadata": { + "description": "Subnet ID" + } + }, + "subnetName": { + "type": "string", + "defaultValue": "snet-training", + "metadata": { + "description": "Subnet name" + } + }, + "tags": { + "type": "object", + "defaultValue": {}, + "metadata": { + "description": "Tags to curate the resources in Azure." + } + } + }, + "variables": { + "userAssignedIdentities": { + "[format('/subscriptions/{0}/resourceGroups/{1}/providers/Microsoft.ManagedIdentity/userAssignedIdentities/{2}', subscription().subscriptionId, resourceGroup().name, parameters('computeUaiName'))]": {} + } + }, + "resources": [ + { + "type": "Microsoft.ContainerService/managedClusters", + "apiVersion": "2022-05-02-preview", + "name": "[parameters('computeName')]", + "location": "[parameters('computeRegion')]", + "tags": "[parameters('tags')]", + "identity": { + "type": "UserAssigned", + "userAssignedIdentities": "[variables('userAssignedIdentities')]" + }, + "properties": { + "dnsPrefix": "[parameters('dnsPrefix')]", + "agentPoolProfiles": [ + { + "name": "compool", + "count": "[parameters('agentCount')]", + "vmSize": "[parameters('agentVMSize')]", + "osType": "Linux", + "mode": "System", + "osDiskSizeGB": "[parameters('osDiskSizeGB')]", + "vnetSubnetID": "[parameters('subnetId')]" + } + ], + "apiServerAccessProfile": { + "authorizedIPRanges": [], + "enablePrivateCluster": false, + "enablePrivateClusterPublicFQDN": false, + "enableVnetIntegration": false + }, + "networkProfile": { + "networkPlugin": "azure" + } + } + }, + { + 
"type": "Microsoft.Resources/deployments", + "apiVersion": "2020-10-01", + "name": "[format('deploy-aml-extension-{0}', parameters('computeName'))]", + "properties": { + "expressionEvaluationOptions": { + "scope": "inner" + }, + "mode": "Incremental", + "parameters": { + "clusterName": { + "value": "[parameters('computeName')]" + }, + "installNvidiaDevicePlugin": { + "value": "[parameters('computeIsGPU')]" + }, + "installDcgmExporter": { + "value": "[parameters('computeIsGPU')]" + } + }, + "template": { + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "metadata": { + "_generator": { + "name": "bicep", + "version": "0.14.85.62628", + "templateHash": "5119649280313860161" + } + }, + "parameters": { + "clusterName": { + "type": "string", + "metadata": { + "description": "Name of the AKS cluster in the resource group." + } + }, + "clusterAdminUAIName": { + "type": "string", + "defaultValue": "[format('uai-admin-{0}', parameters('clusterName'))]", + "metadata": { + "description": "DeploymentScript needs a UAI with permissions to install extension on AKS." + } + }, + "createNewAdminIdentity": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "Create or reuse an existing UAI (see clusterAdminUAIName)." + } + }, + "extensionDeploymentName": { + "type": "string", + "defaultValue": "azmlext" + }, + "enableTraining": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "Must be set to True for AzureML extension deployment with Machine Learning model training and batch scoring support." + } + }, + "enableInference": { + "type": "bool", + "defaultValue": false, + "metadata": { + "description": "Must be set to True for AzureML extension deployment with Machine Learning inference support." 
+ } + }, + "inferenceRouterServiceType": { + "type": "string", + "defaultValue": "loadBalancer", + "allowedValues": [ + "loadBalancer", + "nodePort", + "clusterIP" + ], + "metadata": { + "description": "Required if enableInference=True." + } + }, + "allowInsecureConnections": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "Can be set to True to use inference HTTP endpoints for development or test purposes." + } + }, + "internalLoadBalancerProvider": { + "type": "string", + "defaultValue": "azure", + "metadata": { + "description": "Set to azure to allow the inference router using internal load balancer. This config is only applicable for Azure Kubernetes Service(AKS) cluster now." + } + }, + "inferenceLoadBalancerHA": { + "type": "bool", + "defaultValue": false, + "metadata": { + "description": "To ensure high availability of azureml-fe routing service (for clusters with 3 nodes or more)." + } + }, + "installNvidiaDevicePlugin": { + "type": "bool", + "defaultValue": false, + "metadata": { + "description": "To enable ML workloads on NVIDIA GPU hardware." + } + }, + "installPromOp": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "AzureML extension needs prometheus operator to manage prometheus. Set to False to reuse the existing prometheus operator." + } + }, + "installVolcano": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "AzureML extension needs volcano scheduler to schedule the job. Set to False to reuse existing volcano scheduler." + } + }, + "installDcgmExporter": { + "type": "bool", + "defaultValue": false, + "metadata": { + "description": "Dcgm-exporter can expose GPU metrics for AzureML workloads, which can be monitored in Azure portal. Set installDcgmExporter to True to install dcgm-exporter." 
+ } + } + }, + "variables": { + "aksExtensionInstallRoleId": "b24988ac-6180-42a0-ab88-20f7382dd24c" + }, + "resources": [ + { + "condition": "[parameters('createNewAdminIdentity')]", + "type": "Microsoft.ManagedIdentity/userAssignedIdentities", + "apiVersion": "2022-01-31-preview", + "name": "[parameters('clusterAdminUAIName')]", + "location": "[resourceGroup().location]" + }, + { + "type": "Microsoft.Authorization/roleAssignments", + "apiVersion": "2022-04-01", + "scope": "[format('Microsoft.ContainerService/managedClusters/{0}', parameters('clusterName'))]", + "name": "[guid(resourceGroup().id, resourceId('Microsoft.ContainerService/managedClusters', parameters('clusterName')), variables('aksExtensionInstallRoleId'))]", + "properties": { + "roleDefinitionId": "[format('/subscriptions/{0}/providers/Microsoft.Authorization/roleDefinitions/{1}', subscription().subscriptionId, variables('aksExtensionInstallRoleId'))]", + "principalId": "[if(parameters('createNewAdminIdentity'), reference(resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', parameters('clusterAdminUAIName')), '2022-01-31-preview').principalId, reference(resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', parameters('clusterAdminUAIName')), '2022-01-31-preview').principalId)]", + "principalType": "ServicePrincipal" + }, + "dependsOn": [ + "[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', parameters('clusterAdminUAIName'))]" + ] + }, + { + "type": "Microsoft.Resources/deploymentScripts", + "apiVersion": "2020-10-01", + "name": "[format('deploy-aks-azureml-extensions-to-{0}', parameters('clusterName'))]", + "location": "[resourceGroup().location]", + "kind": "AzureCLI", + "identity": { + "type": "UserAssigned", + "userAssignedIdentities": { + "[format('/subscriptions/{0}/resourceGroups/{1}/providers/Microsoft.ManagedIdentity/userAssignedIdentities/{2}', subscription().subscriptionId, resourceGroup().name, parameters('clusterAdminUAIName'))]": {} + } + }, + 
"properties": { + "azCliVersion": "2.40.0", + "cleanupPreference": "OnSuccess", + "retentionInterval": "P1D", + "scriptContent": "[format('az extension add --name k8s-extension; az k8s-extension create --name {0} --extension-type Microsoft.AzureML.Kubernetes --config enableTraining={1} enableInference={2} inferenceRouterServiceType={3} allowInsecureConnections={4} internalLoadBalancerProvider={5} inferenceLoadBalancerHA={6} installNvidiaDevicePlugin={7} installPromOp={8} installVolcano={9} installDcgmExporter={10} --cluster-type managedClusters --cluster-name {11} --scope cluster --resource-group {12}', parameters('extensionDeploymentName'), parameters('enableTraining'), parameters('enableInference'), parameters('inferenceRouterServiceType'), parameters('allowInsecureConnections'), parameters('internalLoadBalancerProvider'), parameters('inferenceLoadBalancerHA'), parameters('installNvidiaDevicePlugin'), parameters('installPromOp'), parameters('installVolcano'), parameters('installDcgmExporter'), parameters('clusterName'), resourceGroup().name)]", + "timeout": "P1D" + }, + "dependsOn": [ + "[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', parameters('clusterAdminUAIName'))]" + ] + } + ] + } + }, + "dependsOn": [ + "[resourceId('Microsoft.ContainerService/managedClusters', parameters('computeName'))]" + ] + }, + { + "type": "Microsoft.Resources/deployments", + "apiVersion": "2020-10-01", + "name": "[format('attach-{0}-to-aml-{1}', parameters('computeName'), parameters('machineLearningName'))]", + "properties": { + "expressionEvaluationOptions": { + "scope": "inner" + }, + "mode": "Incremental", + "parameters": { + "machineLearningName": { + "value": "[parameters('machineLearningName')]" + }, + "machineLearningRegion": { + "value": "[parameters('machineLearningRegion')]" + }, + "aksResourceId": { + "value": "[resourceId('Microsoft.ContainerService/managedClusters', parameters('computeName'))]" + }, + "aksRegion": { + "value": 
"[reference(resourceId('Microsoft.ContainerService/managedClusters', parameters('computeName')), '2022-05-02-preview', 'full').location]" + }, + "amlComputeName": { + "value": "[parameters('computeName')]" + }, + "computeUaiName": { + "value": "[parameters('computeUaiName')]" + } + }, + "template": { + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "metadata": { + "_generator": { + "name": "bicep", + "version": "0.14.85.62628", + "templateHash": "13309880359552726590" + } + }, + "parameters": { + "machineLearningName": { + "type": "string", + "metadata": { + "description": "Name of AzureML workspace to attach compute+storage to." + } + }, + "machineLearningRegion": { + "type": "string", + "defaultValue": "[resourceGroup().location]", + "metadata": { + "description": "The region of the machine learning workspace" + } + }, + "aksResourceId": { + "type": "string", + "metadata": { + "description": "Resource ID of the AKS cluster." + } + }, + "aksRegion": { + "type": "string", + "metadata": { + "description": "Region of the AKS cluster." + } + }, + "amlComputeName": { + "type": "string", + "metadata": { + "description": "How to name this compute in Azure ML" + } + }, + "computeUaiName": { + "type": "string", + "metadata": { + "description": "Name of the existing UAI for the compute cluster." 
+ } + } + }, + "variables": { + "userAssignedIdentities": { + "[format('/subscriptions/{0}/resourceGroups/{1}/providers/Microsoft.ManagedIdentity/userAssignedIdentities/{2}', subscription().subscriptionId, resourceGroup().name, parameters('computeUaiName'))]": {} + } + }, + "resources": [ + { + "type": "Microsoft.MachineLearningServices/workspaces/computes", + "apiVersion": "2021-01-01", + "name": "[format('{0}/{1}', parameters('machineLearningName'), parameters('amlComputeName'))]", + "location": "[parameters('machineLearningRegion')]", + "identity": { + "type": "UserAssigned", + "userAssignedIdentities": "[variables('userAssignedIdentities')]" + }, + "properties": { + "computeType": "Kubernetes", + "computeLocation": "[parameters('aksRegion')]", + "resourceId": "[parameters('aksResourceId')]", + "description": "AKS cluster attached to AzureML workspace", + "properties": {} + } + } + ], + "outputs": { + "identityPrincipalId": { + "type": "string", + "value": "[reference(resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', parameters('computeUaiName')), '2022-01-31-preview').principalId]" + }, + "compute": { + "type": "string", + "value": "[parameters('amlComputeName')]" + } + } + } + }, + "dependsOn": [ + "[resourceId('Microsoft.ContainerService/managedClusters', parameters('computeName'))]", + "[resourceId('Microsoft.Resources/deployments', format('deploy-aml-extension-{0}', parameters('computeName')))]" + ] + } + ], + "outputs": { + "identityPrincipalId": { + "type": "string", + "value": "[reference(resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', parameters('computeUaiName')), '2022-01-31-preview').principalId]" + }, + "compute": { + "type": "string", + "value": "[parameters('computeName')]" + }, + "region": { + "type": "string", + "value": "[parameters('computeRegion')]" + }, + "subnetName": { + "type": "string", + "value": "[parameters('subnetName')]" + } + } + } + }, + "dependsOn": [ + 
"[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', parameters('uaiName'))]", + "[resourceId('Microsoft.Resources/deployments', format('{0}-deployment', parameters('vnetResourceName')))]" + ] + }, + { + "type": "Microsoft.Resources/deployments", + "apiVersion": "2020-10-01", + "name": "[format('{0}-vnet-storage', parameters('pairBaseName'))]", + "properties": { + "expressionEvaluationOptions": { + "scope": "inner" + }, + "mode": "Incremental", + "parameters": { + "machineLearningName": { + "value": "[parameters('machineLearningName')]" + }, + "machineLearningRegion": { + "value": "[parameters('machineLearningRegion')]" + }, + "storageName": { + "value": "[parameters('storageAccountName')]" + }, + "storageRegion": { + "value": "[parameters('pairRegion')]" + }, + "datastoreName": { + "value": "[parameters('datastoreName')]" + }, + "publicNetworkAccess": { + "value": "[parameters('storagePublicNetworkAccess')]" + }, + "subnetIds": { + "value": "[concat(createArray(format('{0}/subnets/{1}', reference(resourceId('Microsoft.Resources/deployments', format('{0}-deployment', parameters('vnetResourceName'))), '2020-10-01').outputs.id.value, parameters('subnetName'))), parameters('allowedSubnetIds'))]" + }, + "tags": { + "value": "[parameters('tags')]" + } + }, + "template": { + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "metadata": { + "_generator": { + "name": "bicep", + "version": "0.14.85.62628", + "templateHash": "8073065165220131475" + } + }, + "parameters": { + "machineLearningName": { + "type": "string", + "metadata": { + "description": "Name of AzureML workspace to attach compute+storage to." 
+ } + }, + "machineLearningRegion": { + "type": "string", + "defaultValue": "[resourceGroup().location]", + "metadata": { + "description": "The region of the machine learning workspace" + } + }, + "storageName": { + "type": "string", + "metadata": { + "description": "Name of the storage account" + } + }, + "storageRegion": { + "type": "string", + "metadata": { + "description": "Azure region of the storage to create" + } + }, + "storageSKU": { + "type": "string", + "defaultValue": "Standard_LRS", + "metadata": { + "description": "Storage SKU" + }, + "allowedValues": [ + "Standard_LRS", + "Standard_ZRS", + "Standard_GRS", + "Standard_GZRS", + "Standard_RAGRS", + "Standard_RAGZRS", + "Premium_LRS", + "Premium_ZRS" + ] + }, + "containerName": { + "type": "string", + "defaultValue": "private", + "metadata": { + "description": "Name of the storage container resource to create for the pair" + } + }, + "datastoreName": { + "type": "string", + "defaultValue": "[replace(format('datastore_{0}', parameters('storageName')), '-', '_')]", + "metadata": { + "description": "Name of the datastore for attaching the storage to the AzureML workspace." + } + }, + "subnetIds": { + "type": "array", + "defaultValue": [], + "metadata": { + "description": "Resource ID of the subnets allowed into this storage" + } + }, + "publicNetworkAccess": { + "type": "string", + "defaultValue": "Disabled", + "metadata": { + "description": "Allow or disallow public network access to Storage Account." 
+ }, + "allowedValues": [ + "Enabled", + "vNetOnly", + "Disabled" + ] + }, + "tags": { + "type": "object", + "defaultValue": {}, + "metadata": { + "description": "Tags to add to the resources" + } + } + }, + "variables": { + "storageNameCleaned": "[replace(parameters('storageName'), '-', '')]", + "storageAccountCleanName": "[substring(variables('storageNameCleaned'), 0, min(length(variables('storageNameCleaned')), 24))]", + "storageAllowedSubnetIds": "[if(equals(parameters('publicNetworkAccess'), 'Enabled'), createArray(), parameters('subnetIds'))]", + "storagedefaultAction": "[if(equals(parameters('publicNetworkAccess'), 'Enabled'), 'Allow', 'Deny')]", + "storagepublicNetworkAccess": "[if(equals(parameters('publicNetworkAccess'), 'Disabled'), 'Disabled', 'Enabled')]" + }, + "resources": [ + { + "type": "Microsoft.Storage/storageAccounts", + "apiVersion": "2022-05-01", + "name": "[variables('storageAccountCleanName')]", + "location": "[parameters('storageRegion')]", + "tags": "[parameters('tags')]", + "sku": { + "name": "[parameters('storageSKU')]" + }, + "kind": "StorageV2", + "properties": { + "accessTier": "Hot", + "allowBlobPublicAccess": false, + "allowCrossTenantReplication": false, + "allowedCopyScope": "PrivateLink", + "allowSharedKeyAccess": true, + "networkAcls": { + "copy": [ + { + "name": "virtualNetworkRules", + "count": "[length(variables('storageAllowedSubnetIds'))]", + "input": { + "id": "[variables('storageAllowedSubnetIds')[copyIndex('virtualNetworkRules')]]", + "action": "Allow" + } + } + ], + "bypass": "AzureServices", + "defaultAction": "[variables('storagedefaultAction')]", + "resourceAccessRules": [] + }, + "publicNetworkAccess": "[variables('storagepublicNetworkAccess')]", + "routingPreference": { + "routingChoice": "MicrosoftRouting", + "publishMicrosoftEndpoints": true + }, + "encryption": { + "keySource": "Microsoft.Storage", + "requireInfrastructureEncryption": false, + "services": { + "blob": { + "enabled": true, + "keyType": "Account" 
+ }, + "file": { + "enabled": true, + "keyType": "Account" + }, + "queue": { + "enabled": true, + "keyType": "Service" + }, + "table": { + "enabled": true, + "keyType": "Service" + } + } + }, + "isHnsEnabled": false, + "isNfsV3Enabled": false, + "isLocalUserEnabled": false, + "isSftpEnabled": false, + "keyPolicy": { + "keyExpirationPeriodInDays": 7 + }, + "largeFileSharesState": "Disabled", + "minimumTlsVersion": "TLS1_2", + "supportsHttpsTrafficOnly": true + } + }, + { + "type": "Microsoft.Storage/storageAccounts/blobServices/containers", + "apiVersion": "2022-05-01", + "name": "[format('{0}/default/{1}', variables('storageAccountCleanName'), parameters('containerName'))]", + "properties": { + "metadata": {}, + "publicAccess": "None" + }, + "dependsOn": [ + "[resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountCleanName'))]" + ] + }, + { + "type": "Microsoft.MachineLearningServices/workspaces/datastores", + "apiVersion": "2022-06-01-preview", + "name": "[format('{0}/{1}', parameters('machineLearningName'), parameters('datastoreName'))]", + "properties": { + "tags": "[parameters('tags')]", + "credentials": { + "credentialsType": "None" + }, + "description": "[format('Private storage in region {0}', parameters('storageRegion'))]", + "properties": {}, + "datastoreType": "AzureBlob", + "accountName": "[variables('storageAccountCleanName')]", + "containerName": "[parameters('containerName')]", + "resourceGroup": "[resourceGroup().name]", + "subscriptionId": "[subscription().subscriptionId]" + }, + "dependsOn": [ + "[resourceId('Microsoft.Storage/storageAccounts/blobServices/containers', split(format('{0}/default/{1}', variables('storageAccountCleanName'), parameters('containerName')), '/')[0], split(format('{0}/default/{1}', variables('storageAccountCleanName'), parameters('containerName')), '/')[1], split(format('{0}/default/{1}', variables('storageAccountCleanName'), parameters('containerName')), '/')[2])]", + 
"[resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountCleanName'))]" + ] + } + ], + "outputs": { + "storageId": { + "type": "string", + "value": "[resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountCleanName'))]" + }, + "storageName": { + "type": "string", + "value": "[variables('storageAccountCleanName')]" + }, + "containerName": { + "type": "string", + "value": "[format('{0}/default/{1}', variables('storageAccountCleanName'), parameters('containerName'))]" + }, + "datastoreName": { + "type": "string", + "value": "[format('{0}/{1}', parameters('machineLearningName'), parameters('datastoreName'))]" + } + } + } + }, + "dependsOn": [ + "[resourceId('Microsoft.Resources/deployments', format('{0}-deployment', parameters('vnetResourceName')))]" + ] + }, + { + "condition": "[equals(parameters('storagePublicNetworkAccess'), 'Disabled')]", + "type": "Microsoft.Resources/deployments", + "apiVersion": "2020-10-01", + "name": "[format('{0}-endpoint-to-insilo-storage', parameters('pairBaseName'))]", + "properties": { + "expressionEvaluationOptions": { + "scope": "inner" + }, + "mode": "Incremental", + "parameters": { + "location": { + "value": "[parameters('pairRegion')]" + }, + "tags": { + "value": "[parameters('tags')]" + }, + "resourceServiceId": { + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-vnet-storage', parameters('pairBaseName'))), '2020-10-01').outputs.storageId.value]" + }, + "resourceName": { + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-vnet-storage', parameters('pairBaseName'))), '2020-10-01').outputs.storageName.value]" + }, + "pleRootName": { + "value": "[format('ple-{0}-to-{1}-st-blob', reference(resourceId('Microsoft.Resources/deployments', format('{0}-vnet-storage', parameters('pairBaseName'))), '2020-10-01').outputs.storageName.value, parameters('pairBaseName'))]" + }, + "virtualNetworkId": { + "value": 
"[reference(resourceId('Microsoft.Resources/deployments', format('{0}-deployment', parameters('vnetResourceName'))), '2020-10-01').outputs.id.value]" + }, + "subnetId": { + "value": "[format('{0}/subnets/{1}', reference(resourceId('Microsoft.Resources/deployments', format('{0}-deployment', parameters('vnetResourceName'))), '2020-10-01').outputs.id.value, parameters('subnetName'))]" + }, + "useStaticIPAddress": { + "value": "[parameters('useStorageStaticIP')]" + }, + "privateIPAddress": { + "value": "[parameters('storagePLEStaticIP')]" + }, + "privateDNSZoneName": { + "value": "[parameters('blobPrivateDNSZoneName')]" + }, + "privateDNSZoneLocation": { + "value": "[parameters('blobPrivateDNSZoneLocation')]" + }, + "groupId": { + "value": "blob" + } + }, + "template": { + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "metadata": { + "_generator": { + "name": "bicep", + "version": "0.14.85.62628", + "templateHash": "13553466402936175088" + } + }, + "parameters": { + "location": { + "type": "string", + "defaultValue": "[resourceGroup().location]", + "metadata": { + "description": "Azure region of the deployment" + } + }, + "resourceServiceId": { + "type": "string", + "metadata": { + "description": "Service ID of the resource to create private link endpoint to" + } + }, + "resourceName": { + "type": "string", + "metadata": { + "description": "Name of resource in private DNS zone A record (if privateIPAddress is specified)" + } + }, + "pleRootName": { + "type": "string", + "defaultValue": "[format('ple-{0}', parameters('resourceName'))]", + "metadata": { + "description": "Name of the storage blob private link endpoint" + } + }, + "virtualNetworkId": { + "type": "string", + "metadata": { + "description": "Resource ID of the vnet" + } + }, + "subnetId": { + "type": "string", + "metadata": { + "description": "Resource ID of the subnet" + } + }, + "useStaticIPAddress": { + "type": "bool", + 
"defaultValue": false, + "metadata": { + "description": "use privateIPAddress to assign a specific static IP address to PLE" + } + }, + "privateIPAddress": { + "type": "string", + "defaultValue": "", + "metadata": { + "description": "Specify the private IP address on the subnet." + } + }, + "privateDNSZoneName": { + "type": "string", + "metadata": { + "description": "Name of the existing DNS zone to add the PLE to" + } + }, + "privateDNSZoneLocation": { + "type": "string", + "defaultValue": "global", + "metadata": { + "description": "Location of the existing DNS zone to add the PLE to" + } + }, + "groupId": { + "type": "string", + "metadata": { + "description": "Name of the DNS zone group to add to the PLE" + } + }, + "linkVirtualNetwork": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "Creates the virtual network link or not (use false if link already exists)." + } + }, + "tags": { + "type": "object", + "defaultValue": {}, + "metadata": { + "description": "Tags to add to the resources" + } + } + }, + "variables": { + "ipConfigurationsDefinition": "[if(parameters('useStaticIPAddress'), createArray(createObject('name', format('{0}-ipconfig', parameters('pleRootName')), 'properties', createObject('groupId', parameters('groupId'), 'memberName', parameters('groupId'), 'privateIPAddress', parameters('privateIPAddress')))), createArray())]" + }, + "resources": [ + { + "type": "Microsoft.Network/privateEndpoints", + "apiVersion": "2022-01-01", + "name": "[parameters('pleRootName')]", + "location": "[parameters('location')]", + "tags": "[parameters('tags')]", + "properties": { + "ipConfigurations": "[variables('ipConfigurationsDefinition')]", + "privateLinkServiceConnections": [ + { + "name": "[parameters('pleRootName')]", + "properties": { + "groupIds": [ + "[parameters('groupId')]" + ], + "privateLinkServiceId": "[parameters('resourceServiceId')]", + "privateLinkServiceConnectionState": { + "status": "Approved", + "description": 
"Auto-Approved", + "actionsRequired": "None" + } + } + } + ], + "subnet": { + "id": "[parameters('subnetId')]" + } + } + }, + { + "type": "Microsoft.Network/privateEndpoints/privateDnsZoneGroups", + "apiVersion": "2020-06-01", + "name": "[format('{0}/{1}', parameters('pleRootName'), format('{0}-PrivateDnsZoneGroup', parameters('groupId')))]", + "properties": { + "privateDnsZoneConfigs": [ + { + "name": "[parameters('privateDNSZoneName')]", + "properties": { + "privateDnsZoneId": "[resourceId('Microsoft.Network/privateDnsZones', parameters('privateDNSZoneName'))]" + } + } + ] + }, + "dependsOn": [ + "[resourceId('Microsoft.Network/privateEndpoints', parameters('pleRootName'))]" + ] + }, + { + "condition": "[parameters('linkVirtualNetwork')]", + "type": "Microsoft.Network/privateDnsZones/virtualNetworkLinks", + "apiVersion": "2020-06-01", + "name": "[format('{0}/{1}', parameters('privateDNSZoneName'), uniqueString(parameters('subnetId'), parameters('resourceServiceId'), parameters('groupId')))]", + "location": "[parameters('privateDNSZoneLocation')]", + "properties": { + "registrationEnabled": false, + "virtualNetwork": { + "id": "[parameters('virtualNetworkId')]" + } + } + } + ], + "outputs": { + "name": { + "type": "string", + "value": "[parameters('pleRootName')]" + }, + "id": { + "type": "string", + "value": "[resourceId('Microsoft.Network/privateEndpoints', parameters('pleRootName'))]" + } + } + } + }, + "dependsOn": [ + "[resourceId('Microsoft.Resources/deployments', format('{0}-vnet-storage', parameters('pairBaseName')))]", + "[resourceId('Microsoft.Resources/deployments', format('{0}-deployment', parameters('vnetResourceName')))]" + ] + }, + { + "condition": "[parameters('applyDefaultPermissions')]", + "type": "Microsoft.Resources/deployments", + "apiVersion": "2020-10-01", + "name": "[format('{0}-internal-rw-perms', parameters('pairBaseName'))]", + "properties": { + "expressionEvaluationOptions": { + "scope": "inner" + }, + "mode": "Incremental", + 
"parameters": { + "storageAccountName": { + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-vnet-storage', parameters('pairBaseName'))), '2020-10-01').outputs.storageName.value]" + }, + "identityPrincipalId": { + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-vnet-aml-compute', parameters('pairBaseName'))), '2020-10-01').outputs.identityPrincipalId.value]" + } + }, + "template": { + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "metadata": { + "_generator": { + "name": "bicep", + "version": "0.14.85.62628", + "templateHash": "7765934047439622108" + } + }, + "parameters": { + "storageAccountName": { + "type": "string", + "metadata": { + "description": "Full path to storage" + } + }, + "identityPrincipalId": { + "type": "string", + "metadata": { + "description": "PrincipalId of the managed identity" + } + }, + "computeToStorageRoles": { + "type": "array", + "defaultValue": [ + "ba92f5b4-2d11-453d-a403-e96b0029c9fe", + "81a9662b-bebf-436f-a333-f67b29880f12", + "c12c1c16-33a1-487b-954d-41c89c60f349" + ], + "metadata": { + "description": "Role definition IDs for the compute towards the internal storage" + } + } + }, + "resources": [ + { + "copy": { + "name": "roleAssignments", + "count": "[length(parameters('computeToStorageRoles'))]" + }, + "type": "Microsoft.Authorization/roleAssignments", + "apiVersion": "2022-04-01", + "scope": "[format('Microsoft.Storage/storageAccounts/{0}', parameters('storageAccountName'))]", + "name": "[guid(resourceGroup().id, resourceId('Microsoft.Storage/storageAccounts', parameters('storageAccountName')), parameters('identityPrincipalId'), parameters('computeToStorageRoles')[copyIndex()])]", + "properties": { + "roleDefinitionId": "[format('/subscriptions/{0}/providers/Microsoft.Authorization/roleDefinitions/{1}', subscription().subscriptionId, parameters('computeToStorageRoles')[copyIndex()])]", + 
"principalId": "[parameters('identityPrincipalId')]", + "principalType": "ServicePrincipal" + } + } + ] + } + }, + "dependsOn": [ + "[resourceId('Microsoft.Resources/deployments', format('{0}-vnet-aml-compute', parameters('pairBaseName')))]", + "[resourceId('Microsoft.Resources/deployments', format('{0}-vnet-storage', parameters('pairBaseName')))]" + ] + } + ], + "outputs": { + "identityPrincipalId": { + "type": "string", + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-vnet-aml-compute', parameters('pairBaseName'))), '2020-10-01').outputs.identityPrincipalId.value]" + }, + "storageName": { + "type": "string", + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-vnet-storage', parameters('pairBaseName'))), '2020-10-01').outputs.storageName.value]" + }, + "storageServiceId": { + "type": "string", + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-vnet-storage', parameters('pairBaseName'))), '2020-10-01').outputs.storageId.value]" + }, + "computeName": { + "type": "string", + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-vnet-aml-compute', parameters('pairBaseName'))), '2020-10-01').outputs.compute.value]" + }, + "region": { + "type": "string", + "value": "[parameters('pairRegion')]" + }, + "vnetName": { + "type": "string", + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-deployment', parameters('vnetResourceName'))), '2020-10-01').outputs.name.value]" + }, + "vnetId": { + "type": "string", + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-deployment', parameters('vnetResourceName'))), '2020-10-01').outputs.id.value]" + }, + "subnetId": { + "type": "string", + "value": "[format('{0}/subnets/{1}', reference(resourceId('Microsoft.Resources/deployments', format('{0}-deployment', parameters('vnetResourceName'))), '2020-10-01').outputs.id.value, parameters('subnetName'))]" + } + } +} \ No 
newline at end of file diff --git a/mlops/arm/vnet_aks_with_confcomp_storage_pair.json b/mlops/arm/vnet_aks_with_confcomp_storage_pair.json new file mode 100644 index 00000000..87129cca --- /dev/null +++ b/mlops/arm/vnet_aks_with_confcomp_storage_pair.json @@ -0,0 +1,1649 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "metadata": { + "_generator": { + "name": "bicep", + "version": "0.14.85.62628", + "templateHash": "7795565545239978732" + } + }, + "parameters": { + "machineLearningName": { + "type": "string", + "metadata": { + "description": "Name of AzureML workspace to attach compute+storage to." + } + }, + "machineLearningRegion": { + "type": "string", + "defaultValue": "[resourceGroup().location]", + "metadata": { + "description": "The region of the machine learning workspace" + } + }, + "pairRegion": { + "type": "string", + "defaultValue": "[resourceGroup().location]", + "metadata": { + "description": "Specifies the location of the pair resources." + } + }, + "pairBaseName": { + "type": "string", + "metadata": { + "description": "Base name used for creating all pair resources." + } + }, + "storageAccountName": { + "type": "string", + "defaultValue": "[replace(format('st{0}', parameters('pairBaseName')), '-', '')]", + "metadata": { + "description": "Name of the storage account resource to create for the pair" + } + }, + "datastoreName": { + "type": "string", + "defaultValue": "[replace(format('datastore_{0}', parameters('pairBaseName')), '-', '_')]", + "metadata": { + "description": "Name of the datastore for attaching the storage to the AzureML workspace." 
+ } + }, + "computeName": { + "type": "string", + "defaultValue": "[format('{0}-01', parameters('pairBaseName'))]", + "metadata": { + "description": "Name of the default compute cluster for the pair" + } + }, + "computeSKU": { + "type": "string", + "defaultValue": "Standard_DC2as_v5", + "metadata": { + "description": "VM size for the compute cluster" + } + }, + "computeNodes": { + "type": "int", + "defaultValue": 4, + "metadata": { + "description": "VM nodes for the default compute cluster" + } + }, + "uaiName": { + "type": "string", + "defaultValue": "[format('uai-{0}', parameters('pairBaseName'))]", + "metadata": { + "description": "Name of the UAI for the pair compute cluster" + } + }, + "nsgResourceName": { + "type": "string", + "defaultValue": "[format('nsg-{0}', parameters('pairBaseName'))]", + "metadata": { + "description": "Name of the Network Security Group resource" + } + }, + "vnetResourceName": { + "type": "string", + "defaultValue": "[format('vnet-{0}', parameters('pairBaseName'))]", + "metadata": { + "description": "Name of the vNET resource" + } + }, + "vnetAddressPrefix": { + "type": "string", + "metadata": { + "description": "Virtual network address prefix" + } + }, + "subnetPrefix": { + "type": "string", + "metadata": { + "description": "Subnet address prefix" + } + }, + "useStorageStaticIP": { + "type": "bool", + "defaultValue": false, + "metadata": { + "description": "Use a static ip for storage PLE" + } + }, + "storagePLEStaticIP": { + "type": "string", + "defaultValue": "172.19.0.50", + "metadata": { + "description": "Which static IP to use for storage PLE (if useStorageStaticIP is true)" + } + }, + "subnetName": { + "type": "string", + "defaultValue": "snet-training", + "metadata": { + "description": "Subnet name" + } + }, + "allowedSubnetIds": { + "type": "array", + "defaultValue": [], + "metadata": { + "description": "Allow other subnets into the storage (need to be in the same region)" + } + }, + "enableNodePublicIp": { + "type": "bool", + 
"defaultValue": true, + "metadata": { + "description": "Enable compute node public IP" + } + }, + "storagePublicNetworkAccess": { + "type": "string", + "defaultValue": "Disabled", + "metadata": { + "description": "Allow or disallow public network access to Storage Account." + }, + "allowedValues": [ + "Enabled", + "vNetOnly", + "Disabled" + ] + }, + "applyDefaultPermissions": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "Allow compute cluster to access storage account with R/W permissions (using UAI)" + } + }, + "blobPrivateDNSZoneName": { + "type": "string", + "defaultValue": "[format('privatelink.blob.{0}', environment().suffixes.storage)]", + "metadata": { + "description": "Name of the private DNS zone for blob" + } + }, + "blobPrivateDNSZoneLocation": { + "type": "string", + "defaultValue": "global", + "metadata": { + "description": "Location of the private DNS zone for blob" + } + }, + "tags": { + "type": "object", + "defaultValue": {}, + "metadata": { + "description": "Tags to curate the resources in Azure." 
+ } + } + }, + "resources": [ + { + "type": "Microsoft.ManagedIdentity/userAssignedIdentities", + "apiVersion": "2022-01-31-preview", + "name": "[parameters('uaiName')]", + "location": "[parameters('pairRegion')]", + "tags": "[parameters('tags')]" + }, + { + "type": "Microsoft.Resources/deployments", + "apiVersion": "2020-10-01", + "name": "[format('{0}-deployment', parameters('nsgResourceName'))]", + "properties": { + "expressionEvaluationOptions": { + "scope": "inner" + }, + "mode": "Incremental", + "parameters": { + "location": { + "value": "[parameters('pairRegion')]" + }, + "nsgName": { + "value": "[parameters('nsgResourceName')]" + }, + "tags": { + "value": "[parameters('tags')]" + }, + "workspaceRegion": { + "value": "[parameters('machineLearningRegion')]" + }, + "enableNodePublicIp": { + "value": "[parameters('enableNodePublicIp')]" + } + }, + "template": { + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "metadata": { + "_generator": { + "name": "bicep", + "version": "0.14.85.62628", + "templateHash": "17553148586584182485" + } + }, + "parameters": { + "location": { + "type": "string", + "metadata": { + "description": "Azure region of the deployment" + } + }, + "workspaceRegion": { + "type": "string", + "metadata": { + "description": "Region of the AzureML workspace" + } + }, + "tags": { + "type": "object", + "metadata": { + "description": "Tags to add to the resources" + } + }, + "nsgName": { + "type": "string", + "metadata": { + "description": "Name of the network security group" + } + }, + "enableNodePublicIp": { + "type": "bool", + "defaultValue": false, + "metadata": { + "description": "Set rules to allow for compute with public IP" + } + } + }, + "resources": [ + { + "type": "Microsoft.Network/networkSecurityGroups", + "apiVersion": "2022-07-01", + "name": "[parameters('nsgName')]", + "location": "[parameters('location')]", + "tags": "[parameters('tags')]" + }, + { + 
"condition": "[parameters('enableNodePublicIp')]", + "type": "Microsoft.Network/networkSecurityGroups/securityRules", + "apiVersion": "2022-07-01", + "name": "[format('{0}/{1}', parameters('nsgName'), 'AzureMLPublicIPInbound')]", + "properties": { + "protocol": "Tcp", + "sourcePortRange": "*", + "destinationPortRange": "44224", + "sourceAddressPrefix": "AzureMachineLearning", + "destinationAddressPrefix": "*", + "access": "Allow", + "priority": 130, + "direction": "Inbound" + }, + "dependsOn": [ + "[resourceId('Microsoft.Network/networkSecurityGroups', parameters('nsgName'))]" + ] + }, + { + "type": "Microsoft.Network/networkSecurityGroups/securityRules", + "apiVersion": "2022-07-01", + "name": "[format('{0}/{1}', parameters('nsgName'), 'AzureMLOutboundTcp')]", + "properties": { + "protocol": "Tcp", + "sourcePortRange": "*", + "destinationPortRanges": [ + "443", + "8787", + "18881" + ], + "sourceAddressPrefix": "*", + "destinationAddressPrefix": "AzureMachineLearning", + "access": "Allow", + "priority": 150, + "direction": "Outbound" + }, + "dependsOn": [ + "[resourceId('Microsoft.Network/networkSecurityGroups', parameters('nsgName'))]" + ] + }, + { + "type": "Microsoft.Network/networkSecurityGroups/securityRules", + "apiVersion": "2022-07-01", + "name": "[format('{0}/{1}', parameters('nsgName'), 'AzureMLOutboundUdp')]", + "properties": { + "protocol": "Udp", + "sourcePortRange": "*", + "destinationPortRange": "5831", + "sourceAddressPrefix": "*", + "destinationAddressPrefix": "AzureMachineLearning", + "access": "Allow", + "priority": 151, + "direction": "Outbound" + }, + "dependsOn": [ + "[resourceId('Microsoft.Network/networkSecurityGroups', parameters('nsgName'))]" + ] + }, + { + "type": "Microsoft.Network/networkSecurityGroups/securityRules", + "apiVersion": "2022-07-01", + "name": "[format('{0}/{1}', parameters('nsgName'), 'BatchNodeManagementOutbound')]", + "properties": { + "protocol": "*", + "sourcePortRange": "*", + "destinationPortRange": "443", + 
"sourceAddressPrefix": "*", + "destinationAddressPrefix": "[format('BatchNodeManagement.{0}', parameters('workspaceRegion'))]", + "access": "Allow", + "priority": 152, + "direction": "Outbound" + }, + "dependsOn": [ + "[resourceId('Microsoft.Network/networkSecurityGroups', parameters('nsgName'))]" + ] + }, + { + "type": "Microsoft.Network/networkSecurityGroups/securityRules", + "apiVersion": "2022-07-01", + "name": "[format('{0}/{1}', parameters('nsgName'), 'AzureStorageAccount')]", + "properties": { + "protocol": "Tcp", + "sourcePortRange": "*", + "destinationPortRange": "443", + "sourceAddressPrefix": "*", + "destinationAddressPrefix": "[format('Storage.{0}', parameters('workspaceRegion'))]", + "access": "Allow", + "priority": 143, + "direction": "Outbound" + }, + "dependsOn": [ + "[resourceId('Microsoft.Network/networkSecurityGroups', parameters('nsgName'))]" + ] + } + ], + "outputs": { + "id": { + "type": "string", + "value": "[resourceId('Microsoft.Network/networkSecurityGroups', parameters('nsgName'))]" + } + } + } + } + }, + { + "type": "Microsoft.Resources/deployments", + "apiVersion": "2020-10-01", + "name": "[format('{0}-deployment', parameters('vnetResourceName'))]", + "properties": { + "expressionEvaluationOptions": { + "scope": "inner" + }, + "mode": "Incremental", + "parameters": { + "location": { + "value": "[parameters('pairRegion')]" + }, + "virtualNetworkName": { + "value": "[parameters('vnetResourceName')]" + }, + "networkSecurityGroupId": { + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-deployment', parameters('nsgResourceName'))), '2020-10-01').outputs.id.value]" + }, + "vnetAddressPrefix": { + "value": "[parameters('vnetAddressPrefix')]" + }, + "subnets": { + "value": [ + { + "name": "[parameters('subnetName')]", + "addressPrefix": "[parameters('subnetPrefix')]" + } + ] + }, + "tags": { + "value": "[parameters('tags')]" + } + }, + "template": { + "$schema": 
"https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "metadata": { + "_generator": { + "name": "bicep", + "version": "0.14.85.62628", + "templateHash": "6565408118333802825" + } + }, + "parameters": { + "location": { + "type": "string", + "defaultValue": "[resourceGroup().location]", + "metadata": { + "description": "Azure region of the deployment" + } + }, + "virtualNetworkName": { + "type": "string", + "metadata": { + "description": "Name of the virtual network resource" + } + }, + "networkSecurityGroupId": { + "type": "string", + "metadata": { + "description": "Group ID of the network security group" + } + }, + "vnetAddressPrefix": { + "type": "string", + "defaultValue": "192.168.0.0/16", + "metadata": { + "description": "Virtual network address prefix" + } + }, + "subnets": { + "type": "array", + "defaultValue": [ + { + "name": "snet-training", + "addressPrefix": "192.168.0.0/24" + } + ], + "metadata": { + "description": "Training subnets names and address prefix" + } + }, + "serviceEndpoints": { + "type": "array", + "defaultValue": [ + "Microsoft.KeyVault", + "Microsoft.ContainerRegistry", + "Microsoft.Storage" + ], + "metadata": { + "description": "List of service endpoints expected on this vnet" + } + }, + "tags": { + "type": "object", + "defaultValue": {}, + "metadata": { + "description": "Tags to add to the resources" + } + } + }, + "variables": { + "copy": [ + { + "name": "serviceEndpointsDefinition", + "count": "[length(parameters('serviceEndpoints'))]", + "input": { + "service": "[parameters('serviceEndpoints')[copyIndex('serviceEndpointsDefinition')]]" + } + }, + { + "name": "subnetsDefinition", + "count": "[length(parameters('subnets'))]", + "input": { + "name": "[parameters('subnets')[copyIndex('subnetsDefinition')].name]", + "properties": { + "addressPrefix": "[parameters('subnets')[copyIndex('subnetsDefinition')].addressPrefix]", + "privateEndpointNetworkPolicies": "Disabled", + 
"privateLinkServiceNetworkPolicies": "Disabled", + "serviceEndpoints": "[variables('serviceEndpointsDefinition')]", + "networkSecurityGroup": { + "id": "[parameters('networkSecurityGroupId')]" + } + } + } + } + ] + }, + "resources": [ + { + "type": "Microsoft.Network/virtualNetworks", + "apiVersion": "2022-01-01", + "name": "[parameters('virtualNetworkName')]", + "location": "[parameters('location')]", + "tags": "[parameters('tags')]", + "properties": { + "addressSpace": { + "addressPrefixes": [ + "[parameters('vnetAddressPrefix')]" + ] + }, + "subnets": "[variables('subnetsDefinition')]" + } + } + ], + "outputs": { + "id": { + "type": "string", + "value": "[resourceId('Microsoft.Network/virtualNetworks', parameters('virtualNetworkName'))]" + }, + "name": { + "type": "string", + "value": "[parameters('virtualNetworkName')]" + } + } + } + }, + "dependsOn": [ + "[resourceId('Microsoft.Resources/deployments', format('{0}-deployment', parameters('nsgResourceName')))]" + ] + }, + { + "type": "Microsoft.Resources/deployments", + "apiVersion": "2020-10-01", + "name": "[format('{0}-vnet-aml-compute', parameters('pairBaseName'))]", + "properties": { + "expressionEvaluationOptions": { + "scope": "inner" + }, + "mode": "Incremental", + "parameters": { + "machineLearningName": { + "value": "[parameters('machineLearningName')]" + }, + "machineLearningRegion": { + "value": "[parameters('machineLearningRegion')]" + }, + "computeName": { + "value": "[parameters('computeName')]" + }, + "computeRegion": { + "value": "[parameters('pairRegion')]" + }, + "agentVMSize": { + "value": "[parameters('computeSKU')]" + }, + "agentCount": { + "value": "[parameters('computeNodes')]" + }, + "computeUaiName": { + "value": "[parameters('uaiName')]" + }, + "subnetName": { + "value": "[parameters('subnetName')]" + }, + "subnetId": { + "value": "[format('{0}/subnets/{1}', reference(resourceId('Microsoft.Resources/deployments', format('{0}-deployment', parameters('vnetResourceName'))), 
'2020-10-01').outputs.id.value, parameters('subnetName'))]" + }, + "tags": { + "value": "[parameters('tags')]" + } + }, + "template": { + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "metadata": { + "_generator": { + "name": "bicep", + "version": "0.14.85.62628", + "templateHash": "17140459328423898085" + } + }, + "parameters": { + "machineLearningName": { + "type": "string", + "metadata": { + "description": "Name of AzureML workspace to attach compute+storage to." + } + }, + "machineLearningRegion": { + "type": "string", + "defaultValue": "[resourceGroup().location]", + "metadata": { + "description": "The region of the machine learning workspace" + } + }, + "computeName": { + "type": "string", + "metadata": { + "description": "The name of the Managed Cluster resource." + } + }, + "computeRegion": { + "type": "string", + "metadata": { + "description": "Specifies the location of the compute resources." + } + }, + "dnsPrefix": { + "type": "string", + "defaultValue": "[replace(format('dnxprefix-{0}', parameters('computeName')), '-', '')]", + "maxLength": 54, + "metadata": { + "description": "Optional DNS prefix to use with hosted Kubernetes API server FQDN." + } + }, + "agentVMSize": { + "type": "string", + "defaultValue": "Standard_DC2as_v5", + "metadata": { + "description": "The size of the Virtual Machine." + } + }, + "agentCount": { + "type": "int", + "defaultValue": 2, + "maxValue": 50, + "minValue": 1, + "metadata": { + "description": "The number of nodes for the cluster pool." + } + }, + "osDiskSizeGB": { + "type": "int", + "defaultValue": 0, + "maxValue": 1023, + "minValue": 0, + "metadata": { + "description": "Disk size (in GB) to provision for each of the agent pool nodes. This value ranges from 0 to 1023. Specifying 0 will apply the default disk size for that agentVMSize." 
+ } + }, + "computeUaiName": { + "type": "string", + "metadata": { + "description": "Name of the UAI for the compute cluster" + } + }, + "subnetId": { + "type": "string", + "metadata": { + "description": "Subnet ID" + } + }, + "subnetName": { + "type": "string", + "defaultValue": "snet-training", + "metadata": { + "description": "Subnet name" + } + }, + "tags": { + "type": "object", + "defaultValue": {}, + "metadata": { + "description": "Tags to curate the resources in Azure." + } + } + }, + "variables": { + "userAssignedIdentities": { + "[format('/subscriptions/{0}/resourceGroups/{1}/providers/Microsoft.ManagedIdentity/userAssignedIdentities/{2}', subscription().subscriptionId, resourceGroup().name, parameters('computeUaiName'))]": {} + } + }, + "resources": [ + { + "type": "Microsoft.ContainerService/managedClusters", + "apiVersion": "2022-05-02-preview", + "name": "[parameters('computeName')]", + "location": "[parameters('computeRegion')]", + "tags": "[parameters('tags')]", + "identity": { + "type": "UserAssigned", + "userAssignedIdentities": "[variables('userAssignedIdentities')]" + }, + "properties": { + "dnsPrefix": "[parameters('dnsPrefix')]", + "addonProfiles": { + "ACCSGXDevicePlugin": { + "enabled": true, + "config": { + "ACCSGXQuoteHelperEnabled": "false" + } + } + }, + "agentPoolProfiles": [ + { + "name": "confcompool", + "count": "[parameters('agentCount')]", + "vmSize": "[parameters('agentVMSize')]", + "osType": "Linux", + "mode": "System", + "osDiskSizeGB": "[parameters('osDiskSizeGB')]", + "vnetSubnetID": "[parameters('subnetId')]" + } + ], + "apiServerAccessProfile": { + "authorizedIPRanges": [], + "enablePrivateCluster": false, + "enablePrivateClusterPublicFQDN": false, + "enableVnetIntegration": false + }, + "networkProfile": { + "networkPlugin": "azure" + } + } + }, + { + "type": "Microsoft.Resources/deployments", + "apiVersion": "2020-10-01", + "name": "[format('deploy-aml-extension-{0}', parameters('computeName'))]", + "properties": { + 
"expressionEvaluationOptions": { + "scope": "inner" + }, + "mode": "Incremental", + "parameters": { + "clusterName": { + "value": "[parameters('computeName')]" + } + }, + "template": { + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "metadata": { + "_generator": { + "name": "bicep", + "version": "0.14.85.62628", + "templateHash": "5119649280313860161" + } + }, + "parameters": { + "clusterName": { + "type": "string", + "metadata": { + "description": "Name of the AKS cluster in the resource group." + } + }, + "clusterAdminUAIName": { + "type": "string", + "defaultValue": "[format('uai-admin-{0}', parameters('clusterName'))]", + "metadata": { + "description": "DeploymentScript needs a UAI with permissions to install extension on AKS." + } + }, + "createNewAdminIdentity": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "Create or reuse an existing UAI (see clusterAdminUAIName)." + } + }, + "extensionDeploymentName": { + "type": "string", + "defaultValue": "azmlext" + }, + "enableTraining": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "Must be set to True for AzureML extension deployment with Machine Learning model training and batch scoring support." + } + }, + "enableInference": { + "type": "bool", + "defaultValue": false, + "metadata": { + "description": "Must be set to True for AzureML extension deployment with Machine Learning inference support." + } + }, + "inferenceRouterServiceType": { + "type": "string", + "defaultValue": "loadBalancer", + "allowedValues": [ + "loadBalancer", + "nodePort", + "clusterIP" + ], + "metadata": { + "description": "Required if enableInference=True." + } + }, + "allowInsecureConnections": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "Can be set to True to use inference HTTP endpoints for development or test purposes." 
+ } + }, + "internalLoadBalancerProvider": { + "type": "string", + "defaultValue": "azure", + "metadata": { + "description": "Set to azure to allow the inference router using internal load balancer. This config is only applicable for Azure Kubernetes Service(AKS) cluster now." + } + }, + "inferenceLoadBalancerHA": { + "type": "bool", + "defaultValue": false, + "metadata": { + "description": "To ensure high availability of azureml-fe routing service (for clusters with 3 nodes or more)." + } + }, + "installNvidiaDevicePlugin": { + "type": "bool", + "defaultValue": false, + "metadata": { + "description": "To enable ML workloads on NVIDIA GPU hardware." + } + }, + "installPromOp": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "AzureML extension needs prometheus operator to manage prometheus. Set to False to reuse the existing prometheus operator." + } + }, + "installVolcano": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "AzureML extension needs volcano scheduler to schedule the job. Set to False to reuse existing volcano scheduler." + } + }, + "installDcgmExporter": { + "type": "bool", + "defaultValue": false, + "metadata": { + "description": "Dcgm-exporter can expose GPU metrics for AzureML workloads, which can be monitored in Azure portal. Set installDcgmExporter to True to install dcgm-exporter." 
+ } + } + }, + "variables": { + "aksExtensionInstallRoleId": "b24988ac-6180-42a0-ab88-20f7382dd24c" + }, + "resources": [ + { + "condition": "[parameters('createNewAdminIdentity')]", + "type": "Microsoft.ManagedIdentity/userAssignedIdentities", + "apiVersion": "2022-01-31-preview", + "name": "[parameters('clusterAdminUAIName')]", + "location": "[resourceGroup().location]" + }, + { + "type": "Microsoft.Authorization/roleAssignments", + "apiVersion": "2022-04-01", + "scope": "[format('Microsoft.ContainerService/managedClusters/{0}', parameters('clusterName'))]", + "name": "[guid(resourceGroup().id, resourceId('Microsoft.ContainerService/managedClusters', parameters('clusterName')), variables('aksExtensionInstallRoleId'))]", + "properties": { + "roleDefinitionId": "[format('/subscriptions/{0}/providers/Microsoft.Authorization/roleDefinitions/{1}', subscription().subscriptionId, variables('aksExtensionInstallRoleId'))]", + "principalId": "[if(parameters('createNewAdminIdentity'), reference(resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', parameters('clusterAdminUAIName')), '2022-01-31-preview').principalId, reference(resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', parameters('clusterAdminUAIName')), '2022-01-31-preview').principalId)]", + "principalType": "ServicePrincipal" + }, + "dependsOn": [ + "[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', parameters('clusterAdminUAIName'))]" + ] + }, + { + "type": "Microsoft.Resources/deploymentScripts", + "apiVersion": "2020-10-01", + "name": "[format('deploy-aks-azureml-extensions-to-{0}', parameters('clusterName'))]", + "location": "[resourceGroup().location]", + "kind": "AzureCLI", + "identity": { + "type": "UserAssigned", + "userAssignedIdentities": { + "[format('/subscriptions/{0}/resourceGroups/{1}/providers/Microsoft.ManagedIdentity/userAssignedIdentities/{2}', subscription().subscriptionId, resourceGroup().name, parameters('clusterAdminUAIName'))]": {} + } + }, + 
"properties": { + "azCliVersion": "2.40.0", + "cleanupPreference": "OnSuccess", + "retentionInterval": "P1D", + "scriptContent": "[format('az extension add --name k8s-extension; az k8s-extension create --name {0} --extension-type Microsoft.AzureML.Kubernetes --config enableTraining={1} enableInference={2} inferenceRouterServiceType={3} allowInsecureConnections={4} internalLoadBalancerProvider={5} inferenceLoadBalancerHA={6} installNvidiaDevicePlugin={7} installPromOp={8} installVolcano={9} installDcgmExporter={10} --cluster-type managedClusters --cluster-name {11} --scope cluster --resource-group {12}', parameters('extensionDeploymentName'), parameters('enableTraining'), parameters('enableInference'), parameters('inferenceRouterServiceType'), parameters('allowInsecureConnections'), parameters('internalLoadBalancerProvider'), parameters('inferenceLoadBalancerHA'), parameters('installNvidiaDevicePlugin'), parameters('installPromOp'), parameters('installVolcano'), parameters('installDcgmExporter'), parameters('clusterName'), resourceGroup().name)]", + "timeout": "P1D" + }, + "dependsOn": [ + "[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', parameters('clusterAdminUAIName'))]" + ] + } + ] + } + }, + "dependsOn": [ + "[resourceId('Microsoft.ContainerService/managedClusters', parameters('computeName'))]" + ] + }, + { + "type": "Microsoft.Resources/deployments", + "apiVersion": "2020-10-01", + "name": "[format('attach-{0}-to-aml-{1}', parameters('computeName'), parameters('machineLearningName'))]", + "properties": { + "expressionEvaluationOptions": { + "scope": "inner" + }, + "mode": "Incremental", + "parameters": { + "machineLearningName": { + "value": "[parameters('machineLearningName')]" + }, + "machineLearningRegion": { + "value": "[parameters('machineLearningRegion')]" + }, + "aksResourceId": { + "value": "[resourceId('Microsoft.ContainerService/managedClusters', parameters('computeName'))]" + }, + "aksRegion": { + "value": 
"[reference(resourceId('Microsoft.ContainerService/managedClusters', parameters('computeName')), '2022-05-02-preview', 'full').location]" + }, + "amlComputeName": { + "value": "[parameters('computeName')]" + }, + "computeUaiName": { + "value": "[parameters('computeUaiName')]" + } + }, + "template": { + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "metadata": { + "_generator": { + "name": "bicep", + "version": "0.14.85.62628", + "templateHash": "13309880359552726590" + } + }, + "parameters": { + "machineLearningName": { + "type": "string", + "metadata": { + "description": "Name of AzureML workspace to attach compute+storage to." + } + }, + "machineLearningRegion": { + "type": "string", + "defaultValue": "[resourceGroup().location]", + "metadata": { + "description": "The region of the machine learning workspace" + } + }, + "aksResourceId": { + "type": "string", + "metadata": { + "description": "Resource ID of the AKS cluster." + } + }, + "aksRegion": { + "type": "string", + "metadata": { + "description": "Region of the AKS cluster." + } + }, + "amlComputeName": { + "type": "string", + "metadata": { + "description": "How to name this compute in Azure ML" + } + }, + "computeUaiName": { + "type": "string", + "metadata": { + "description": "Name of the existing UAI for the compute cluster." 
+ } + } + }, + "variables": { + "userAssignedIdentities": { + "[format('/subscriptions/{0}/resourceGroups/{1}/providers/Microsoft.ManagedIdentity/userAssignedIdentities/{2}', subscription().subscriptionId, resourceGroup().name, parameters('computeUaiName'))]": {} + } + }, + "resources": [ + { + "type": "Microsoft.MachineLearningServices/workspaces/computes", + "apiVersion": "2021-01-01", + "name": "[format('{0}/{1}', parameters('machineLearningName'), parameters('amlComputeName'))]", + "location": "[parameters('machineLearningRegion')]", + "identity": { + "type": "UserAssigned", + "userAssignedIdentities": "[variables('userAssignedIdentities')]" + }, + "properties": { + "computeType": "Kubernetes", + "computeLocation": "[parameters('aksRegion')]", + "resourceId": "[parameters('aksResourceId')]", + "description": "AKS cluster attached to AzureML workspace", + "properties": {} + } + } + ], + "outputs": { + "identityPrincipalId": { + "type": "string", + "value": "[reference(resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', parameters('computeUaiName')), '2022-01-31-preview').principalId]" + }, + "compute": { + "type": "string", + "value": "[parameters('amlComputeName')]" + } + } + } + }, + "dependsOn": [ + "[resourceId('Microsoft.ContainerService/managedClusters', parameters('computeName'))]", + "[resourceId('Microsoft.Resources/deployments', format('deploy-aml-extension-{0}', parameters('computeName')))]" + ] + } + ], + "outputs": { + "identityPrincipalId": { + "type": "string", + "value": "[reference(resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', parameters('computeUaiName')), '2022-01-31-preview').principalId]" + }, + "compute": { + "type": "string", + "value": "[parameters('computeName')]" + }, + "region": { + "type": "string", + "value": "[parameters('computeRegion')]" + }, + "subnetName": { + "type": "string", + "value": "[parameters('subnetName')]" + } + } + } + }, + "dependsOn": [ + 
"[resourceId('Microsoft.ManagedIdentity/userAssignedIdentities', parameters('uaiName'))]", + "[resourceId('Microsoft.Resources/deployments', format('{0}-deployment', parameters('vnetResourceName')))]" + ] + }, + { + "type": "Microsoft.Resources/deployments", + "apiVersion": "2020-10-01", + "name": "[format('{0}-vnet-storage', parameters('pairBaseName'))]", + "properties": { + "expressionEvaluationOptions": { + "scope": "inner" + }, + "mode": "Incremental", + "parameters": { + "machineLearningName": { + "value": "[parameters('machineLearningName')]" + }, + "machineLearningRegion": { + "value": "[parameters('machineLearningRegion')]" + }, + "storageName": { + "value": "[parameters('storageAccountName')]" + }, + "storageRegion": { + "value": "[parameters('pairRegion')]" + }, + "datastoreName": { + "value": "[parameters('datastoreName')]" + }, + "publicNetworkAccess": { + "value": "[parameters('storagePublicNetworkAccess')]" + }, + "subnetIds": { + "value": "[concat(createArray(format('{0}/subnets/{1}', reference(resourceId('Microsoft.Resources/deployments', format('{0}-deployment', parameters('vnetResourceName'))), '2020-10-01').outputs.id.value, parameters('subnetName'))), parameters('allowedSubnetIds'))]" + }, + "tags": { + "value": "[parameters('tags')]" + } + }, + "template": { + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "metadata": { + "_generator": { + "name": "bicep", + "version": "0.14.85.62628", + "templateHash": "8073065165220131475" + } + }, + "parameters": { + "machineLearningName": { + "type": "string", + "metadata": { + "description": "Name of AzureML workspace to attach compute+storage to." 
+ } + }, + "machineLearningRegion": { + "type": "string", + "defaultValue": "[resourceGroup().location]", + "metadata": { + "description": "The region of the machine learning workspace" + } + }, + "storageName": { + "type": "string", + "metadata": { + "description": "Name of the storage account" + } + }, + "storageRegion": { + "type": "string", + "metadata": { + "description": "Azure region of the storage to create" + } + }, + "storageSKU": { + "type": "string", + "defaultValue": "Standard_LRS", + "metadata": { + "description": "Storage SKU" + }, + "allowedValues": [ + "Standard_LRS", + "Standard_ZRS", + "Standard_GRS", + "Standard_GZRS", + "Standard_RAGRS", + "Standard_RAGZRS", + "Premium_LRS", + "Premium_ZRS" + ] + }, + "containerName": { + "type": "string", + "defaultValue": "private", + "metadata": { + "description": "Name of the storage container resource to create for the pair" + } + }, + "datastoreName": { + "type": "string", + "defaultValue": "[replace(format('datastore_{0}', parameters('storageName')), '-', '_')]", + "metadata": { + "description": "Name of the datastore for attaching the storage to the AzureML workspace." + } + }, + "subnetIds": { + "type": "array", + "defaultValue": [], + "metadata": { + "description": "Resource ID of the subnets allowed into this storage" + } + }, + "publicNetworkAccess": { + "type": "string", + "defaultValue": "Disabled", + "metadata": { + "description": "Allow or disallow public network access to Storage Account." 
+ }, + "allowedValues": [ + "Enabled", + "vNetOnly", + "Disabled" + ] + }, + "tags": { + "type": "object", + "defaultValue": {}, + "metadata": { + "description": "Tags to add to the resources" + } + } + }, + "variables": { + "storageNameCleaned": "[replace(parameters('storageName'), '-', '')]", + "storageAccountCleanName": "[substring(variables('storageNameCleaned'), 0, min(length(variables('storageNameCleaned')), 24))]", + "storageAllowedSubnetIds": "[if(equals(parameters('publicNetworkAccess'), 'Enabled'), createArray(), parameters('subnetIds'))]", + "storagedefaultAction": "[if(equals(parameters('publicNetworkAccess'), 'Enabled'), 'Allow', 'Deny')]", + "storagepublicNetworkAccess": "[if(equals(parameters('publicNetworkAccess'), 'Disabled'), 'Disabled', 'Enabled')]" + }, + "resources": [ + { + "type": "Microsoft.Storage/storageAccounts", + "apiVersion": "2022-05-01", + "name": "[variables('storageAccountCleanName')]", + "location": "[parameters('storageRegion')]", + "tags": "[parameters('tags')]", + "sku": { + "name": "[parameters('storageSKU')]" + }, + "kind": "StorageV2", + "properties": { + "accessTier": "Hot", + "allowBlobPublicAccess": false, + "allowCrossTenantReplication": false, + "allowedCopyScope": "PrivateLink", + "allowSharedKeyAccess": true, + "networkAcls": { + "copy": [ + { + "name": "virtualNetworkRules", + "count": "[length(variables('storageAllowedSubnetIds'))]", + "input": { + "id": "[variables('storageAllowedSubnetIds')[copyIndex('virtualNetworkRules')]]", + "action": "Allow" + } + } + ], + "bypass": "AzureServices", + "defaultAction": "[variables('storagedefaultAction')]", + "resourceAccessRules": [] + }, + "publicNetworkAccess": "[variables('storagepublicNetworkAccess')]", + "routingPreference": { + "routingChoice": "MicrosoftRouting", + "publishMicrosoftEndpoints": true + }, + "encryption": { + "keySource": "Microsoft.Storage", + "requireInfrastructureEncryption": false, + "services": { + "blob": { + "enabled": true, + "keyType": "Account" 
+ }, + "file": { + "enabled": true, + "keyType": "Account" + }, + "queue": { + "enabled": true, + "keyType": "Service" + }, + "table": { + "enabled": true, + "keyType": "Service" + } + } + }, + "isHnsEnabled": false, + "isNfsV3Enabled": false, + "isLocalUserEnabled": false, + "isSftpEnabled": false, + "keyPolicy": { + "keyExpirationPeriodInDays": 7 + }, + "largeFileSharesState": "Disabled", + "minimumTlsVersion": "TLS1_2", + "supportsHttpsTrafficOnly": true + } + }, + { + "type": "Microsoft.Storage/storageAccounts/blobServices/containers", + "apiVersion": "2022-05-01", + "name": "[format('{0}/default/{1}', variables('storageAccountCleanName'), parameters('containerName'))]", + "properties": { + "metadata": {}, + "publicAccess": "None" + }, + "dependsOn": [ + "[resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountCleanName'))]" + ] + }, + { + "type": "Microsoft.MachineLearningServices/workspaces/datastores", + "apiVersion": "2022-06-01-preview", + "name": "[format('{0}/{1}', parameters('machineLearningName'), parameters('datastoreName'))]", + "properties": { + "tags": "[parameters('tags')]", + "credentials": { + "credentialsType": "None" + }, + "description": "[format('Private storage in region {0}', parameters('storageRegion'))]", + "properties": {}, + "datastoreType": "AzureBlob", + "accountName": "[variables('storageAccountCleanName')]", + "containerName": "[parameters('containerName')]", + "resourceGroup": "[resourceGroup().name]", + "subscriptionId": "[subscription().subscriptionId]" + }, + "dependsOn": [ + "[resourceId('Microsoft.Storage/storageAccounts/blobServices/containers', split(format('{0}/default/{1}', variables('storageAccountCleanName'), parameters('containerName')), '/')[0], split(format('{0}/default/{1}', variables('storageAccountCleanName'), parameters('containerName')), '/')[1], split(format('{0}/default/{1}', variables('storageAccountCleanName'), parameters('containerName')), '/')[2])]", + 
"[resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountCleanName'))]" + ] + } + ], + "outputs": { + "storageId": { + "type": "string", + "value": "[resourceId('Microsoft.Storage/storageAccounts', variables('storageAccountCleanName'))]" + }, + "storageName": { + "type": "string", + "value": "[variables('storageAccountCleanName')]" + }, + "containerName": { + "type": "string", + "value": "[format('{0}/default/{1}', variables('storageAccountCleanName'), parameters('containerName'))]" + }, + "datastoreName": { + "type": "string", + "value": "[format('{0}/{1}', parameters('machineLearningName'), parameters('datastoreName'))]" + } + } + } + }, + "dependsOn": [ + "[resourceId('Microsoft.Resources/deployments', format('{0}-deployment', parameters('vnetResourceName')))]" + ] + }, + { + "condition": "[equals(parameters('storagePublicNetworkAccess'), 'Disabled')]", + "type": "Microsoft.Resources/deployments", + "apiVersion": "2020-10-01", + "name": "[format('{0}-endpoint-to-insilo-storage', parameters('pairBaseName'))]", + "properties": { + "expressionEvaluationOptions": { + "scope": "inner" + }, + "mode": "Incremental", + "parameters": { + "location": { + "value": "[parameters('pairRegion')]" + }, + "tags": { + "value": "[parameters('tags')]" + }, + "resourceServiceId": { + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-vnet-storage', parameters('pairBaseName'))), '2020-10-01').outputs.storageId.value]" + }, + "resourceName": { + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-vnet-storage', parameters('pairBaseName'))), '2020-10-01').outputs.storageName.value]" + }, + "pleRootName": { + "value": "[format('ple-{0}-to-{1}-st-blob', reference(resourceId('Microsoft.Resources/deployments', format('{0}-vnet-storage', parameters('pairBaseName'))), '2020-10-01').outputs.storageName.value, parameters('pairBaseName'))]" + }, + "virtualNetworkId": { + "value": 
"[reference(resourceId('Microsoft.Resources/deployments', format('{0}-deployment', parameters('vnetResourceName'))), '2020-10-01').outputs.id.value]" + }, + "subnetId": { + "value": "[format('{0}/subnets/{1}', reference(resourceId('Microsoft.Resources/deployments', format('{0}-deployment', parameters('vnetResourceName'))), '2020-10-01').outputs.id.value, parameters('subnetName'))]" + }, + "useStaticIPAddress": { + "value": "[parameters('useStorageStaticIP')]" + }, + "privateIPAddress": { + "value": "[parameters('storagePLEStaticIP')]" + }, + "privateDNSZoneName": { + "value": "[parameters('blobPrivateDNSZoneName')]" + }, + "privateDNSZoneLocation": { + "value": "[parameters('blobPrivateDNSZoneLocation')]" + }, + "groupId": { + "value": "blob" + } + }, + "template": { + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "metadata": { + "_generator": { + "name": "bicep", + "version": "0.14.85.62628", + "templateHash": "13553466402936175088" + } + }, + "parameters": { + "location": { + "type": "string", + "defaultValue": "[resourceGroup().location]", + "metadata": { + "description": "Azure region of the deployment" + } + }, + "resourceServiceId": { + "type": "string", + "metadata": { + "description": "Service ID of the resource to create private link endpoint to" + } + }, + "resourceName": { + "type": "string", + "metadata": { + "description": "Name of resource in private DNS zone A record (if privateIPAddress is specified)" + } + }, + "pleRootName": { + "type": "string", + "defaultValue": "[format('ple-{0}', parameters('resourceName'))]", + "metadata": { + "description": "Name of the storage blob private link endpoint" + } + }, + "virtualNetworkId": { + "type": "string", + "metadata": { + "description": "Resource ID of the vnet" + } + }, + "subnetId": { + "type": "string", + "metadata": { + "description": "Resource ID of the subnet" + } + }, + "useStaticIPAddress": { + "type": "bool", + 
"defaultValue": false, + "metadata": { + "description": "use privateIPAddress to assign a specific static IP address to PLE" + } + }, + "privateIPAddress": { + "type": "string", + "defaultValue": "", + "metadata": { + "description": "Specify the private IP address on the subnet." + } + }, + "privateDNSZoneName": { + "type": "string", + "metadata": { + "description": "Name of the existing DNS zone to add the PLE to" + } + }, + "privateDNSZoneLocation": { + "type": "string", + "defaultValue": "global", + "metadata": { + "description": "Location of the existing DNS zone to add the PLE to" + } + }, + "groupId": { + "type": "string", + "metadata": { + "description": "Name of the DNS zone group to add to the PLE" + } + }, + "linkVirtualNetwork": { + "type": "bool", + "defaultValue": true, + "metadata": { + "description": "Creates the virtual network link or not (use false if link already exists)." + } + }, + "tags": { + "type": "object", + "defaultValue": {}, + "metadata": { + "description": "Tags to add to the resources" + } + } + }, + "variables": { + "ipConfigurationsDefinition": "[if(parameters('useStaticIPAddress'), createArray(createObject('name', format('{0}-ipconfig', parameters('pleRootName')), 'properties', createObject('groupId', parameters('groupId'), 'memberName', parameters('groupId'), 'privateIPAddress', parameters('privateIPAddress')))), createArray())]" + }, + "resources": [ + { + "type": "Microsoft.Network/privateEndpoints", + "apiVersion": "2022-01-01", + "name": "[parameters('pleRootName')]", + "location": "[parameters('location')]", + "tags": "[parameters('tags')]", + "properties": { + "ipConfigurations": "[variables('ipConfigurationsDefinition')]", + "privateLinkServiceConnections": [ + { + "name": "[parameters('pleRootName')]", + "properties": { + "groupIds": [ + "[parameters('groupId')]" + ], + "privateLinkServiceId": "[parameters('resourceServiceId')]", + "privateLinkServiceConnectionState": { + "status": "Approved", + "description": 
"Auto-Approved", + "actionsRequired": "None" + } + } + } + ], + "subnet": { + "id": "[parameters('subnetId')]" + } + } + }, + { + "type": "Microsoft.Network/privateEndpoints/privateDnsZoneGroups", + "apiVersion": "2020-06-01", + "name": "[format('{0}/{1}', parameters('pleRootName'), format('{0}-PrivateDnsZoneGroup', parameters('groupId')))]", + "properties": { + "privateDnsZoneConfigs": [ + { + "name": "[parameters('privateDNSZoneName')]", + "properties": { + "privateDnsZoneId": "[resourceId('Microsoft.Network/privateDnsZones', parameters('privateDNSZoneName'))]" + } + } + ] + }, + "dependsOn": [ + "[resourceId('Microsoft.Network/privateEndpoints', parameters('pleRootName'))]" + ] + }, + { + "condition": "[parameters('linkVirtualNetwork')]", + "type": "Microsoft.Network/privateDnsZones/virtualNetworkLinks", + "apiVersion": "2020-06-01", + "name": "[format('{0}/{1}', parameters('privateDNSZoneName'), uniqueString(parameters('subnetId'), parameters('resourceServiceId'), parameters('groupId')))]", + "location": "[parameters('privateDNSZoneLocation')]", + "properties": { + "registrationEnabled": false, + "virtualNetwork": { + "id": "[parameters('virtualNetworkId')]" + } + } + } + ], + "outputs": { + "name": { + "type": "string", + "value": "[parameters('pleRootName')]" + }, + "id": { + "type": "string", + "value": "[resourceId('Microsoft.Network/privateEndpoints', parameters('pleRootName'))]" + } + } + } + }, + "dependsOn": [ + "[resourceId('Microsoft.Resources/deployments', format('{0}-vnet-storage', parameters('pairBaseName')))]", + "[resourceId('Microsoft.Resources/deployments', format('{0}-deployment', parameters('vnetResourceName')))]" + ] + }, + { + "condition": "[parameters('applyDefaultPermissions')]", + "type": "Microsoft.Resources/deployments", + "apiVersion": "2020-10-01", + "name": "[format('{0}-internal-rw-perms', parameters('pairBaseName'))]", + "properties": { + "expressionEvaluationOptions": { + "scope": "inner" + }, + "mode": "Incremental", + 
"parameters": { + "storageAccountName": { + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-vnet-storage', parameters('pairBaseName'))), '2020-10-01').outputs.storageName.value]" + }, + "identityPrincipalId": { + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-vnet-aml-compute', parameters('pairBaseName'))), '2020-10-01').outputs.identityPrincipalId.value]" + } + }, + "template": { + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "metadata": { + "_generator": { + "name": "bicep", + "version": "0.14.85.62628", + "templateHash": "7765934047439622108" + } + }, + "parameters": { + "storageAccountName": { + "type": "string", + "metadata": { + "description": "Full path to storage" + } + }, + "identityPrincipalId": { + "type": "string", + "metadata": { + "description": "PrincipalId of the managed identity" + } + }, + "computeToStorageRoles": { + "type": "array", + "defaultValue": [ + "ba92f5b4-2d11-453d-a403-e96b0029c9fe", + "81a9662b-bebf-436f-a333-f67b29880f12", + "c12c1c16-33a1-487b-954d-41c89c60f349" + ], + "metadata": { + "description": "Role definition IDs for the compute towards the internal storage" + } + } + }, + "resources": [ + { + "copy": { + "name": "roleAssignments", + "count": "[length(parameters('computeToStorageRoles'))]" + }, + "type": "Microsoft.Authorization/roleAssignments", + "apiVersion": "2022-04-01", + "scope": "[format('Microsoft.Storage/storageAccounts/{0}', parameters('storageAccountName'))]", + "name": "[guid(resourceGroup().id, resourceId('Microsoft.Storage/storageAccounts', parameters('storageAccountName')), parameters('identityPrincipalId'), parameters('computeToStorageRoles')[copyIndex()])]", + "properties": { + "roleDefinitionId": "[format('/subscriptions/{0}/providers/Microsoft.Authorization/roleDefinitions/{1}', subscription().subscriptionId, parameters('computeToStorageRoles')[copyIndex()])]", + 
"principalId": "[parameters('identityPrincipalId')]", + "principalType": "ServicePrincipal" + } + } + ] + } + }, + "dependsOn": [ + "[resourceId('Microsoft.Resources/deployments', format('{0}-vnet-aml-compute', parameters('pairBaseName')))]", + "[resourceId('Microsoft.Resources/deployments', format('{0}-vnet-storage', parameters('pairBaseName')))]" + ] + } + ], + "outputs": { + "identityPrincipalId": { + "type": "string", + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-vnet-aml-compute', parameters('pairBaseName'))), '2020-10-01').outputs.identityPrincipalId.value]" + }, + "storageName": { + "type": "string", + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-vnet-storage', parameters('pairBaseName'))), '2020-10-01').outputs.storageName.value]" + }, + "storageServiceId": { + "type": "string", + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-vnet-storage', parameters('pairBaseName'))), '2020-10-01').outputs.storageId.value]" + }, + "computeName": { + "type": "string", + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-vnet-aml-compute', parameters('pairBaseName'))), '2020-10-01').outputs.compute.value]" + }, + "region": { + "type": "string", + "value": "[parameters('pairRegion')]" + }, + "vnetName": { + "type": "string", + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-deployment', parameters('vnetResourceName'))), '2020-10-01').outputs.name.value]" + }, + "vnetId": { + "type": "string", + "value": "[reference(resourceId('Microsoft.Resources/deployments', format('{0}-deployment', parameters('vnetResourceName'))), '2020-10-01').outputs.id.value]" + }, + "subnetId": { + "type": "string", + "value": "[format('{0}/subnets/{1}', reference(resourceId('Microsoft.Resources/deployments', format('{0}-deployment', parameters('vnetResourceName'))), '2020-10-01').outputs.id.value, parameters('subnetName'))]" + } + } +} \ No 
newline at end of file diff --git a/mlops/bicep/modules/azureml/attach_aks_training_to_azureml.bicep b/mlops/bicep/modules/azureml/attach_aks_training_to_azureml.bicep index 8b27e159..efe90c61 100644 --- a/mlops/bicep/modules/azureml/attach_aks_training_to_azureml.bicep +++ b/mlops/bicep/modules/azureml/attach_aks_training_to_azureml.bicep @@ -23,7 +23,7 @@ param amlComputeName string @description('Name of the existing UAI for the compute cluster.') param computeUaiName string -// provision a user assigned identify for this silo +// provision a user assigned identity for this silo resource uai 'Microsoft.ManagedIdentity/userAssignedIdentities@2022-01-31-preview' existing = { name: computeUaiName scope: resourceGroup() diff --git a/mlops/bicep/modules/computes/open_new_aks.bicep b/mlops/bicep/modules/computes/open_new_aks.bicep new file mode 100644 index 00000000..16cf3eb1 --- /dev/null +++ b/mlops/bicep/modules/computes/open_new_aks.bicep @@ -0,0 +1,135 @@ +// This BICEP script will provision an AKS cluster +// attached to a given AzureML workspace, without any specific security settings. + +// IMPORTANT: This setup is intended only for demo purpose. The data is still accessible +// by the users when opening the storage accounts, and data exfiltration is easy. 
+ +// NOTE: this can take up to 15 minutes to complete + +// resource group must be specified as scope in az cli or module call +targetScope = 'resourceGroup' + +// required parameters +@description('Name of AzureML workspace to attach compute+storage to.') +param machineLearningName string + +@description('The region of the machine learning workspace') +param machineLearningRegion string = resourceGroup().location + +@description('The name of the Managed Cluster resource.') +param aksClusterName string + +@description('How to name this compute in Azure ML') +param amlComputeName string = aksClusterName + +@description('Specifies the location of the compute resources.') +param computeRegion string + +@description('Optional DNS prefix to use with hosted Kubernetes API server FQDN.') +@maxLength(54) +param dnsPrefix string = replace('dnxprefix-${aksClusterName}', '-', '') // NOTE(review): 'dnxprefix' is presumably a typo for 'dnsprefix'; it is part of the deployed value, so confirm before renaming + +@description('The number of nodes for the cluster pool.') +@minValue(1) +@maxValue(50) +param agentCount int = 4 + + +@description('The size of the Virtual Machine.') +param agentVMSize string = 'Standard_DS3_v2' // 'Standard_DS3_v2' is for CPU; for GPU, 'Standard_NC6' would be a good default choice (don't forget to set computeIsGPU below to true if you want a GPU) + +@description('Boolean to indicate if the compute cluster should be a GPU cluster') +param computeIsGPU bool = false // change to true if you want to use a GPU + +@description('Disk size (in GB) to provision for each of the agent pool nodes. This value ranges from 0 to 1023.
Specifying 0 will apply the default disk size for that agentVMSize.') +@minValue(0) +@maxValue(1023) +param osDiskSizeGB int = 0 + +@description('Name of the UAI for the compute cluster.') +param computeUaiName string = 'uai-${aksClusterName}' + +@description('Tags to curate the resources in Azure.') +param tags object = {} + + +// provision a user assigned identity (UAI) that is assigned to the AKS cluster below +resource uai 'Microsoft.ManagedIdentity/userAssignedIdentities@2022-01-31-preview' = { + name: computeUaiName + location: computeRegion + tags: tags +} + +var identityPrincipalId = uai.properties.principalId +var userAssignedIdentities = {'/subscriptions/${subscription().subscriptionId}/resourceGroups/${resourceGroup().name}/providers/Microsoft.ManagedIdentity/userAssignedIdentities/${uai.name}': {}} // map keyed by the full resource ID of the UAI + +resource aks 'Microsoft.ContainerService/managedClusters@2022-05-02-preview' = { // NOTE(review): 'tags' is not applied to this cluster, unlike in vnet_new_aks.bicep - confirm intent + name: aksClusterName + location: computeRegion + identity: { + type: 'UserAssigned' + userAssignedIdentities: userAssignedIdentities + } + properties: { + dnsPrefix: dnsPrefix + //fqdnSubdomain: 'foo' + agentPoolProfiles: [ + { + name: 'compool' + count: agentCount + // enableAutoScaling: true + // maxCount: 5 + // minCount: 2 + + vmSize: agentVMSize + osType: 'Linux' + mode: 'System' + osDiskSizeGB: osDiskSizeGB + } + ] + apiServerAccessProfile: { + // IMPORTANT: use this for demo only, it is not a private AKS cluster + authorizedIPRanges: [] + enablePrivateCluster: false + enablePrivateClusterPublicFQDN: false + enableVnetIntegration: false + } + } +} + +// disabled alternative: module azuremlExtension '../azureml/deploy_aks_azureml_extension.bicep' = { +module azuremlExtension '../azureml/deploy_aks_azureml_extension_via_script.bicep' = { + name: 'deploy-aml-extension-${aksClusterName}' + scope: resourceGroup() + params: { + clusterName: aksClusterName + installNvidiaDevicePlugin: computeIsGPU + installDcgmExporter: computeIsGPU + } + dependsOn: [ + aks + ] +} + +module deployAttachToWorkspace
'../azureml/attach_aks_training_to_azureml.bicep' = { + name: 'attach-${aksClusterName}-to-aml-${machineLearningName}' + scope: resourceGroup() + params: { + machineLearningName: machineLearningName + machineLearningRegion: machineLearningRegion + aksResourceId: aks.id + aksRegion: aks.location + amlComputeName: amlComputeName + computeUaiName: computeUaiName + } + dependsOn: [ + azuremlExtension // attach to the workspace only after the AzureML extension deployment has finished + ] +} + +// output the compute config for next actions (permission model) +output identityPrincipalId string = identityPrincipalId +output compute string = amlComputeName +output region string = computeRegion +output aksControlPlaneFQDN string = aks.properties.fqdn +output aksId string = aks.id diff --git a/mlops/bicep/modules/computes/vnet_new_aks.bicep b/mlops/bicep/modules/computes/vnet_new_aks.bicep new file mode 100644 index 00000000..4040bd42 --- /dev/null +++ b/mlops/bicep/modules/computes/vnet_new_aks.bicep @@ -0,0 +1,137 @@ +// This BICEP script will provision an AKS cluster +// behind a vnet and subnet, attached to a workspace +// plus managed identity for permissions management.
+ +// resource group must be specified as scope in az cli or module call +targetScope = 'resourceGroup' + +// required parameters +@description('Name of AzureML workspace to attach compute+storage to.') +param machineLearningName string + +@description('The region of the machine learning workspace') +param machineLearningRegion string = resourceGroup().location + +@description('The name of the Managed Cluster resource.') +param computeName string + +@description('Specifies the location of the compute resources.') +param computeRegion string + +@description('Optional DNS prefix to use with hosted Kubernetes API server FQDN.') +@maxLength(54) +param dnsPrefix string = replace('dnxprefix-${computeName}', '-', '') // NOTE(review): 'dnxprefix' is presumably a typo for 'dnsprefix'; it is part of the deployed value, so confirm before renaming + + +@description('The size of the Virtual Machine.') +param agentVMSize string = 'Standard_DS3_v2' // 'Standard_DS3_v2' is for CPU; for GPU, 'Standard_NC6' would be a good default choice (don't forget to set computeIsGPU below to true if you want a GPU) + +@description('Boolean to indicate if the compute cluster should be a GPU cluster') +param computeIsGPU bool = false // change to true if you want to use a GPU + +@description('The number of nodes for the cluster pool.') +@minValue(1) +@maxValue(50) +param agentCount int = 2 + +@description('Disk size (in GB) to provision for each of the agent pool nodes. This value ranges from 0 to 1023.
Specifying 0 will apply the default disk size for that agentVMSize.') +@minValue(0) +@maxValue(1023) +param osDiskSizeGB int = 0 + +@description('Name of the UAI for the compute cluster') +param computeUaiName string + +@description('Subnet ID') +param subnetId string + +@description('Subnet name') +param subnetName string = 'snet-training' // only echoed back in the outputs of this file; the node pool subnet is selected by subnetId + +@description('Tags to curate the resources in Azure.') +param tags object = {} + +// reference the existing user assigned identity (UAI) for this compute; it is not created here +resource uai 'Microsoft.ManagedIdentity/userAssignedIdentities@2022-01-31-preview' existing = { + name: computeUaiName +} + +var identityPrincipalId = uai.properties.principalId +var userAssignedIdentities = {'/subscriptions/${subscription().subscriptionId}/resourceGroups/${resourceGroup().name}/providers/Microsoft.ManagedIdentity/userAssignedIdentities/${uai.name}': {}} // map keyed by the full resource ID of the UAI + + +resource aks 'Microsoft.ContainerService/managedClusters@2022-05-02-preview' = { + name: computeName + location: computeRegion + tags: tags + identity: { + type: 'UserAssigned' + userAssignedIdentities: userAssignedIdentities + } + properties: { + dnsPrefix: dnsPrefix + //fqdnSubdomain: 'foo' + agentPoolProfiles: [ + { + name: 'compool' + count: agentCount + // enableAutoScaling: true + // maxCount: 5 + // minCount: 2 + + vmSize: agentVMSize + osType: 'Linux' + mode: 'System' + osDiskSizeGB: osDiskSizeGB + vnetSubnetID: subnetId // place the agent pool in the subnet passed in by the caller + } + ] + apiServerAccessProfile: { + // IMPORTANT: use this for demo only, it is not a private AKS cluster + authorizedIPRanges: [] + enablePrivateCluster: false + enablePrivateClusterPublicFQDN: false + enableVnetIntegration: false + } + networkProfile:{ + networkPlugin: 'azure' + } + + } +} + +// disabled alternative: module azuremlExtension '../azureml/deploy_aks_azureml_extension.bicep' = { +module azuremlExtension '../azureml/deploy_aks_azureml_extension_via_script.bicep' = { + name: 'deploy-aml-extension-${computeName}' + scope: resourceGroup() + params: { + clusterName: computeName +
installNvidiaDevicePlugin: computeIsGPU + installDcgmExporter: computeIsGPU + } + dependsOn: [ + aks + ] +} + +module deployAttachToWorkspace '../azureml/attach_aks_training_to_azureml.bicep' = { + name: 'attach-${computeName}-to-aml-${machineLearningName}' + scope: resourceGroup() + params: { + machineLearningName: machineLearningName + machineLearningRegion: machineLearningRegion + aksResourceId: aks.id + aksRegion: aks.location + amlComputeName: computeName + computeUaiName: computeUaiName + } + dependsOn: [ + azuremlExtension // attach to the workspace only after the AzureML extension deployment has finished + ] +} + +// output the compute config for next actions (permission model) +output identityPrincipalId string = identityPrincipalId +output compute string = aks.name +output region string = computeRegion +output subnetName string = subnetName diff --git a/mlops/bicep/modules/fl_pairs/open_aks_storage_pair.bicep b/mlops/bicep/modules/fl_pairs/open_aks_storage_pair.bicep new file mode 100644 index 00000000..687e5f27 --- /dev/null +++ b/mlops/bicep/modules/fl_pairs/open_aks_storage_pair.bicep @@ -0,0 +1,108 @@ +// This BICEP script will provision an AKS cluster, +// a new storage account, attached to a given AzureML workspace, without any specific security settings. + +// IMPORTANT: This setup is intended only for demo purposes. The data is still accessible +// by the users when opening the storage accounts, and data exfiltration is easy.
+ +// NOTE: this can take up to 15 minutes to complete + +// resource group must be specified as scope in az cli or module call +targetScope = 'resourceGroup' + +// required parameters +@description('Name of AzureML workspace to attach compute+storage to.') +param machineLearningName string + +@description('The region of the machine learning workspace') +param machineLearningRegion string = resourceGroup().location + +@description('Specifies the location of the pair resources.') +param pairRegion string = resourceGroup().location + +@description('Base name used for creating all pair resources.') +param pairBaseName string + +@description('Name of the storage account resource to create for the pair') +param storageAccountName string = replace('st${pairBaseName}','-','') // strip '-' because only alphanumeric characters are supported + +@description('Name of the datastore for attaching the storage to the AzureML workspace.') +param datastoreName string = replace('datastore_${pairBaseName}','-','_') + +@description('The name of the Managed Cluster resource.') +param aksClusterName string = 'aks-${pairBaseName}' + + +@description('VM size for the compute cluster.') +param computeSKU string = 'Standard_DS3_v2' // 'Standard_DS3_v2' is for CPU; for GPU, 'Standard_NC6' would be a good default choice (don't forget to set computeIsGPU below to true if you want a GPU) + +@description('Boolean to indicate if the compute cluster should be a GPU cluster') +param computeIsGPU bool = false // change to true if you want to use a GPU + +@description('VM nodes for the compute cluster') +@minValue(1) +@maxValue(50) +param computeNodes int = 4 + +@description('Name of the UAI for the pair compute cluster') +param uaiName string = 'uai-${aksClusterName}' + +@description('Allow compute cluster to access storage account with R/W permissions (using UAI)') +param applyDefaultPermissions bool = true + +@description('Tags to curate the resources in Azure.') +param tags object = {} + +// create
new blob storage (public network access enabled) and its datastore +module storageDeployment '../storages/new_blob_storage_datastore.bicep' = { + name: '${pairBaseName}-open-storage' + scope: resourceGroup() + params: { + machineLearningName: machineLearningName + machineLearningRegion: machineLearningRegion + storageName: storageAccountName + storageRegion: pairRegion + datastoreName: datastoreName + publicNetworkAccess: 'Enabled' + tags: tags + } +} + +var aksClusterNameClean = substring(aksClusterName, 0, min(length(aksClusterName), 16)) // keep at most the first 16 characters (presumably a compute name length limit - confirm) + +module computeDeployment '../computes/open_new_aks.bicep' = { + name: '${pairBaseName}-open-aks-confcomp' // NOTE(review): name says 'confcomp' but the module is open_new_aks.bicep - looks like a copy/paste leftover; confirm before renaming + scope: resourceGroup() + params: { + machineLearningName: machineLearningName + machineLearningRegion: machineLearningRegion + aksClusterName: aksClusterNameClean + amlComputeName: aksClusterNameClean + computeRegion: pairRegion + agentVMSize: computeSKU + computeIsGPU: computeIsGPU + agentCount: computeNodes + computeUaiName: uaiName + tags: tags + } +} + +// Set R/W permissions for the pair's compute UAI towards the pair's storage (skipped when applyDefaultPermissions is false) +module pairInternalPermissions '../permissions/msi_storage_rw.bicep' = if(applyDefaultPermissions) { + name: '${pairBaseName}-internal-rw-perms' + scope: resourceGroup() + params: { + storageAccountName: storageDeployment.outputs.storageName + identityPrincipalId: computeDeployment.outputs.identityPrincipalId + } + dependsOn: [ + storageDeployment + computeDeployment + ] +} + +// output the pair config for next actions (permission model) +output identityPrincipalId string = computeDeployment.outputs.identityPrincipalId +output storageName string = storageDeployment.outputs.storageName +output storageServiceId string = storageDeployment.outputs.storageId +output computeName string = computeDeployment.outputs.compute +output region string = pairRegion diff --git a/mlops/bicep/modules/fl_pairs/vnet_aks_storage_pair.bicep b/mlops/bicep/modules/fl_pairs/vnet_aks_storage_pair.bicep index 3916b5f1..2d314d50 100644 ---
a/mlops/bicep/modules/fl_pairs/vnet_aks_storage_pair.bicep +++ b/mlops/bicep/modules/fl_pairs/vnet_aks_storage_pair.bicep @@ -1,4 +1,4 @@ -// This BICEP script will provision an AKS cluster with confidential computes +// This BICEP script will provision an AKS cluster // in a given AzureML workspace, using a vnet and subnet to secure // the communication between compute and storage, plus managed identity // for permissions management. @@ -28,9 +28,11 @@ param datastoreName string = replace('datastore_${pairBaseName}','-','_') @description('Name of the default compute cluster for the pair') param computeName string = '${pairBaseName}-01' -// see https://learn.microsoft.com/en-us/azure/virtual-machines/dcasv5-dcadsv5-series @description('VM size for the compute cluster') -param computeSKU string = 'Standard_DC2as_v5' +param computeSKU string = 'Standard_DS3_v2' // 'Standard_DS3_v2' is for CPU; for GPU, 'Standard_NC6' would be a good default choice (don't forget to set computeIsGPU below to true if you want a GPU) + +@description('Boolean to indicate if the compute cluster should be a GPU cluster') +param computeIsGPU bool = false // change to true if you want to use a GPU @description('VM nodes for the default compute cluster') param computeNodes int = 4 @@ -120,7 +122,7 @@ resource uai 'Microsoft.ManagedIdentity/userAssignedIdentities@2022-01-31-previe // create new Azure ML compute -module computeDeployment '../computes/vnet_new_aks_with_confcomp.bicep' = { +module computeDeployment '../computes/vnet_new_aks.bicep' = { name: '${pairBaseName}-vnet-aml-compute' scope: resourceGroup() params: { @@ -131,6 +133,7 @@ module computeDeployment '../computes/vnet_new_aks_with_confcomp.bicep' = { computeName: computeName computeRegion: pairRegion agentVMSize: computeSKU + computeIsGPU: computeIsGPU agentCount: computeNodes // identity diff --git a/mlops/bicep/modules/fl_pairs/vnet_aks_with_confcomp_storage_pair.bicep 
b/mlops/bicep/modules/fl_pairs/vnet_aks_with_confcomp_storage_pair.bicep new file mode 100644 index 00000000..3916b5f1 --- /dev/null +++ b/mlops/bicep/modules/fl_pairs/vnet_aks_with_confcomp_storage_pair.bicep @@ -0,0 +1,211 @@ +// This BICEP script will provision an AKS cluster with confidential computes +// in a given AzureML workspace, using a vnet and subnet to secure +// the communication between compute and storage, plus managed identity +// for permissions management. + +// resource group must be specified as scope in az cli or module call +targetScope = 'resourceGroup' + +// required parameters +@description('Name of AzureML workspace to attach compute+storage to.') +param machineLearningName string + +@description('The region of the machine learning workspace') +param machineLearningRegion string = resourceGroup().location + +@description('Specifies the location of the pair resources.') +param pairRegion string = resourceGroup().location + +@description('Base name used for creating all pair resources.') +param pairBaseName string + +@description('Name of the storage account resource to create for the pair') +param storageAccountName string = replace('st${pairBaseName}','-','') // strip '-' because only alphanumeric characters are supported + +@description('Name of the datastore for attaching the storage to the AzureML workspace.') +param datastoreName string = replace('datastore_${pairBaseName}','-','_') + +@description('Name of the default compute cluster for the pair') +param computeName string = '${pairBaseName}-01' + +// for the VM series of the default SKU below, see https://learn.microsoft.com/en-us/azure/virtual-machines/dcasv5-dcadsv5-series +@description('VM size for the compute cluster') +param computeSKU string = 'Standard_DC2as_v5' + +@description('VM nodes for the default compute cluster') +param computeNodes int = 4 + +@description('Name of the UAI for the pair compute cluster') +param uaiName string = 'uai-${pairBaseName}' + +@description('Name of the Network Security Group resource')
+param nsgResourceName string = 'nsg-${pairBaseName}' + +@description('Name of the vNET resource') +param vnetResourceName string = 'vnet-${pairBaseName}' + +@description('Virtual network address prefix') +param vnetAddressPrefix string + +@description('Subnet address prefix') +param subnetPrefix string + +@description('Use a static ip for storage PLE') +param useStorageStaticIP bool = false // 'PLE' presumably stands for private link endpoint (see pairStoragePrivateEndpoint below) - confirm + +@description('Which static IP to use for storage PLE (if useStorageStaticIP is true)') +param storagePLEStaticIP string = '172.19.0.50' + +@description('Subnet name') +param subnetName string = 'snet-training' + +@description('Allow other subnets into the storage (need to be in the same region)') +param allowedSubnetIds array = [] + +@description('Enable compute node public IP') +param enableNodePublicIp bool = true + +@allowed(['Enabled','vNetOnly','Disabled']) +@description('Allow or disallow public network access to Storage Account.') +param storagePublicNetworkAccess string = 'Disabled' + +@description('Allow compute cluster to access storage account with R/W permissions (using UAI)') +param applyDefaultPermissions bool = true + +@description('Name of the private DNS zone for blob') +param blobPrivateDNSZoneName string = 'privatelink.blob.${environment().suffixes.storage}' + +@description('Location of the private DNS zone for blob') +param blobPrivateDNSZoneLocation string = 'global' + +@description('Tags to curate the resources in Azure.') +param tags object = {} + + +// Virtual network and network security group +module nsg '../networking/azureml_compute_nsg.bicep' = { + name: '${nsgResourceName}-deployment' + params: { + location: pairRegion + nsgName: nsgResourceName + tags: tags + workspaceRegion: machineLearningRegion + enableNodePublicIp: enableNodePublicIp + } +} + +module vnet '../networking/vnet.bicep' = { + name: '${vnetResourceName}-deployment' + params: { + location: pairRegion + virtualNetworkName: vnetResourceName + networkSecurityGroupId: nsg.outputs.id +
vnetAddressPrefix: vnetAddressPrefix + subnets: [ + { + name: subnetName + addressPrefix: subnetPrefix + } + ] + tags: tags + } +} + +// provision a user assigned identity for this compute +resource uai 'Microsoft.ManagedIdentity/userAssignedIdentities@2022-01-31-preview' = { + name: uaiName + location: pairRegion + tags: tags +} + + +// create the AKS compute for the pair (confidential compute variant of the AKS module) +module computeDeployment '../computes/vnet_new_aks_with_confcomp.bicep' = { + name: '${pairBaseName}-vnet-aml-compute' + scope: resourceGroup() + params: { + machineLearningName: machineLearningName + machineLearningRegion: machineLearningRegion + + // compute + computeName: computeName + computeRegion: pairRegion + agentVMSize: computeSKU + agentCount: computeNodes + + // identity + computeUaiName: uai.name + + // networking + subnetName: subnetName + subnetId: '${vnet.outputs.id}/subnets/${subnetName}' + + tags: tags + } +} + +// create new blob storage and datastore +module storageDeployment '../storages/new_blob_storage_datastore.bicep' = { + name: '${pairBaseName}-vnet-storage' + scope: resourceGroup() + params: { + machineLearningName: machineLearningName + machineLearningRegion: machineLearningRegion + storageName: storageAccountName + storageRegion: pairRegion + datastoreName: datastoreName + publicNetworkAccess: storagePublicNetworkAccess + subnetIds: concat( + ['${vnet.outputs.id}/subnets/${subnetName}'], + allowedSubnetIds + ) + tags: tags + } +} + +// Create a private endpoint to the pair's storage in the pair's subnet (only when storagePublicNetworkAccess is 'Disabled') +module pairStoragePrivateEndpoint '../networking/private_endpoint.bicep' = if (storagePublicNetworkAccess == 'Disabled') { + name: '${pairBaseName}-endpoint-to-insilo-storage' + scope: resourceGroup() + params: { + location: pairRegion + tags: tags + resourceServiceId: storageDeployment.outputs.storageId + resourceName: storageDeployment.outputs.storageName + pleRootName: 'ple-${storageDeployment.outputs.storageName}-to-${pairBaseName}-st-blob' +
virtualNetworkId: vnet.outputs.id + subnetId: '${vnet.outputs.id}/subnets/${subnetName}' + useStaticIPAddress: useStorageStaticIP + privateIPAddress: storagePLEStaticIP + privateDNSZoneName: blobPrivateDNSZoneName + privateDNSZoneLocation: blobPrivateDNSZoneLocation + groupId: 'blob' + } + dependsOn: [ + storageDeployment + ] +} + +// Set R/W permissions for the pair's compute UAI towards the pair's storage (skipped when applyDefaultPermissions is false) +module pairInternalPermissions '../permissions/msi_storage_rw.bicep' = if(applyDefaultPermissions) { + name: '${pairBaseName}-internal-rw-perms' + scope: resourceGroup() + params: { + storageAccountName: storageDeployment.outputs.storageName + identityPrincipalId: computeDeployment.outputs.identityPrincipalId + } + dependsOn: [ + storageDeployment + computeDeployment + ] +} + +// output the pair config for next actions (permission model) +output identityPrincipalId string = computeDeployment.outputs.identityPrincipalId +output storageName string = storageDeployment.outputs.storageName +output storageServiceId string = storageDeployment.outputs.storageId +output computeName string = computeDeployment.outputs.compute +output region string = pairRegion +output vnetName string = vnet.outputs.name +output vnetId string = vnet.outputs.id +output subnetId string = '${vnet.outputs.id}/subnets/${subnetName}' diff --git a/mlops/bicep/vnet_publicip_sandbox_aks_confcomp_setup.bicep b/mlops/bicep/vnet_publicip_sandbox_aks_confcomp_setup.bicep index 2c12dd0c..e2ea652c 100644 --- a/mlops/bicep/vnet_publicip_sandbox_aks_confcomp_setup.bicep +++ b/mlops/bicep/vnet_publicip_sandbox_aks_confcomp_setup.bicep @@ -99,7 +99,7 @@ var orchestratorStorageAccountName = replace('st${demoBaseName}orch','-','') var orchestratorStorageAccountCleanName = substring(orchestratorStorageAccountName, 0, min(length(orchestratorStorageAccountName),24)) // Create an orchestrator compute+storage pair and attach to workspace -module orchestrator './modules/fl_pairs/vnet_aks_storage_pair.bicep' = { +module
orchestrator './modules/fl_pairs/vnet_aks_with_confcomp_storage_pair.bicep' = { name: '${demoBaseName}-vnetpair-orchestrator' scope: resourceGroup() params: { @@ -155,7 +155,7 @@ var siloCount = length(siloRegions) // Create all silos as a compute+storage pair and attach to workspace // This pair will be considered eyes-off -module silos './modules/fl_pairs/vnet_aks_storage_pair.bicep' = [for i in range(0, siloCount): { +module silos './modules/fl_pairs/vnet_aks_with_confcomp_storage_pair.bicep' = [for i in range(0, siloCount): { name: '${demoBaseName}-vnetpair-silo-${i}' scope: resourceGroup() params: {