diff --git a/configuration.ipynb b/configuration.ipynb index 761aa8fde..91163dc85 100644 --- a/configuration.ipynb +++ b/configuration.ipynb @@ -103,7 +103,7 @@ "source": [ "import azureml.core\n", "\n", - "print(\"This notebook was created using version 1.2.0 of the Azure ML SDK\")\n", + "print(\"This notebook was created using version 1.3.0 of the Azure ML SDK\")\n", "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")" ] }, diff --git a/how-to-use-azureml/automated-machine-learning/README.md b/how-to-use-azureml/automated-machine-learning/README.md index 94af87ae6..ec5aa15a8 100644 --- a/how-to-use-azureml/automated-machine-learning/README.md +++ b/how-to-use-azureml/automated-machine-learning/README.md @@ -117,7 +117,7 @@ jupyter notebook - Simple example of using automated ML for regression - Uses azure compute for training -- [auto-ml-regression-hardware-performance-explanation-and-featurization.ipynb](regression-hardware-performance-explanation-and-featurization/auto-ml-regression-hardware-performance-explanation-and-featurization.ipynb) +- [auto-ml-regression-explanation-featurization.ipynb](regression-explanation-featurization/auto-ml-regression-explanation-featurization.ipynb) - Dataset: Hardware Performance Dataset - Shows featurization and excplanation - Uses azure compute for training @@ -152,7 +152,7 @@ jupyter notebook - Beer Production Forecasting - [auto-ml-continuous-retraining.ipynb](continuous-retraining/auto-ml-continuous-retraining.ipynb) - - Continous retraining using Pipelines and Time-Series TabularDataset + - Continuous retraining using Pipelines and Time-Series TabularDataset - [auto-ml-classification-text-dnn.ipynb](classification-text-dnn/auto-ml-classification-text-dnn.ipynb) - Classification with text data using deep learning in AutoML diff --git a/how-to-use-azureml/automated-machine-learning/automl_env.yml b/how-to-use-azureml/automated-machine-learning/automl_env.yml index 569f205e1..c8bfe39f3 100644 --- a/how-to-use-azureml/automated-machine-learning/automl_env.yml +++ b/how-to-use-azureml/automated-machine-learning/automl_env.yml @@ -26,7 +26,6 @@ dependencies: - azureml-train - azureml-widgets - azureml-pipeline - - azureml-contrib-interpret - pytorch-transformers==1.0.0 - spacy==2.1.8 - onnxruntime==1.0.0 diff --git a/how-to-use-azureml/automated-machine-learning/automl_env_mac.yml b/how-to-use-azureml/automated-machine-learning/automl_env_mac.yml index 1027dc27e..697494672 100644 --- a/how-to-use-azureml/automated-machine-learning/automl_env_mac.yml +++ b/how-to-use-azureml/automated-machine-learning/automl_env_mac.yml @@ -27,7 +27,6 @@ dependencies: - azureml-train - azureml-widgets - azureml-pipeline - - azureml-contrib-interpret - pytorch-transformers==1.0.0 - spacy==2.1.8 - onnxruntime==1.0.0 @@ -36,4 +35,4 @@ dependencies: channels: - anaconda - conda-forge -- pytorch \ No newline at end of file +- pytorch diff --git a/how-to-use-azureml/automated-machine-learning/classification-bank-marketing-all-features/auto-ml-classification-bank-marketing-all-features.ipynb b/how-to-use-azureml/automated-machine-learning/classification-bank-marketing-all-features/auto-ml-classification-bank-marketing-all-features.ipynb index 6a32356d4..351332c03 100644 --- a/how-to-use-azureml/automated-machine-learning/classification-bank-marketing-all-features/auto-ml-classification-bank-marketing-all-features.ipynb +++ 
b/how-to-use-azureml/automated-machine-learning/classification-bank-marketing-all-features/auto-ml-classification-bank-marketing-all-features.ipynb @@ -92,6 +92,23 @@ "from azureml.explain.model._internal.explanation_client import ExplanationClient" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This sample notebook may use features that are not available in previous versions of the Azure ML SDK." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"This notebook was created using version 1.3.0 of the Azure ML SDK\")\n", + "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -132,7 +149,6 @@ "experiment=Experiment(ws, experiment_name)\n", "\n", "output = {}\n", - "output['SDK version'] = azureml.core.VERSION\n", "output['Subscription ID'] = ws.subscription_id\n", "output['Workspace'] = ws.name\n", "output['Resource Group'] = ws.resource_group\n", @@ -160,35 +176,22 @@ "metadata": {}, "outputs": [], "source": [ - "from azureml.core.compute import AmlCompute\n", - "from azureml.core.compute import ComputeTarget\n", + "from azureml.core.compute import ComputeTarget, AmlCompute\n", + "from azureml.core.compute_target import ComputeTargetException\n", "\n", - "# Choose a name for your cluster.\n", - "amlcompute_cluster_name = \"cpu-cluster-4\"\n", + "# Choose a name for your CPU cluster\n", + "cpu_cluster_name = \"cpu-cluster-4\"\n", "\n", - "found = False\n", - "# Check if this compute target already exists in the workspace.\n", - "cts = ws.compute_targets\n", - "if amlcompute_cluster_name in cts and cts[amlcompute_cluster_name].type == 'AmlCompute':\n", - " found = True\n", - " print('Found existing compute target.')\n", - " compute_target = cts[amlcompute_cluster_name]\n", - " \n", - "if not found:\n", - " print('Creating a new compute target...')\n", - " provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\", # for GPU, use \"STANDARD_NC6\"\n", - " #vm_priority = 'lowpriority', # optional\n", - " max_nodes = 6)\n", - "\n", - " # Create the cluster.\n", - " compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)\n", - " \n", - "print('Checking cluster status...')\n", - "# Can poll for a minimum number of nodes and for a specific timeout.\n", - "# If no min_node_count is provided, it will use the scale settings for the cluster.\n", - "compute_target.wait_for_completion(show_output = True, min_node_count = None, timeout_in_minutes = 20)\n", - " \n", - "# For a more detailed view of current AmlCompute status, use get_status()." 
+ "# Verify that cluster does not exist already\n", + "try:\n", + " compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)\n", + " print('Found existing cluster, use it.')\n", + "except ComputeTargetException:\n", + " compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',\n", + " max_nodes=6)\n", + " compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)\n", + "\n", + "compute_target.wait_for_completion(show_output=True)" ] }, { @@ -394,8 +397,6 @@ "outputs": [], "source": [ "#from azureml.train.automl.run import AutoMLRun\n", - "#experiment_name = 'automl-classification-bmarketing'\n", - "#experiment = Experiment(ws, experiment_name)\n", "#remote_run = AutoMLRun(experiment=experiment, run_id='>') # this name is model.id of model that we want to deploy\n", - " # deserialize the model file back into a sklearn model\n", - " model = joblib.load(model_path)\n", - "\n", - "def run(raw_data):\n", - " try:\n", - " data = (pd.DataFrame(np.array(json.loads(raw_data)['data']), columns=[str(i) for i in range(0,64)]))\n", - " result = model.predict(data)\n", - " except Exception as e:\n", - " result = str(e)\n", - " return json.dumps({\"error\": result})\n", - " return json.dumps({\"result\":result.tolist()})" + "## Register the Fitted Model for Deployment\n", + "If neither metric nor iteration are specified in the register_model call, the iteration with the best primary metric is registered." ] }, { @@ -613,43 +586,19 @@ "metadata": {}, "outputs": [], "source": [ - "#Replace <>\n", - "content = \"\"\n", - "with open(\"score.py\", \"r\") as fo:\n", - " content = fo.read()\n", - "\n", - "new_content = content.replace(\"<>\", local_run.model_id)\n", - "with open(\"score.py\", \"w\") as fw:\n", - " fw.write(new_content)" + "description = 'AutoML Model'\n", + "tags = None\n", + "model = local_run.register_model(description = description, tags = tags)\n", + "local_run.model_id # This will be written to the scoring script file later in the notebook." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### Create a YAML File for the Environment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.conda_dependencies import CondaDependencies\n", + "### Deploy the model as a Web Service on Azure Container Instance\n", "\n", - "myenv = CondaDependencies.create(conda_packages=['numpy','scikit-learn'], pip_packages=['azureml-defaults', 'azureml-sdk[automl]'])\n", - "\n", - "conda_env_file_name = 'myenv.yml'\n", - "myenv.save_to_file('.', conda_env_file_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Deploy the model as a Web Service on Azure Container Instance\n", - "Replace servicename with any meaningful name of service" + "Create the configuration needed for deploying the model as a web service service." 
] }, { @@ -658,47 +607,17 @@ "metadata": {}, "outputs": [], "source": [ - "# this will take 10-15 minutes to finish\n", - "\n", - "from azureml.core.webservice import AciWebservice, Webservice\n", - "from azureml.exceptions import WebserviceException\n", "from azureml.core.model import InferenceConfig\n", - "from azureml.core.model import Model\n", + "from azureml.core.webservice import AciWebservice\n", "from azureml.core.environment import Environment\n", - "from azureml.core.conda_dependencies import CondaDependencies\n", - "import uuid\n", - "\n", "\n", - "myaci_config = AciWebservice.deploy_configuration(\n", - " cpu_cores = 2, \n", - " memory_gb = 2, \n", - " tags = {'name':'Databricks Azure ML ACI'}, \n", - " description = 'This is for ADB and AutoML example.')\n", + "myenv = Environment.from_conda_specification(name=\"myenv\", file_path=conda_env_file_name)\n", + "inference_config = InferenceConfig(entry_script=script_file_name, environment=myenv)\n", "\n", - "myenv = Environment.get(ws, name='AzureML-PySpark-MmlSpark-0.15')\n", - "# we need to add extra packages to procured environment\n", - "# in order to deploy amended environment we need to rename it\n", - "myenv.name = 'myenv'\n", - "model_dependencies = CondaDependencies('myenv.yml')\n", - "for pip_dep in model_dependencies.pip_packages:\n", - " myenv.python.conda_dependencies.add_pip_package(pip_dep)\n", - "for conda_dep in model_dependencies.conda_packages:\n", - " myenv.python.conda_dependencies.add_conda_package(conda_dep)\n", - "inference_config = InferenceConfig(entry_script='score_sparkml.py', environment=myenv)\n", - "\n", - "guid = str(uuid.uuid4()).split(\"-\")[0]\n", - "service_name = \"myservice-{}\".format(guid)\n", - "\n", - "# Remove any existing service under the same name.\n", - "try:\n", - " Webservice(ws, service_name).delete()\n", - "except WebserviceException:\n", - " pass\n", - "\n", - "print(\"Creating service with name: {}\".format(service_name))\n", - "\n", - "myservice = Model.deploy(ws, service_name, [model], inference_config, myaci_config)\n", - "myservice.wait_for_deployment(show_output=True)" + "aciconfig = AciWebservice.deploy_configuration(cpu_cores = 1, \n", + " memory_gb = 1, \n", + " tags = {'area': \"digits\", 'type': \"automl_classification\"}, \n", + " description = 'sample service for Automl Classification')" ] }, { @@ -707,8 +626,14 @@ "metadata": {}, "outputs": [], "source": [ - "#for using the Web HTTP API \n", - "print(myservice.scoring_uri)" + "from azureml.core.webservice import Webservice\n", + "from azureml.core.model import Model\n", + "\n", + "aci_service_name = 'automl-databricks-local'\n", + "print(aci_service_name)\n", + "aci_service = Model.deploy(ws, aci_service_name, [model], inference_config, aciconfig)\n", + "aci_service.wait_for_deployment(True)\n", + "print(aci_service.state)" ] }, { @@ -752,7 +677,7 @@ "for index in np.random.choice(len(y_test), 2, replace = False):\n", " print(index)\n", " test_sample = json.dumps({'data':X_test[index:index + 1].values.tolist()})\n", - " predicted = myservice.run(input_data = test_sample)\n", + " predicted = aci_service.run(input_data = test_sample)\n", " label = y_test.values[index]\n", " predictedDict = json.loads(predicted)\n", " title = \"Label value = %d Predicted value = %s \" % ( label,predictedDict['result'][0]) \n", diff --git a/how-to-use-azureml/deployment/deploy-multi-model/multi-model-register-and-deploy.ipynb b/how-to-use-azureml/deployment/deploy-multi-model/multi-model-register-and-deploy.ipynb index 
7c3d7ed75..1fbc55118 100644 --- a/how-to-use-azureml/deployment/deploy-multi-model/multi-model-register-and-deploy.ipynb +++ b/how-to-use-azureml/deployment/deploy-multi-model/multi-model-register-and-deploy.ipynb @@ -285,7 +285,7 @@ "from azureml.exceptions import WebserviceException\n", "\n", "deployment_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1)\n", - "aci_service_name = 'aciservice1'\n", + "aci_service_name = 'aciservice-multimodel'\n", "\n", "try:\n", " # if you want to get existing service below is the command\n", diff --git a/how-to-use-azureml/deployment/deploy-to-cloud/model-register-and-deploy.ipynb b/how-to-use-azureml/deployment/deploy-to-cloud/model-register-and-deploy.ipynb index 44bfde497..6c81ead93 100644 --- a/how-to-use-azureml/deployment/deploy-to-cloud/model-register-and-deploy.ipynb +++ b/how-to-use-azureml/deployment/deploy-to-cloud/model-register-and-deploy.ipynb @@ -388,6 +388,14 @@ "Below is an example of how you can construct an input dataset to profile a service which expects its incoming requests to contain serialized json. In this case we created a dataset based one hundred instances of the same request data. In real world scenarios however, we suggest that you use larger datasets with various inputs, especially if your model resource usage/behavior is input dependent." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You may want to register datasets using the register() method to your workspace so they can be shared with others, reused and referred to by name in your script.\n", + "You can try to get the dataset first to see if it's already registered." + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, @@ -398,36 +406,45 @@ "from azureml.core.dataset import Dataset\n", "from azureml.data import dataset_type_definitions\n", "\n", + "dataset_name='diabetes_sample_request_data'\n", "\n", - "# create a string that can be utf-8 encoded and\n", - "# put in the body of the request\n", - "serialized_input_json = json.dumps({\n", - " 'data': [\n", - " [ 0.03807591, 0.05068012, 0.06169621, 0.02187235, -0.0442235,\n", - " -0.03482076, -0.04340085, -0.00259226, 0.01990842, -0.01764613]\n", - " ]\n", - "})\n", - "dataset_content = []\n", - "for i in range(100):\n", - " dataset_content.append(serialized_input_json)\n", - "dataset_content = '\\n'.join(dataset_content)\n", - "file_name = 'sample_request_data.txt'\n", - "f = open(file_name, 'w')\n", - "f.write(dataset_content)\n", - "f.close()\n", - "\n", - "# upload the txt file created above to the Datastore and create a dataset from it\n", - "data_store = Datastore.get_default(ws)\n", - "data_store.upload_files(['./' + file_name], target_path='sample_request_data')\n", - "datastore_path = [(data_store, 'sample_request_data' +'/' + file_name)]\n", - "sample_request_data = Dataset.Tabular.from_delimited_files(\n", - " datastore_path,\n", - " separator='\\n',\n", - " infer_column_types=True,\n", - " header=dataset_type_definitions.PromoteHeadersBehavior.NO_HEADERS)\n", - "sample_request_data = sample_request_data.register(workspace=ws,\n", - " name='diabetes_sample_request_data',\n", - " create_new_version=True)" + "dataset_registered = False\n", + "try:\n", + " sample_request_data = Dataset.get_by_name(workspace = ws, name = dataset_name)\n", + " dataset_registered = True\n", + "except:\n", + " print(\"The dataset {} is not registered in workspace yet.\".format(dataset_name))\n", + "\n", + "if not dataset_registered:\n", + " # create a string that can be utf-8 encoded 
and\n", + " # put in the body of the request\n", + " serialized_input_json = json.dumps({\n", + " 'data': [\n", + " [ 0.03807591, 0.05068012, 0.06169621, 0.02187235, -0.0442235,\n", + " -0.03482076, -0.04340085, -0.00259226, 0.01990842, -0.01764613]\n", + " ]\n", + " })\n", + " dataset_content = []\n", + " for i in range(100):\n", + " dataset_content.append(serialized_input_json)\n", + " dataset_content = '\\n'.join(dataset_content)\n", + " file_name = \"{}.txt\".format(dataset_name)\n", + " f = open(file_name, 'w')\n", + " f.write(dataset_content)\n", + " f.close()\n", + "\n", + " # upload the txt file created above to the Datastore and create a dataset from it\n", + " data_store = Datastore.get_default(ws)\n", + " data_store.upload_files(['./' + file_name], target_path='sample_request_data')\n", + " datastore_path = [(data_store, 'sample_request_data' +'/' + file_name)]\n", + " sample_request_data = Dataset.Tabular.from_delimited_files(\n", + " datastore_path,\n", + " separator='\\n',\n", + " infer_column_types=True,\n", + " header=dataset_type_definitions.PromoteHeadersBehavior.NO_HEADERS)\n", + " sample_request_data = sample_request_data.register(workspace=ws,\n", + " name=dataset_name,\n", + " create_new_version=True)" ] }, { @@ -512,7 +529,7 @@ "metadata": { "authors": [ { - "name": "aashishb" + "name": "vaidyas" } ], "category": "deployment", diff --git a/how-to-use-azureml/deployment/onnx/onnx-model-register-and-deploy.ipynb b/how-to-use-azureml/deployment/onnx/onnx-model-register-and-deploy.ipynb index f0334e854..42fdb6d8a 100644 --- a/how-to-use-azureml/deployment/onnx/onnx-model-register-and-deploy.ipynb +++ b/how-to-use-azureml/deployment/onnx/onnx-model-register-and-deploy.ipynb @@ -202,7 +202,7 @@ "metadata": { "authors": [ { - "name": "aashishb" + "name": "vaidyas" } ], "kernelspec": { diff --git a/how-to-use-azureml/deployment/production-deploy-to-aks-gpu/production-deploy-to-aks-gpu.ipynb b/how-to-use-azureml/deployment/production-deploy-to-aks-gpu/production-deploy-to-aks-gpu.ipynb index adbd1f3ec..aed26009b 100644 --- a/how-to-use-azureml/deployment/production-deploy-to-aks-gpu/production-deploy-to-aks-gpu.ipynb +++ b/how-to-use-azureml/deployment/production-deploy-to-aks-gpu/production-deploy-to-aks-gpu.ipynb @@ -288,7 +288,7 @@ "metadata": { "authors": [ { - "name": "aashishb" + "name": "vaidyas" } ], "kernelspec": { diff --git a/how-to-use-azureml/deployment/production-deploy-to-aks/production-deploy-to-aks.ipynb b/how-to-use-azureml/deployment/production-deploy-to-aks/production-deploy-to-aks.ipynb index 8878db806..5ea43c868 100644 --- a/how-to-use-azureml/deployment/production-deploy-to-aks/production-deploy-to-aks.ipynb +++ b/how-to-use-azureml/deployment/production-deploy-to-aks/production-deploy-to-aks.ipynb @@ -217,6 +217,14 @@ "Below is an example of how you can construct an input dataset to profile a service which expects its incoming requests to contain serialized json. In this case we created a dataset based one hundred instances of the same request data. In real world scenarios however, we suggest that you use larger datasets with various inputs, especially if your model resource usage/behavior is input dependent." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You may want to register datasets using the register() method to your workspace so they can be shared with others, reused and referred to by name in your script.\n", + "You can try get the dataset first to see if it's already registered." 
+ ] + }, { "cell_type": "code", "execution_count": null, @@ -228,31 +236,41 @@ "from azureml.core.dataset import Dataset\n", "from azureml.data import dataset_type_definitions\n", "\n", - "input_json = {'data': [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n", - " [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]]}\n", - "# create a string that can be put in the body of the request\n", - "serialized_input_json = json.dumps(input_json)\n", - "dataset_content = []\n", - "for i in range(100):\n", - " dataset_content.append(serialized_input_json)\n", - "sample_request_data = '\\n'.join(dataset_content)\n", - "file_name = 'sample_request_data.txt'\n", - "f = open(file_name, 'w')\n", - "f.write(sample_request_data)\n", - "f.close()\n", - "\n", - "# upload the txt file created above to the Datastore and create a dataset from it\n", - "data_store = Datastore.get_default(ws)\n", - "data_store.upload_files(['./' + file_name], target_path='sample_request_data')\n", - "datastore_path = [(data_store, 'sample_request_data' +'/' + file_name)]\n", - "sample_request_data = Dataset.Tabular.from_delimited_files(\n", - " datastore_path,\n", - " separator='\\n',\n", - " infer_column_types=True,\n", - " header=dataset_type_definitions.PromoteHeadersBehavior.NO_HEADERS)\n", - "sample_request_data = sample_request_data.register(workspace=ws,\n", - " name='sample_request_data',\n", - " create_new_version=True)" + "dataset_name='sample_request_data'\n", + "\n", + "dataset_registered = False\n", + "try:\n", + " sample_request_data = Dataset.get_by_name(workspace = ws, name = dataset_name)\n", + " dataset_registered = True\n", + "except:\n", + " print(\"The dataset {} is not registered in workspace yet.\".format(dataset_name))\n", + "\n", + "if not dataset_registered:\n", + " input_json = {'data': [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n", + " [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]]}\n", + " # create a string that can be put in the body of the request\n", + " serialized_input_json = json.dumps(input_json)\n", + " dataset_content = []\n", + " for i in range(100):\n", + " dataset_content.append(serialized_input_json)\n", + " sample_request_data = '\\n'.join(dataset_content)\n", + " file_name = \"{}.txt\".format(dataset_name)\n", + " f = open(file_name, 'w')\n", + " f.write(sample_request_data)\n", + " f.close()\n", + "\n", + " # upload the txt file created above to the Datastore and create a dataset from it\n", + " data_store = Datastore.get_default(ws)\n", + " data_store.upload_files(['./' + file_name], target_path='sample_request_data')\n", + " datastore_path = [(data_store, 'sample_request_data' +'/' + file_name)]\n", + " sample_request_data = Dataset.Tabular.from_delimited_files(\n", + " datastore_path,\n", + " separator='\\n',\n", + " infer_column_types=True,\n", + " header=dataset_type_definitions.PromoteHeadersBehavior.NO_HEADERS)\n", + " sample_request_data = sample_request_data.register(workspace=ws,\n", + " name=dataset_name,\n", + " create_new_version=True)" ] }, { @@ -560,7 +578,7 @@ "metadata": { "authors": [ { - "name": "aashishb" + "name": "vaidyas" } ], "kernelspec": { diff --git a/how-to-use-azureml/deployment/spark/model-register-and-deploy-spark.ipynb b/how-to-use-azureml/deployment/spark/model-register-and-deploy-spark.ipynb index 90c0c2783..a254f8982 100644 --- a/how-to-use-azureml/deployment/spark/model-register-and-deploy-spark.ipynb +++ b/how-to-use-azureml/deployment/spark/model-register-and-deploy-spark.ipynb @@ -302,7 +302,7 @@ "metadata": { "authors": [ { - "name": "aashishb" + "name": "vaidyas" } ], "category": "deployment", diff 
--git a/how-to-use-azureml/deployment/tensorflow/tensorflow-model-register-and-deploy.ipynb b/how-to-use-azureml/deployment/tensorflow/tensorflow-model-register-and-deploy.ipynb index 10a33aa5a..7cdbcca4c 100644 --- a/how-to-use-azureml/deployment/tensorflow/tensorflow-model-register-and-deploy.ipynb +++ b/how-to-use-azureml/deployment/tensorflow/tensorflow-model-register-and-deploy.ipynb @@ -234,7 +234,7 @@ "metadata": { "authors": [ { - "name": "aashishb" + "name": "vaidyas" } ], "kernelspec": { diff --git a/how-to-use-azureml/explain-model/azure-integration/remote-explanation/explain-model-on-amlcompute.ipynb b/how-to-use-azureml/explain-model/azure-integration/remote-explanation/explain-model-on-amlcompute.ipynb index b444686b5..2cae8957d 100644 --- a/how-to-use-azureml/explain-model/azure-integration/remote-explanation/explain-model-on-amlcompute.ipynb +++ b/how-to-use-azureml/explain-model/azure-integration/remote-explanation/explain-model-on-amlcompute.ipynb @@ -687,12 +687,12 @@ "source": [ "## Next\n", "Learn about other use cases of the explain package on a:\n", - "1. [Training time: regression problem](../../tabular-data/explain-binary-classification-local.ipynb) \n", - "1. [Training time: binary classification problem](../../tabular-data/explain-binary-classification-local.ipynb)\n", - "1. [Training time: multiclass classification problem](../../tabular-data/explain-multiclass-classification-local.ipynb)\n", + "1. [Training time: regression problem](https://github.com/interpretml/interpret-community/blob/master/notebooks/explain-regression-local.ipynb) \n", + "1. [Training time: binary classification problem](https://github.com/interpretml/interpret-community/blob/master/notebooks/explain-binary-classification-local.ipynb)\n", + "1. [Training time: multiclass classification problem](https://github.com/interpretml/interpret-community/blob/master/notebooks/explain-multiclass-classification-local.ipynb)\n", "1. Explain models with engineered features:\n", - " 1. [Simple feature transformations](../../tabular-data/simple-feature-transformations-explain-local.ipynb)\n", - " 1. [Advanced feature transformations](../../tabular-data/advanced-feature-transformations-explain-local.ipynb)\n", + " 1. [Simple feature transformations](https://github.com/interpretml/interpret-community/blob/master/notebooks/simple-feature-transformations-explain-local.ipynb)\n", + " 1. [Advanced feature transformations](https://github.com/interpretml/interpret-community/blob/master/notebooks/advanced-feature-transformations-explain-local.ipynb)\n", "1. [Save model explanations via Azure Machine Learning Run History](../run-history/save-retrieve-explanations-run-history.ipynb)\n", "1. Inferencing time: deploy a classification model and explainer:\n", " 1. 
[Deploy a locally-trained model and explainer](../scoring-time/train-explain-model-locally-and-deploy.ipynb)\n", diff --git a/how-to-use-azureml/explain-model/azure-integration/remote-explanation/explain-model-on-amlcompute.yml b/how-to-use-azureml/explain-model/azure-integration/remote-explanation/explain-model-on-amlcompute.yml index aaa7070f9..7a4b22f77 100644 --- a/how-to-use-azureml/explain-model/azure-integration/remote-explanation/explain-model-on-amlcompute.yml +++ b/how-to-use-azureml/explain-model/azure-integration/remote-explanation/explain-model-on-amlcompute.yml @@ -3,6 +3,8 @@ dependencies: - pip: - azureml-sdk - azureml-interpret + - interpret-community[visualization] + - matplotlib - azureml-contrib-interpret - sklearn-pandas - azureml-dataprep diff --git a/how-to-use-azureml/explain-model/azure-integration/run-history/save-retrieve-explanations-run-history.ipynb b/how-to-use-azureml/explain-model/azure-integration/run-history/save-retrieve-explanations-run-history.ipynb index 1343d51bc..60dca68de 100644 --- a/how-to-use-azureml/explain-model/azure-integration/run-history/save-retrieve-explanations-run-history.ipynb +++ b/how-to-use-azureml/explain-model/azure-integration/run-history/save-retrieve-explanations-run-history.ipynb @@ -582,12 +582,12 @@ "source": [ "## Next\n", "Learn about other use cases of the explain package on a:\n", - "1. [Training time: regression problem](../../tabular-data/explain-binary-classification-local.ipynb) \n", - "1. [Training time: binary classification problem](../../tabular-data/explain-binary-classification-local.ipynb)\n", - "1. [Training time: multiclass classification problem](../../tabular-data/explain-multiclass-classification-local.ipynb)\n", + "1. [Training time: regression problem](https://github.com/interpretml/interpret-community/blob/master/notebooks/explain-regression-local.ipynb) \n", + "1. [Training time: binary classification problem](https://github.com/interpretml/interpret-community/blob/master/notebooks/explain-binary-classification-local.ipynb)\n", + "1. [Training time: multiclass classification problem](https://github.com/interpretml/interpret-community/blob/master/notebooks/explain-multiclass-classification-local.ipynb)\n", "1. Explain models with engineered features:\n", - " 1. [Simple feature transformations](../../tabular-data/simple-feature-transformations-explain-local.ipynb)\n", - " 1. [Advanced feature transformations](../../tabular-data/advanced-feature-transformations-explain-local.ipynb)\n", + " 1. [Simple feature transformations](https://github.com/interpretml/interpret-community/blob/master/notebooks/simple-feature-transformations-explain-local.ipynb)\n", + " 1. [Advanced feature transformations](https://github.com/interpretml/interpret-community/blob/master/notebooks/advanced-feature-transformations-explain-local.ipynb)\n", "1. [Run explainers remotely on Azure Machine Learning Compute (AMLCompute)](../remote-explanation/explain-model-on-amlcompute.ipynb)\n", "1. Inferencing time: deploy a classification model and explainer:\n", " 1. 
[Deploy a locally-trained model and explainer](../scoring-time/train-explain-model-locally-and-deploy.ipynb)\n", diff --git a/how-to-use-azureml/explain-model/azure-integration/run-history/save-retrieve-explanations-run-history.yml b/how-to-use-azureml/explain-model/azure-integration/run-history/save-retrieve-explanations-run-history.yml index 2dee986fa..ff76d75f3 100644 --- a/how-to-use-azureml/explain-model/azure-integration/run-history/save-retrieve-explanations-run-history.yml +++ b/how-to-use-azureml/explain-model/azure-integration/run-history/save-retrieve-explanations-run-history.yml @@ -3,5 +3,7 @@ dependencies: - pip: - azureml-sdk - azureml-interpret + - interpret-community[visualization] + - matplotlib - azureml-contrib-interpret - ipywidgets diff --git a/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-locally-and-deploy.ipynb b/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-locally-and-deploy.ipynb index 24b34c528..7a1af0297 100644 --- a/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-locally-and-deploy.ipynb +++ b/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-locally-and-deploy.ipynb @@ -445,12 +445,12 @@ "source": [ "## Next\n", "Learn about other use cases of the explain package on a:\n", - "1. [Training time: regression problem](../../tabular-data/explain-binary-classification-local.ipynb) \n", - "1. [Training time: binary classification problem](../../tabular-data/explain-binary-classification-local.ipynb)\n", - "1. [Training time: multiclass classification problem](../../tabular-data/explain-multiclass-classification-local.ipynb)\n", + "1. [Training time: regression problem](https://github.com/interpretml/interpret-community/blob/master/notebooks/explain-regression-local.ipynb) \n", + "1. [Training time: binary classification problem](https://github.com/interpretml/interpret-community/blob/master/notebooks/explain-binary-classification-local.ipynb)\n", + "1. [Training time: multiclass classification problem](https://github.com/interpretml/interpret-community/blob/master/notebooks/explain-multiclass-classification-local.ipynb)\n", "1. Explain models with engineered features:\n", - " 1. [Simple feature transformations](../../tabular-data/simple-feature-transformations-explain-local.ipynb)\n", - " 1. [Advanced feature transformations](../../tabular-data/advanced-feature-transformations-explain-local.ipynb)\n", + " 1. [Simple feature transformations](https://github.com/interpretml/interpret-community/blob/master/notebooks/simple-feature-transformations-explain-local.ipynb)\n", + " 1. [Advanced feature transformations](https://github.com/interpretml/interpret-community/blob/master/notebooks/advanced-feature-transformations-explain-local.ipynb)\n", "1. [Save model explanations via Azure Machine Learning Run History](../run-history/save-retrieve-explanations-run-history.ipynb)\n", "1. [Run explainers remotely on Azure Machine Learning Compute (AMLCompute)](../remote-explanation/explain-model-on-amlcompute.ipynb)\n", "1. 
[Inferencing time: deploy a remotely-trained model and explainer](./train-explain-model-on-amlcompute-and-deploy.ipynb)" diff --git a/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-locally-and-deploy.yml b/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-locally-and-deploy.yml index 7236444a6..b7a4cd6fd 100644 --- a/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-locally-and-deploy.yml +++ b/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-locally-and-deploy.yml @@ -3,6 +3,8 @@ dependencies: - pip: - azureml-sdk - azureml-interpret + - interpret-community[visualization] + - matplotlib - azureml-contrib-interpret - sklearn-pandas - ipywidgets diff --git a/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-on-amlcompute-and-deploy.ipynb b/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-on-amlcompute-and-deploy.ipynb index a8f8b88f9..946eef5ca 100644 --- a/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-on-amlcompute-and-deploy.ipynb +++ b/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-on-amlcompute-and-deploy.ipynb @@ -483,16 +483,15 @@ "source": [ "## Next\n", "Learn about other use cases of the explain package on a:\n", - "1. [Training time: regression problem](../../tabular-data/explain-binary-classification-local.ipynb) \n", - "1. [Training time: binary classification problem](../../tabular-data/explain-binary-classification-local.ipynb)\n", - "1. [Training time: multiclass classification problem](../../tabular-data/explain-multiclass-classification-local.ipynb)\n", + "1. [Training time: regression problem](https://github.com/interpretml/interpret-community/blob/master/notebooks/explain-regression-local.ipynb) \n", + "1. [Training time: binary classification problem](https://github.com/interpretml/interpret-community/blob/master/notebooks/explain-binary-classification-local.ipynb)\n", + "1. [Training time: multiclass classification problem](https://github.com/interpretml/interpret-community/blob/master/notebooks/explain-multiclass-classification-local.ipynb)\n", "1. Explain models with engineered features:\n", - " 1. [Simple feature transformations](../../tabular-data/simple-feature-transformations-explain-local.ipynb)\n", - " 1. [Advanced feature transformations](../../tabular-data/advanced-feature-transformations-explain-local.ipynb)\n", + " 1. [Simple feature transformations](https://github.com/interpretml/interpret-community/blob/master/notebooks/simple-feature-transformations-explain-local.ipynb)\n", + " 1. [Advanced feature transformations](https://github.com/interpretml/interpret-community/blob/master/notebooks/advanced-feature-transformations-explain-local.ipynb)\n", "1. [Save model explanations via Azure Machine Learning Run History](../run-history/save-retrieve-explanations-run-history.ipynb)\n", "1. [Run explainers remotely on Azure Machine Learning Compute (AMLCompute)](../remote-explanation/explain-model-on-amlcompute.ipynb)\n", - "1. [Inferencing time: deploy a locally-trained model and explainer](./train-explain-model-locally-and-deploy.ipynb)\n", - " " + "1. 
[Inferencing time: deploy a locally-trained model and explainer](./train-explain-model-locally-and-deploy.ipynb)" ] }, { diff --git a/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-on-amlcompute-and-deploy.yml b/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-on-amlcompute-and-deploy.yml index ab91cef6f..ab5f0f94b 100644 --- a/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-on-amlcompute-and-deploy.yml +++ b/how-to-use-azureml/explain-model/azure-integration/scoring-time/train-explain-model-on-amlcompute-and-deploy.yml @@ -3,6 +3,8 @@ dependencies: - pip: - azureml-sdk - azureml-interpret + - interpret-community[visualization] + - matplotlib - azureml-contrib-interpret - sklearn-pandas - azureml-dataprep diff --git a/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-automated-machine-learning-step.ipynb b/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-automated-machine-learning-step.ipynb index 7b1017c6a..34a2a723e 100644 --- a/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-automated-machine-learning-step.ipynb +++ b/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-automated-machine-learning-step.ipynb @@ -105,7 +105,7 @@ "metadata": {}, "source": [ "## Create an Azure ML experiment\n", - "Let's create an experiment named \"automl-classification\" and a folder to hold the training scripts. The script runs will be recorded under the experiment in Azure.\n", + "Let's create an experiment named \"automlstep-classification\" and a folder to hold the training scripts. The script runs will be recorded under the experiment in Azure.\n", "\n", "The best practice is to use separate folders for scripts and its dependent files for each step and specify that folder as the `source_directory` for the step. This helps reduce the size of the snapshot created for the step (only the specific folder is snapshotted). Since changes in any files in the `source_directory` would trigger a re-upload of the snapshot, this helps keep the reuse of the step when there are no changes in the `source_directory` of the step." ] @@ -165,20 +165,6 @@ " # For a more detailed view of current AmlCompute status, use get_status()." ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# create a new RunConfig object\n", - "conda_run_config = RunConfiguration(framework=\"python\")\n", - "cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'])\n", - "conda_run_config.environment.python.conda_dependencies = cd\n", - "\n", - "print('run config is ready')" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -192,19 +178,30 @@ "metadata": {}, "outputs": [], "source": [ - "# The data referenced here was a 1MB simple random sample of the Chicago Crime data into a local temporary directory.\n", - "example_data = 'https://dprepdata.blob.core.windows.net/demo/crime0-random.csv'\n", - "dataset = Dataset.Tabular.from_delimited_files(example_data)\n", - "dataset.to_pandas_dataframe().describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dataset.take(5).to_pandas_dataframe()" + "# Try to load the dataset from the Workspace. 
Otherwise, create it from the file\n", + "found = False\n", + "key = \"Crime-Dataset\"\n", + "description_text = \"Crime Dataset (used in the aml-pipelines-with-automated-machine-learning-step.ipynb notebook)\"\n", + "\n", + "if key in ws.datasets.keys(): \n", + " found = True\n", + " dataset = ws.datasets[key] \n", + "\n", + "if not found:\n", + " # Create AML Dataset and register it into Workspace\n", + " # The data referenced here was a 1MB simple random sample of the Chicago Crime data into a local temporary directory.\n", + " example_data = 'https://dprepdata.blob.core.windows.net/demo/crime0-random.csv'\n", + " dataset = Dataset.Tabular.from_delimited_files(example_data)\n", + " dataset = dataset.drop_columns(['FBI Code'])\n", + " \n", + " #Register Dataset in Workspace\n", + " dataset = dataset.register(workspace=ws,\n", + " name=key,\n", + " description=description_text)\n", + "\n", + "\n", + "df = dataset.to_pandas_dataframe()\n", + "df.describe()" ] }, { @@ -224,9 +221,7 @@ "metadata": {}, "outputs": [], "source": [ - "X = dataset.drop_columns(columns=['Primary Type', 'FBI Code'])\n", - "y = dataset.keep_columns(columns=['Primary Type'], validate=True)\n", - "print('X and y are ready!')" + "dataset.take(5).to_pandas_dataframe()" ] }, { @@ -244,19 +239,18 @@ "outputs": [], "source": [ "automl_settings = {\n", - " \"iteration_timeout_minutes\" : 5,\n", - " \"iterations\" : 2,\n", - " \"primary_metric\" : 'AUC_weighted',\n", - " \"preprocess\" : True,\n", - " \"verbosity\" : logging.INFO\n", + " \"experiment_timeout_minutes\": 20,\n", + " \"max_concurrent_iterations\": 4,\n", + " \"primary_metric\" : 'AUC_weighted'\n", "}\n", - "automl_config = AutoMLConfig(task = 'classification',\n", - " debug_log = 'automl_errors.log',\n", + "automl_config = AutoMLConfig(compute_target=compute_target,\n", + " task = \"classification\",\n", + " training_data=dataset,\n", + " label_column_name=\"Primary Type\", \n", " path = project_folder,\n", - " compute_target=compute_target,\n", - " run_configuration=conda_run_config,\n", - " X = X,\n", - " y = y,\n", + " enable_early_stopping= True,\n", + " featurization= 'auto',\n", + " debug_log = \"automl_errors.log\",\n", + " **automl_settings\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "#### Create Pipeline and AutoMLStep\n", + "\n", "You can define outputs for the AutoMLStep using TrainingOutput."
] }, @@ -300,7 +296,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "automlstep-remarks-sample1" + ] + }, "outputs": [], "source": [ "automl_step = AutoMLStep(\n", @@ -313,7 +313,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [ + "automlstep-remarks-sample2" + ] + }, "outputs": [], "source": [ "from azureml.pipeline.core import Pipeline\n", @@ -378,8 +382,8 @@ "outputs": [], "source": [ "import json\n", - "with open(metrics_output._path_on_datastore) as f: \n", - " metrics_output_result = f.read()\n", + "with open(metrics_output._path_on_datastore) as f:\n", + " metrics_output_result = f.read()\n", " \n", "deserialized_metrics_output = json.loads(metrics_output_result)\n", "df = pd.DataFrame(deserialized_metrics_output)\n", @@ -399,6 +403,7 @@ "metadata": {}, "outputs": [], "source": [ + "# Retrieve best model from Pipeline Run\n", "best_model_output = pipeline_run.get_pipeline_output(best_model_output_name)\n", "num_file_downloaded = best_model_output.download('.', show_progress=True)" ] @@ -416,6 +421,15 @@ "best_model" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "best_model.steps" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -431,11 +445,11 @@ "metadata": {}, "outputs": [], "source": [ - "dataset = Dataset.Tabular.from_delimited_files(path='https://dprepdata.blob.core.windows.net/demo/crime0-test.csv')\n", + "dataset_test = Dataset.Tabular.from_delimited_files(path='https://dprepdata.blob.core.windows.net/demo/crime0-test.csv')\n", "df_test = dataset_test.to_pandas_dataframe()\n", - "df_test = df_test[pd.notnull(df['Primary Type'])]\n", + "df_test = df_test[pd.notnull(df_test['Primary Type'])]\n", "\n", - "y_test = df_test[['Primary Type']]\n", + "y_test = df_test['Primary Type']\n", "X_test = df_test.drop(['Primary Type', 'FBI Code'], axis=1)" ] }, @@ -454,15 +468,19 @@ "metadata": {}, "outputs": [], "source": [ - "from pandas_ml import ConfusionMatrix\n", - "\n", + "from sklearn.metrics import confusion_matrix\n", "ypred = best_model.predict(X_test)\n", - "\n", - "cm = ConfusionMatrix(y_test['Primary Type'], ypred)\n", - "\n", - "print(cm)\n", - "\n", - "cm.plot()" + "cm = confusion_matrix(y_test, ypred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize the confusion matrix\n", + "pd.DataFrame(cm).style.background_gradient(cmap='Blues', low=0, high=0.9)" ] } ], diff --git a/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/nyc-taxi-data-regression-model-building.ipynb b/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/nyc-taxi-data-regression-model-building.ipynb index 727de8865..aace1bf8a 100644 --- a/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/nyc-taxi-data-regression-model-building.ipynb +++ b/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/nyc-taxi-data-regression-model-building.ipynb @@ -16,16 +16,12 @@ "\n", "You can combine the two part tutorial into one using AzureML Pipelines as Pipelines provide a way to stitch together various steps involved (like data preparation and training in this case) in a machine learning workflow.\n", "\n", - "In this notebook, you learn how to prepare data for regression modeling by using the [Azure Machine Learning Data Prep 
SDK](https://aka.ms/data-prep-sdk) for Python. You run various transformations to filter and combine two different NYC taxi data sets. Once you prepare the NYC taxi data for regression modeling, then you will use [AutoMLStep](https://docs.microsoft.com/en-us/python/api/azureml-train-automl/azureml.train.automl.automlstep?view=azure-ml-py) available with [Azure Machine Learning Pipelines](https://aka.ms/aml-pipelines) to define your machine learning goals and constraints as well as to launch the automated machine learning process. The automated machine learning technique iterates over many combinations of algorithms and hyperparameters until it finds the best model based on your criterion.\n", + "In this notebook, you learn how to prepare data for regression modeling by using open source library [pandas](https://pandas.pydata.org/). You run various transformations to filter and combine two different NYC taxi datasets. Once you prepare the NYC taxi data for regression modeling, then you will use [AutoMLStep](https://docs.microsoft.com/python/api/azureml-train-automl-runtime/azureml.train.automl.runtime.automl_step.automlstep?view=azure-ml-py) available with [Azure Machine Learning Pipelines](https://aka.ms/aml-pipelines) to define your machine learning goals and constraints as well as to launch the automated machine learning process. The automated machine learning technique iterates over many combinations of algorithms and hyperparameters until it finds the best model based on your criterion.\n", "\n", "After you complete building the model, you can predict the cost of a taxi trip by training a model on data features. These features include the pickup day and time, the number of passengers, and the pickup location.\n", "\n", "## Prerequisite\n", - "If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, make sure you go through the configuration Notebook located at https://github.com/Azure/MachineLearningNotebooks first if you haven't. This sets you up with a working config file that has information on your workspace, subscription id, etc.\n", - "\n", - "We will run various transformations to filter and combine two different NYC taxi data sets. We will use DataPrep SDK for this preparing data. \n", - "\n", - "Perform `pip install azureml-dataprep` if you have't already done so." + "If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, make sure you go through the configuration Notebook located at https://github.com/Azure/MachineLearningNotebooks first if you haven't. This sets you up with a working config file that has information on your workspace, subscription id, etc." 
] }, { @@ -108,7 +104,6 @@ "metadata": {}, "outputs": [], "source": [ - "import azureml.dataprep as dprep\n", "from IPython.display import display\n", "\n", "display(green_df_raw.head(5))\n", @@ -144,8 +139,8 @@ "if not os.path.exists(yelloDir):\n", " os.mkdir(yelloDir)\n", " \n", - "greenTaxiData = greenDir + \"/part-00000\"\n", - "yellowTaxiData = yelloDir + \"/part-00000\"\n", + "greenTaxiData = greenDir + \"/unprepared.parquet\"\n", + "yellowTaxiData = yelloDir + \"/unprepared.parquet\"\n", "\n", "green_df_raw.to_csv(greenTaxiData, index=False)\n", "yellow_df_raw.to_csv(yellowTaxiData, index=False)\n", @@ -169,17 +164,54 @@ "\n", "default_store.upload_files([greenTaxiData], \n", " target_path = 'green', \n", - " overwrite = False, \n", + " overwrite = True, \n", " show_progress = True)\n", "\n", "default_store.upload_files([yellowTaxiData], \n", " target_path = 'yellow', \n", - " overwrite = False, \n", + " overwrite = True, \n", " show_progress = True)\n", "\n", "print(\"Upload calls completed.\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create and register datasets\n", + "\n", + "By creating a dataset, you create a reference to the data source location. If you applied any subsetting transformations to the dataset, they will be stored in the dataset as well. You can learn more about what subsetting capabilities are supported by referring to [our documentation](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.tabular_dataset.tabulardataset?view=azure-ml-py#remarks). The data remains in its existing location, so no extra storage cost is incurred." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Dataset\n", + "green_taxi_data = Dataset.Tabular.from_delimited_files(default_store.path('green/unprepared.parquet'))\n", + "yellow_taxi_data = Dataset.Tabular.from_delimited_files(default_store.path('yellow/unprepared.parquet'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Register the taxi datasets with the workspace so that you can reuse them in other experiments or share them with your colleagues who have access to your workspace." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "green_taxi_data = green_taxi_data.register(ws, 'green_taxi_data')\n", + "yellow_taxi_data = yellow_taxi_data.register(ws, 'yellow_taxi_data')" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -194,20 +226,22 @@ "metadata": {}, "outputs": [], "source": [ - "from azureml.core.compute import AmlCompute\n", - "from azureml.core.compute import ComputeTarget\n", + "from azureml.core.compute import ComputeTarget, AmlCompute\n", + "from azureml.core.compute_target import ComputeTargetException\n", "\n", - "aml_compute = ws.get_default_compute_target(\"CPU\")\n", + "# Choose a name for your CPU cluster\n", + "amlcompute_cluster_name = \"cpu-cluster\"\n", "\n", - "if aml_compute is None:\n", - " amlcompute_cluster_name = \"cpu-cluster\"\n", - " provisioning_config = AmlCompute.provisioning_configuration(vm_size = \"STANDARD_D2_V2\",\n", - " max_nodes = 4)\n", + "# Verify that cluster does not exist already\n", + "try:\n", + " aml_compute = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)\n", + " print('Found existing cluster, use it.')\n", + "except ComputeTargetException:\n", + " compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',\n", + " max_nodes=4)\n", + " aml_compute = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)\n", "\n", - " aml_compute = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)\n", - " aml_compute.wait_for_completion(show_output = True, min_node_count = None, timeout_in_minutes = 20)\n", - "\n", - "aml_compute" + "aml_compute.wait_for_completion(show_output=True)" ] }, { @@ -215,7 +249,7 @@ "metadata": {}, "source": [ "#### Define RunConfig for the compute\n", - "We need `azureml-dataprep` SDK for all the steps below. We will also use `pandas`, `scikit-learn` and `automl` for the training step. Defining the `runconfig` for that." + "We will also use `pandas`, `scikit-learn` and `automl`, `pyarrow` for the pipeline steps. Defining the `runconfig` for that." ] }, { @@ -242,13 +276,10 @@ "# Use conda_dependencies.yml to create a conda environment in the Docker image for execution\n", "aml_run_config.environment.python.user_managed_dependencies = False\n", "\n", - "# Auto-prepare the Docker image when used for execution (if it is not already prepared)\n", - "aml_run_config.auto_prepare_environment = True\n", - "\n", "# Specify CondaDependencies obj, add necessary packages\n", "aml_run_config.environment.python.conda_dependencies = CondaDependencies.create(\n", " conda_packages=['pandas','scikit-learn'], \n", - " pip_packages=['azureml-sdk', 'azureml-dataprep', 'azureml-train-automl'], \n", + " pip_packages=['azureml-sdk[automl,explain]', 'pyarrow'], \n", " pin_sdk_version=False)\n", "\n", "print (\"Run configuration created.\")" @@ -259,7 +290,7 @@ "metadata": {}, "source": [ "### Prepare data\n", - "Now we will prepare for regression modeling by using the `Azure Machine Learning Data Prep SDK for Python`. We run various transformations to filter and combine two different NYC taxi data sets.\n", + "Now we will prepare for regression modeling by using `pandas`. We run various transformations to filter and combine two different NYC taxi datasets.\n", "\n", "We achieve this by creating a separate step for each transformation as this allows us to reuse the steps and saves us from running all over again in case of any change. 
We will keep data preparation scripts in one subfolder and training scripts in another.\n", "\n", @@ -270,7 +301,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Define Useful Colums\n", + "#### Define Useful Columns\n", "Here we are defining a set of \"useful\" columns for both Green and Yellow taxi data." ] }, @@ -304,18 +335,12 @@ "metadata": {}, "outputs": [], "source": [ - "from azureml.data.data_reference import DataReference \n", "from azureml.pipeline.core import PipelineData\n", "from azureml.pipeline.steps import PythonScriptStep\n", "\n", "# python scripts folder\n", "prepare_data_folder = './scripts/prepdata'\n", "\n", - "blob_green_data = DataReference(\n", - " datastore=default_store,\n", - " data_reference_name=\"green_taxi_data\",\n", - " path_on_datastore=\"green/part-00000\")\n", - "\n", "# rename columns as per Azure Machine Learning NYC Taxi tutorial\n", "green_columns = str({ \n", " \"vendorID\": \"vendor\",\n", @@ -332,7 +357,7 @@ "}).replace(\",\", \";\")\n", "\n", "# Define output after cleansing step\n", - "cleansed_green_data = PipelineData(\"green_taxi_data\", datastore=default_store)\n", + "cleansed_green_data = PipelineData(\"cleansed_green_data\", datastore=default_store).as_dataset()\n", "\n", "print('Cleanse script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n", "\n", @@ -341,11 +366,10 @@ "cleansingStepGreen = PythonScriptStep(\n", " name=\"Cleanse Green Taxi Data\",\n", " script_name=\"cleanse.py\", \n", - " arguments=[\"--input_cleanse\", blob_green_data, \n", - " \"--useful_columns\", useful_columns,\n", + " arguments=[\"--useful_columns\", useful_columns,\n", " \"--columns\", green_columns,\n", " \"--output_cleanse\", cleansed_green_data],\n", - " inputs=[blob_green_data],\n", + " inputs=[green_taxi_data.as_named_input('raw_data')],\n", " outputs=[cleansed_green_data],\n", " compute_target=aml_compute,\n", " runconfig=aml_run_config,\n", @@ -369,11 +393,6 @@ "metadata": {}, "outputs": [], "source": [ - "blob_yellow_data = DataReference(\n", - " datastore=default_store,\n", - " data_reference_name=\"yellow_taxi_data\",\n", - " path_on_datastore=\"yellow/part-00000\")\n", - "\n", "yellow_columns = str({\n", " \"vendorID\": \"vendor\",\n", " \"tpepPickupDateTime\": \"pickup_datetime\",\n", @@ -389,7 +408,7 @@ "}).replace(\",\", \";\")\n", "\n", "# Define output after cleansing step\n", - "cleansed_yellow_data = PipelineData(\"yellow_taxi_data\", datastore=default_store)\n", + "cleansed_yellow_data = PipelineData(\"cleansed_yellow_data\", datastore=default_store).as_dataset()\n", "\n", "print('Cleanse script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n", "\n", @@ -398,11 +417,10 @@ "cleansingStepYellow = PythonScriptStep(\n", " name=\"Cleanse Yellow Taxi Data\",\n", " script_name=\"cleanse.py\", \n", - " arguments=[\"--input_cleanse\", blob_yellow_data, \n", - " \"--useful_columns\", useful_columns,\n", + " arguments=[\"--useful_columns\", useful_columns,\n", " \"--columns\", yellow_columns,\n", " \"--output_cleanse\", cleansed_yellow_data],\n", - " inputs=[blob_yellow_data],\n", + " inputs=[yellow_taxi_data.as_named_input('raw_data')],\n", " outputs=[cleansed_yellow_data],\n", " compute_target=aml_compute,\n", " runconfig=aml_run_config,\n", @@ -428,7 +446,7 @@ "outputs": [], "source": [ "# Define output after merging step\n", - "merged_data = PipelineData(\"merged_data\", datastore=default_store)\n", + "merged_data = PipelineData(\"merged_data\", datastore=default_store).as_dataset()\n", "\n", "print('Merge 
script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n", "\n", @@ -437,10 +455,9 @@ "mergingStep = PythonScriptStep(\n", " name=\"Merge Taxi Data\",\n", " script_name=\"merge.py\", \n", - " arguments=[\"--input_green_merge\", cleansed_green_data, \n", - " \"--input_yellow_merge\", cleansed_yellow_data,\n", - " \"--output_merge\", merged_data],\n", - " inputs=[cleansed_green_data, cleansed_yellow_data],\n", + " arguments=[\"--output_merge\", merged_data],\n", + " inputs=[cleansed_green_data.parse_parquet_files(file_extension=None),\n", + " cleansed_yellow_data.parse_parquet_files(file_extension=None)],\n", " outputs=[merged_data],\n", " compute_target=aml_compute,\n", " runconfig=aml_run_config,\n", @@ -466,7 +483,7 @@ "outputs": [], "source": [ "# Define output after merging step\n", - "filtered_data = PipelineData(\"filtered_data\", datastore=default_store)\n", + "filtered_data = PipelineData(\"filtered_data\", datastore=default_store).as_dataset()\n", "\n", "print('Filter script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n", "\n", @@ -475,9 +492,8 @@ "filterStep = PythonScriptStep(\n", " name=\"Filter Taxi Data\",\n", " script_name=\"filter.py\", \n", - " arguments=[\"--input_filter\", merged_data, \n", - " \"--output_filter\", filtered_data],\n", - " inputs=[merged_data],\n", + " arguments=[\"--output_filter\", filtered_data],\n", + " inputs=[merged_data.parse_parquet_files(file_extension=None)],\n", " outputs=[filtered_data],\n", " compute_target=aml_compute,\n", " runconfig = aml_run_config,\n", @@ -503,7 +519,7 @@ "outputs": [], "source": [ "# Define output after normalize step\n", - "normalized_data = PipelineData(\"normalized_data\", datastore=default_store)\n", + "normalized_data = PipelineData(\"normalized_data\", datastore=default_store).as_dataset()\n", "\n", "print('Normalize script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n", "\n", @@ -512,9 +528,8 @@ "normalizeStep = PythonScriptStep(\n", " name=\"Normalize Taxi Data\",\n", " script_name=\"normalize.py\", \n", - " arguments=[\"--input_normalize\", filtered_data, \n", - " \"--output_normalize\", normalized_data],\n", - " inputs=[filtered_data],\n", + " arguments=[\"--output_normalize\", normalized_data],\n", + " inputs=[filtered_data.parse_parquet_files(file_extension=None)],\n", " outputs=[normalized_data],\n", " compute_target=aml_compute,\n", " runconfig = aml_run_config,\n", @@ -544,8 +559,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Define output after transforme step\n", - "transformed_data = PipelineData(\"transformed_data\", datastore=default_store)\n", + "# Define output after transform step\n", + "transformed_data = PipelineData(\"transformed_data\", datastore=default_store).as_dataset()\n", "\n", "print('Transform script is in {}.'.format(os.path.realpath(prepare_data_folder)))\n", "\n", @@ -554,9 +569,8 @@ "transformStep = PythonScriptStep(\n", " name=\"Transform Taxi Data\",\n", " script_name=\"transform.py\", \n", - " arguments=[\"--input_transform\", normalized_data,\n", - " \"--output_transform\", transformed_data],\n", - " inputs=[normalized_data],\n", + " arguments=[\"--output_transform\", transformed_data],\n", + " inputs=[normalized_data.parse_parquet_files(file_extension=None)],\n", " outputs=[transformed_data],\n", " compute_target=aml_compute,\n", " runconfig = aml_run_config,\n", @@ -571,8 +585,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Extract features\n", - "Add the following columns to be features for our model creation. 
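All of the preparation steps above share one wiring pattern: the producing step writes to a `PipelineData` promoted with `as_dataset()`, and the consuming step takes that same object back as a tabular input via `parse_parquet_files`. A stripped-down sketch of the pattern (step and script names are placeholders; `default_store`, `aml_compute`, `aml_run_config` and `prepare_data_folder` come from earlier cells):

```python
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep

# Intermediate output, promoted to a dataset so downstream steps can parse it
stage_out = PipelineData("stage_out", datastore=default_store).as_dataset()

producer = PythonScriptStep(
    name="Produce",
    script_name="produce.py",
    arguments=["--output", stage_out],
    outputs=[stage_out],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    source_directory=prepare_data_folder,
    allow_reuse=True)

# The consumer parses the parquet files the producer wrote; inside consume.py
# the data is available as run.input_datasets['stage_out']
consumer = PythonScriptStep(
    name="Consume",
    script_name="consume.py",
    inputs=[stage_out.parse_parquet_files(file_extension=None)],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    source_directory=prepare_data_folder,
    allow_reuse=True)
```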
The prediction value will be *cost*." + "### Split the data into train and test sets\n", + "This function segregates the data into dataset for model training and dataset for testing." ] }, { @@ -581,92 +595,11 @@ "metadata": {}, "outputs": [], "source": [ - "feature_columns = str(['pickup_weekday','pickup_hour', 'distance','passengers', 'vendor']).replace(\",\", \";\")\n", - "\n", "train_model_folder = './scripts/trainmodel'\n", "\n", - "print('Extract script is in {}.'.format(os.path.realpath(train_model_folder)))\n", - "\n", - "# features data after transform step\n", - "features_data = PipelineData(\"features_data\", datastore=default_store)\n", - "\n", - "# featurization step creation\n", - "# See the featurization.py for details about input and output\n", - "featurizationStep = PythonScriptStep(\n", - " name=\"Extract Features\",\n", - " script_name=\"featurization.py\", \n", - " arguments=[\"--input_featurization\", transformed_data, \n", - " \"--useful_columns\", feature_columns,\n", - " \"--output_featurization\", features_data],\n", - " inputs=[transformed_data],\n", - " outputs=[features_data],\n", - " compute_target=aml_compute,\n", - " runconfig = aml_run_config,\n", - " source_directory=train_model_folder,\n", - " allow_reuse=True\n", - ")\n", - "\n", - "print(\"featurizationStep created.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Extract label" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "label_columns = str(['cost']).replace(\",\", \";\")\n", - "\n", - "# label data after transform step\n", - "label_data = PipelineData(\"label_data\", datastore=default_store)\n", - "\n", - "print('Extract script is in {}.'.format(os.path.realpath(train_model_folder)))\n", - "\n", - "# label step creation\n", - "# See the featurization.py for details about input and output\n", - "labelStep = PythonScriptStep(\n", - " name=\"Extract Labels\",\n", - " script_name=\"featurization.py\", \n", - " arguments=[\"--input_featurization\", transformed_data, \n", - " \"--useful_columns\", label_columns,\n", - " \"--output_featurization\", label_data],\n", - " inputs=[transformed_data],\n", - " outputs=[label_data],\n", - " compute_target=aml_compute,\n", - " runconfig = aml_run_config,\n", - " source_directory=train_model_folder,\n", - " allow_reuse=True\n", - ")\n", - "\n", - "print(\"labelStep created.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Split the data into train and test sets\n", - "This function segregates the data into the **x**, features, dataset for model training and **y**, values to predict, dataset for testing." 
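The split step defined just below hands the whole transformed table to scikit-learn; as a toy illustration of the same call made inside `train_test_split.py` further down (the frame and its values are made up):

```python
import pandas as pd
from sklearn.model_selection import train_test_split

toy_df = pd.DataFrame({"distance": [1.2, 3.4, 0.8, 5.1, 2.0],
                       "cost": [6.5, 14.0, 5.0, 18.2, 8.1]})

# Same 80/20 split and seed that the pipeline's split script uses
train_df, test_df = train_test_split(toy_df, test_size=0.2, random_state=223)
```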
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ "# train and test splits output\n", - "output_split_train_x = PipelineData(\"output_split_train_x\", datastore=default_store)\n", - "output_split_train_y = PipelineData(\"output_split_train_y\", datastore=default_store)\n", - "output_split_test_x = PipelineData(\"output_split_test_x\", datastore=default_store)\n", - "output_split_test_y = PipelineData(\"output_split_test_y\", datastore=default_store)\n", + "output_split_train = PipelineData(\"output_split_train\", datastore=default_store).as_dataset()\n", + "output_split_test = PipelineData(\"output_split_test\", datastore=default_store).as_dataset()\n", "\n", "print('Data spilt script is in {}.'.format(os.path.realpath(train_model_folder)))\n", "\n", @@ -675,14 +608,10 @@ "testTrainSplitStep = PythonScriptStep(\n", " name=\"Train Test Data Split\",\n", " script_name=\"train_test_split.py\", \n", - " arguments=[\"--input_split_features\", features_data, \n", - " \"--input_split_labels\", label_data,\n", - " \"--output_split_train_x\", output_split_train_x,\n", - " \"--output_split_train_y\", output_split_train_y,\n", - " \"--output_split_test_x\", output_split_test_x,\n", - " \"--output_split_test_y\", output_split_test_y],\n", - " inputs=[features_data, label_data],\n", - " outputs=[output_split_train_x, output_split_train_y, output_split_test_x, output_split_test_y],\n", + " arguments=[\"--output_split_train\", output_split_train,\n", + " \"--output_split_test\", output_split_test],\n", + " inputs=[transformed_data.parse_parquet_files(file_extension=None)],\n", + " outputs=[output_split_train, output_split_test],\n", " compute_target=aml_compute,\n", " runconfig = aml_run_config,\n", " source_directory=train_model_folder,\n", @@ -697,7 +626,7 @@ "metadata": {}, "source": [ "## Use automated machine learning to build regression model\n", - "Now we will use **automated machine learning** to build the regression model. We will use [AutoMLStep](https://docs.microsoft.com/en-us/python/api/azureml-train-automl/azureml.train.automl.automlstep?view=azure-ml-py) in AML Pipelines for this part. These functions use various features from the data set and allow an automated model to build relationships between the features and the price of a taxi trip." + "Now we will use **automated machine learning** to build the regression model. We will use [AutoMLStep](https://docs.microsoft.com/python/api/azureml-train-automl-runtime/azureml.train.automl.runtime.automl_step.automlstep?view=azure-ml-py) in AML Pipelines for this part. Perform `pip install azureml-sdk[automl]`to get the automated machine learning package. These functions use various features from the data set and allow an automated model to build relationships between the features and the price of a taxi trip." ] }, { @@ -727,52 +656,13 @@ "print(\"Experiment created\")" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Create get_data script\n", - "\n", - "A script with `get_data()` function is necessary to fetch training features(X) and labels(Y) on remote compute, from input data. Here we use mounted path of `train_test_split` step to get the x and y train values. 
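Once the preparation steps above and the AutoML step defined below exist, they are typically stitched into a single `Pipeline` and submitted against the experiment created above. A hedged sketch of that submission (step ordering is inferred from the data dependencies, so listing the final training step is usually enough):

```python
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# The AutoML training step transitively pulls in the data-preparation steps
pipeline = Pipeline(workspace=ws, steps=[trainWithAutomlStep])
pipeline_run = experiment.submit(pipeline)

# Monitor progress from the notebook, then block until the run finishes
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion()
```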
They are added as environment variable on compute machine by default\n", - "\n", - "Note: Every DataReference are added as environment variable on compute machine since the defualt mode is mount" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print('get_data.py will be written to {}.'.format(os.path.realpath(train_model_folder)))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%writefile $train_model_folder/get_data.py\n", - "import os\n", - "import pandas as pd\n", - "\n", - "def get_data():\n", - " print(\"In get_data\")\n", - " print(os.environ['AZUREML_DATAREFERENCE_output_split_train_x'])\n", - " X_train = pd.read_csv(os.environ['AZUREML_DATAREFERENCE_output_split_train_x'] + \"/part-00000\", header=0)\n", - " y_train = pd.read_csv(os.environ['AZUREML_DATAREFERENCE_output_split_train_y'] + \"/part-00000\", header=0)\n", - " \n", - " return { \"X\" : X_train.values, \"y\" : y_train.values.flatten() }" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Define settings for autogeneration and tuning\n", "\n", - "Here we define the experiment parameter and model settings for autogeneration and tuning. We can specify automl_settings as **kwargs as well. Also note that we have to use a get_data() function for remote excutions. See get_data script for more details.\n", + "Here we define the experiment parameter and model settings for autogeneration and tuning. We can specify automl_settings as **kwargs as well.\n", "\n", "Use your defined training settings as a parameter to an `AutoMLConfig` object. Additionally, specify your training data and the type of model, which is `regression` in this case.\n", "\n", @@ -793,17 +683,20 @@ " \"iteration_timeout_minutes\" : 10,\n", " \"iterations\" : 2,\n", " \"primary_metric\" : 'spearman_correlation',\n", - " \"preprocess\" : True,\n", - " \"verbosity\" : logging.INFO,\n", " \"n_cross_validations\": 5\n", "}\n", "\n", + "train_X = output_split_train.parse_parquet_files(file_extension=None).keep_columns(['pickup_weekday','pickup_hour', 'distance','passengers', 'vendor'])\n", + "train_y = output_split_train.parse_parquet_files(file_extension=None).keep_columns('cost')\n", + "\n", "automl_config = AutoMLConfig(task = 'regression',\n", " debug_log = 'automated_ml_errors.log',\n", " path = train_model_folder,\n", - " compute_target=aml_compute,\n", - " run_configuration=aml_run_config,\n", - " data_script = train_model_folder + \"/get_data.py\",\n", + " compute_target = aml_compute,\n", + " run_configuration = aml_run_config,\n", + " featurization = 'auto',\n", + " X = train_X,\n", + " y = train_y,\n", " **automl_settings)\n", " \n", "print(\"AutoML config created.\")" @@ -822,15 +715,12 @@ "metadata": {}, "outputs": [], "source": [ - "from azureml.train.automl.runtime import AutoMLStep\n", - "\n", - "trainWithAutomlStep = AutoMLStep(\n", - " name='AutoML_Regression',\n", - " automl_config=automl_config,\n", - " inputs=[output_split_train_x, output_split_train_y],\n", - " allow_reuse=True,\n", - " hash_paths=[os.path.realpath(train_model_folder)])\n", + "from azureml.pipeline.steps import AutoMLStep\n", "\n", + "trainWithAutomlStep = AutoMLStep(name='AutoML_Regression',\n", + " automl_config=automl_config,\n", + " passthru_automl_config=False,\n", + " allow_reuse=True)\n", "print(\"trainWithAutomlStep created.\")" ] }, @@ -892,12 +782,11 @@ " return path\n", "\n", "def fetch_df(step, output_name):\n", - 
" output_data = step.get_output_data(output_name)\n", - " \n", + " output_data = step.get_output_data(output_name) \n", " download_path = './outputs/' + output_name\n", - " output_data.download(download_path)\n", - " df_path = get_download_path(download_path, output_name) + '/part-00000'\n", - " return dprep.auto_read_file(path=df_path)" + " output_data.download(download_path, overwrite=True)\n", + " df_path = get_download_path(download_path, output_name) + '/processed.parquet'\n", + " return pd.read_parquet(df_path)" ] }, { @@ -939,7 +828,7 @@ "merge_step = pipeline_run.find_step_run(mergingStep.name)[0]\n", "combined_df = fetch_df(merge_step, merged_data.name)\n", "\n", - "display(combined_df.get_profile())" + "display(combined_df.describe())" ] }, { @@ -958,7 +847,7 @@ "filter_step = pipeline_run.find_step_run(filterStep.name)[0]\n", "filtered_df = fetch_df(filter_step, filtered_data.name)\n", "\n", - "display(filtered_df.get_profile())" + "display(filtered_df.describe())" ] }, { @@ -996,7 +885,7 @@ "transform_step = pipeline_run.find_step_run(transformStep.name)[0]\n", "transformed_df = fetch_df(transform_step, transformed_data.name)\n", "\n", - "display(transformed_df.get_profile())\n", + "display(transformed_df.describe())\n", "display(transformed_df.head(5))" ] }, @@ -1014,16 +903,10 @@ "outputs": [], "source": [ "split_step = pipeline_run.find_step_run(testTrainSplitStep.name)[0]\n", - "train_split_x = fetch_df(split_step, output_split_train_x.name)\n", - "train_split_y = fetch_df(split_step, output_split_train_y.name)\n", - "\n", - "display_x_train = train_split_x.keep_columns(columns=[\"vendor\", \"pickup_weekday\", \"pickup_hour\", \"passengers\", \"distance\"])\n", - "display_y_train = train_split_y.rename_columns(column_pairs={\"Column1\": \"cost\"})\n", + "train_split = fetch_df(split_step, output_split_train.name)\n", "\n", - "display(display_x_train.get_profile())\n", - "display(display_x_train.head(5))\n", - "display(display_y_train.get_profile())\n", - "display(display_y_train.head(5))" + "display(train_split.describe())\n", + "display(train_split.head(5))" ] }, { @@ -1125,14 +1008,11 @@ "source": [ "# split_step = pipeline_run.find_step_run(testTrainSplitStep.name)[0]\n", "\n", - "# x_test = fetch_df(split_step, output_split_test_x.name)\n", - "# y_test = fetch_df(split_step, output_split_test_y.name)\n", - "\n", - "# display(x_test.keep_columns(columns=[\"vendor\", \"pickup_weekday\", \"pickup_hour\", \"passengers\", \"distance\"]).head(5))\n", - "# display(y_test.rename_columns(column_pairs={\"Column1\": \"cost\"}).head(5))\n", + "# x_test = fetch_df(split_step, output_split_test.name)[['distance','passengers', 'vendor','pickup_weekday','pickup_hour']]\n", + "# y_test = fetch_df(split_step, output_split_test.name)[['cost']]\n", "\n", - "# x_test = x_test.to_pandas_dataframe()\n", - "# y_test = y_test.to_pandas_dataframe()" + "# display(x_test.head(5))\n", + "# display(y_test.head(5))" ] }, { @@ -1150,9 +1030,9 @@ "metadata": {}, "outputs": [], "source": [ - "# y_predict = fitted_model.predict(x_test.values)\n", + "# y_predict = fitted_model.predict(x_test)\n", "\n", - "# y_actual = y_test.iloc[:,0].values.tolist()\n", + "# y_actual = y_test.values.tolist()\n", "\n", "# display(pd.DataFrame({'Actual':y_actual, 'Predicted':y_predict}).head(5))" ] @@ -1168,7 +1048,7 @@ "# fig = plt.figure(figsize=(14, 10))\n", "# ax1 = fig.add_subplot(111)\n", "\n", - "# distance_vals = [x[4] for x in x_test.values]\n", + "# distance_vals = [x[0] for x in x_test.values]\n", "\n", "# 
ax1.scatter(distance_vals[:100], y_predict[:100], s=18, c='b', marker=\"s\", label='Predicted')\n", "# ax1.scatter(distance_vals[:100], y_actual[:100], s=18, c='r', marker=\"o\", label='Actual')\n", @@ -1204,7 +1084,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.7" + "version": "3.6.9" } }, "nbformat": 4, diff --git a/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/nyc-taxi-data-regression-model-building.yml b/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/nyc-taxi-data-regression-model-building.yml index dcdee6963..12b58a211 100644 --- a/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/nyc-taxi-data-regression-model-building.yml +++ b/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/nyc-taxi-data-regression-model-building.yml @@ -4,6 +4,7 @@ dependencies: - azureml-sdk - azureml-widgets - azureml-opendatasets - - azureml-dataprep - azureml-train-automl - matplotlib + - pandas + - pyarrow diff --git a/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/prepdata/cleanse.py b/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/prepdata/cleanse.py index 0b8c4143a..bae27e828 100644 --- a/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/prepdata/cleanse.py +++ b/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/prepdata/cleanse.py @@ -3,15 +3,14 @@ import argparse import os -import pandas as pd -import azureml.dataprep as dprep +from azureml.core import Run def get_dict(dict_str): pairs = dict_str.strip("{}").split("\;") new_dict = {} for pair in pairs: - key, value = pair.strip('\\').split(":") + key, value = pair.strip().split(":") new_dict[key.strip().strip("'")] = value.strip().strip("'") return new_dict @@ -19,40 +18,37 @@ def get_dict(dict_str): print("Cleans the input data") +# Get the input green_taxi_data. To learn more about how to access dataset in your script, please +# see https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-with-datasets. 
+run = Run.get_context() +raw_data = run.input_datasets["raw_data"] + + parser = argparse.ArgumentParser("cleanse") -parser.add_argument("--input_cleanse", type=str, help="raw taxi data") parser.add_argument("--output_cleanse", type=str, help="cleaned taxi data directory") parser.add_argument("--useful_columns", type=str, help="useful columns to keep") parser.add_argument("--columns", type=str, help="rename column pattern") args = parser.parse_args() -print("Argument 1(input taxi data path): %s" % args.input_cleanse) -print("Argument 2(columns to keep): %s" % str(args.useful_columns.strip("[]").split("\;"))) -print("Argument 3(columns renaming mapping): %s" % str(args.columns.strip("{}").split("\;"))) -print("Argument 4(output cleansed taxi data path): %s" % args.output_cleanse) - -raw_df = dprep.read_csv(path=args.input_cleanse, header=dprep.PromoteHeadersMode.GROUPED) +print("Argument 1(columns to keep): %s" % str(args.useful_columns.strip("[]").split("\;"))) +print("Argument 2(columns renaming mapping): %s" % str(args.columns.strip("{}").split("\;"))) +print("Argument 3(output cleansed taxi data path): %s" % args.output_cleanse) -# These functions ensure that null data is removed from the data set, +# These functions ensure that null data is removed from the dataset, # which will help increase machine learning model accuracy. -# Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-data-prep -# for more details useful_columns = [s.strip().strip("'") for s in args.useful_columns.strip("[]").split("\;")] columns = get_dict(args.columns) -all_columns = dprep.ColumnSelector(term=".*", use_regex=True) -drop_if_all_null = [all_columns, dprep.ColumnRelationship(dprep.ColumnRelationship.ALL)] +new_df = (raw_data.to_pandas_dataframe() + .dropna(how='all') + .rename(columns=columns))[useful_columns] -new_df = (raw_df - .replace_na(columns=all_columns) - .drop_nulls(*drop_if_all_null) - .rename_columns(column_pairs=columns) - .keep_columns(columns=useful_columns)) +new_df.reset_index(inplace=True, drop=True) if not (args.output_cleanse is None): os.makedirs(args.output_cleanse, exist_ok=True) print("%s created" % args.output_cleanse) - write_df = new_df.write_to_csv(directory_path=dprep.LocalFileOutput(args.output_cleanse)) - write_df.run_local() + path = args.output_cleanse + "/processed.parquet" + write_df = new_df.to_parquet(path) diff --git a/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/prepdata/filter.py b/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/prepdata/filter.py index a72481859..a999c54ec 100644 --- a/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/prepdata/filter.py +++ b/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/prepdata/filter.py @@ -1,55 +1,47 @@ import argparse import os -import azureml.dataprep as dprep +from azureml.core import Run print("Filters out coordinates for locations that are outside the city border.", "Chain the column filter commands within the filter() function", "and define the minimum and maximum bounds for each field.") +run = Run.get_context() + +# To learn more about how to access dataset in your script, please +# see https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-with-datasets. 
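Each of the rewritten preparation scripts (cleanse, merge, filter, normalize, transform) follows the same contract: read the named tabular input handed over by the pipeline, transform it with pandas, and write a single `processed.parquet` into the step's output folder. A condensed template of that contract (the input name and the transformation are placeholders):

```python
import argparse
import os

from azureml.core import Run

# Named tabular input wired up in the step definition via as_named_input()
# or parse_parquet_files(); the key must match that name.
run = Run.get_context()
df = run.input_datasets["some_named_input"].to_pandas_dataframe()

parser = argparse.ArgumentParser("step")
parser.add_argument("--output", type=str, help="output folder for this step")
args = parser.parse_args()

# ... pandas transformations on df go here ...

if args.output is not None:
    os.makedirs(args.output, exist_ok=True)
    # Single parquet file that the next step (or fetch_df) reads back
    df.to_parquet(args.output + "/processed.parquet")
```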
+merged_data = run.input_datasets["merged_data"] +combined_df = merged_data.to_pandas_dataframe() + parser = argparse.ArgumentParser("filter") -parser.add_argument("--input_filter", type=str, help="merged taxi data directory") parser.add_argument("--output_filter", type=str, help="filter out out of city locations") args = parser.parse_args() -print("Argument 1(input taxi data path): %s" % args.input_filter) -print("Argument 2(output filtered taxi data path): %s" % args.output_filter) - -combined_df = dprep.read_csv(args.input_filter + '/part-*') +print("Argument (output filtered taxi data path): %s" % args.output_filter) # These functions filter out coordinates for locations that are outside the city border. -# Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-data-prep for more details - -# Create a condensed view of the dataflow to just show the lat/long fields, -# which makes it easier to evaluate missing or out-of-scope coordinates -decimal_type = dprep.TypeConverter(data_type=dprep.FieldType.DECIMAL) -combined_df = combined_df.set_column_types(type_conversions={ - "pickup_longitude": decimal_type, - "pickup_latitude": decimal_type, - "dropoff_longitude": decimal_type, - "dropoff_latitude": decimal_type -}) # Filter out coordinates for locations that are outside the city border. # Chain the column filter commands within the filter() function # and define the minimum and maximum bounds for each field -latlong_filtered_df = (combined_df - .drop_nulls(columns=["pickup_longitude", - "pickup_latitude", - "dropoff_longitude", - "dropoff_latitude"], - column_relationship=dprep.ColumnRelationship(dprep.ColumnRelationship.ANY)) - .filter(dprep.f_and(dprep.col("pickup_longitude") <= -73.72, - dprep.col("pickup_longitude") >= -74.09, - dprep.col("pickup_latitude") <= 40.88, - dprep.col("pickup_latitude") >= 40.53, - dprep.col("dropoff_longitude") <= -73.72, - dprep.col("dropoff_longitude") >= -74.09, - dprep.col("dropoff_latitude") <= 40.88, - dprep.col("dropoff_latitude") >= 40.53))) + +combined_df = combined_df.astype({"pickup_longitude": 'float64', "pickup_latitude": 'float64', + "dropoff_longitude": 'float64', "dropoff_latitude": 'float64'}) + +latlong_filtered_df = combined_df[(combined_df.pickup_longitude <= -73.72) & + (combined_df.pickup_longitude >= -74.09) & + (combined_df.pickup_latitude <= 40.88) & + (combined_df.pickup_latitude >= 40.53) & + (combined_df.dropoff_longitude <= -73.72) & + (combined_df.dropoff_longitude >= -74.72) & + (combined_df.dropoff_latitude <= 40.88) & + (combined_df.dropoff_latitude >= 40.53)] + +latlong_filtered_df.reset_index(inplace=True, drop=True) if not (args.output_filter is None): os.makedirs(args.output_filter, exist_ok=True) print("%s created" % args.output_filter) - write_df = latlong_filtered_df.write_to_csv(directory_path=dprep.LocalFileOutput(args.output_filter)) - write_df.run_local() + path = args.output_filter + "/processed.parquet" + write_df = latlong_filtered_df.to_parquet(path) diff --git a/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/prepdata/merge.py b/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/prepdata/merge.py index 4764023aa..bf3c8d936 100644 --- a/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/prepdata/merge.py +++ b/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/prepdata/merge.py @@ -1,29 +1,30 @@ - import argparse 
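For reference, the dprep filter being removed above used the same bounds for both ends of the trip: longitude in [-74.09, -73.72] and latitude in [40.53, 40.88]. Written as a single pandas mask over the `combined_df` defined in that script, the same filter reads:

```python
# NYC bounding box from the original dprep filter, applied to both the
# pickup and the dropoff coordinates (between() is inclusive).
in_city = (
    combined_df.pickup_longitude.between(-74.09, -73.72) &
    combined_df.pickup_latitude.between(40.53, 40.88) &
    combined_df.dropoff_longitude.between(-74.09, -73.72) &
    combined_df.dropoff_latitude.between(40.53, 40.88)
)
latlong_filtered_df = combined_df[in_city].reset_index(drop=True)
```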
import os -import azureml.dataprep as dprep +from azureml.core import Run print("Merge Green and Yellow taxi data") +run = Run.get_context() + +# To learn more about how to access dataset in your script, please +# see https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-with-datasets. +cleansed_green_data = run.input_datasets["cleansed_green_data"] +cleansed_yellow_data = run.input_datasets["cleansed_yellow_data"] +green_df = cleansed_green_data.to_pandas_dataframe() +yellow_df = cleansed_yellow_data.to_pandas_dataframe() + parser = argparse.ArgumentParser("merge") -parser.add_argument("--input_green_merge", type=str, help="cleaned green taxi data directory") -parser.add_argument("--input_yellow_merge", type=str, help="cleaned yellow taxi data directory") parser.add_argument("--output_merge", type=str, help="green and yellow taxi data merged") args = parser.parse_args() - -print("Argument 1(input green taxi data path): %s" % args.input_green_merge) -print("Argument 2(input yellow taxi data path): %s" % args.input_yellow_merge) -print("Argument 3(output merge taxi data path): %s" % args.output_merge) - -green_df = dprep.read_csv(args.input_green_merge + '/part-*') -yellow_df = dprep.read_csv(args.input_yellow_merge + '/part-*') +print("Argument (output merge taxi data path): %s" % args.output_merge) # Appending yellow data to green data -combined_df = green_df.append_rows([yellow_df]) +combined_df = green_df.append(yellow_df, ignore_index=True) +combined_df.reset_index(inplace=True, drop=True) if not (args.output_merge is None): os.makedirs(args.output_merge, exist_ok=True) print("%s created" % args.output_merge) - write_df = combined_df.write_to_csv(directory_path=dprep.LocalFileOutput(args.output_merge)) - write_df.run_local() + path = args.output_merge + "/processed.parquet" + write_df = combined_df.to_parquet(path) diff --git a/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/prepdata/normalize.py b/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/prepdata/normalize.py index f7b384d12..589fd2976 100644 --- a/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/prepdata/normalize.py +++ b/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/prepdata/normalize.py @@ -1,47 +1,48 @@ import argparse import os -import azureml.dataprep as dprep +import pandas as pd +from azureml.core import Run print("Replace undefined values to relavant values and rename columns to meaningful names") +run = Run.get_context() + +# To learn more about how to access dataset in your script, please +# see https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-with-datasets. 
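One caveat on the merge step's `append` call above: `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0, so on newer pandas the same concatenation is written with `pd.concat` (using the names from merge.py):

```python
import pandas as pd

# Equivalent to green_df.append(yellow_df, ignore_index=True)
combined_df = pd.concat([green_df, yellow_df], ignore_index=True)
```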
+filtered_data = run.input_datasets['filtered_data'] +combined_converted_df = filtered_data.to_pandas_dataframe() + parser = argparse.ArgumentParser("normalize") -parser.add_argument("--input_normalize", type=str, help="combined and converted taxi data") parser.add_argument("--output_normalize", type=str, help="replaced undefined values and renamed columns") args = parser.parse_args() -print("Argument 1(input taxi data path): %s" % args.input_normalize) -print("Argument 2(output normalized taxi data path): %s" % args.output_normalize) - -combined_converted_df = dprep.read_csv(args.input_normalize + '/part-*') +print("Argument (output normalized taxi data path): %s" % args.output_normalize) # These functions replace undefined values and rename to use meaningful names. -# Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-data-prep for more details +replaced_stfor_vals_df = (combined_converted_df.replace({"store_forward": "0"}, {"store_forward": "N"}) + .fillna({"store_forward": "N"})) + +replaced_distance_vals_df = (replaced_stfor_vals_df.replace({"distance": ".00"}, {"distance": 0}) + .fillna({"distance": 0})) -replaced_stfor_vals_df = combined_converted_df.replace(columns="store_forward", - find="0", - replace_with="N").fill_nulls("store_forward", "N") +normalized_df = replaced_distance_vals_df.astype({"distance": 'float64'}) -replaced_distance_vals_df = replaced_stfor_vals_df.replace(columns="distance", - find=".00", - replace_with=0).fill_nulls("distance", 0) +temp = pd.DatetimeIndex(normalized_df["pickup_datetime"]) +normalized_df["pickup_date"] = temp.date +normalized_df["pickup_time"] = temp.time -replaced_distance_vals_df = replaced_distance_vals_df.to_number(["distance"]) +temp = pd.DatetimeIndex(normalized_df["dropoff_datetime"]) +normalized_df["dropoff_date"] = temp.date +normalized_df["dropoff_time"] = temp.time -time_split_df = (replaced_distance_vals_df - .split_column_by_example(source_column="pickup_datetime") - .split_column_by_example(source_column="dropoff_datetime")) +del normalized_df["pickup_datetime"] +del normalized_df["dropoff_datetime"] -# Split the pickup and dropoff datetime values into the respective date and time columns -renamed_col_df = (time_split_df - .rename_columns(column_pairs={ - "pickup_datetime_1": "pickup_date", - "pickup_datetime_2": "pickup_time", - "dropoff_datetime_1": "dropoff_date", - "dropoff_datetime_2": "dropoff_time"})) +normalized_df.reset_index(inplace=True, drop=True) if not (args.output_normalize is None): os.makedirs(args.output_normalize, exist_ok=True) print("%s created" % args.output_normalize) - write_df = renamed_col_df.write_to_csv(directory_path=dprep.LocalFileOutput(args.output_normalize)) - write_df.run_local() + path = args.output_normalize + "/processed.parquet" + write_df = normalized_df.to_parquet(path) diff --git a/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/prepdata/transform.py b/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/prepdata/transform.py index c2ac6e95e..5584d6aba 100644 --- a/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/prepdata/transform.py +++ b/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/prepdata/transform.py @@ -1,22 +1,24 @@ import argparse import os -import azureml.dataprep as dprep +from azureml.core import Run print("Transforms the renamed taxi data to the required format") +run = 
Run.get_context() + +# To learn more about how to access dataset in your script, please +# see https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-with-datasets. +normalized_data = run.input_datasets['normalized_data'] +normalized_df = normalized_data.to_pandas_dataframe() + parser = argparse.ArgumentParser("transform") -parser.add_argument("--input_transform", type=str, help="renamed taxi data") parser.add_argument("--output_transform", type=str, help="transformed taxi data") args = parser.parse_args() -print("Argument 1(input taxi data path): %s" % args.input_transform) print("Argument 2(output final transformed taxi data): %s" % args.output_transform) -renamed_df = dprep.read_csv(args.input_transform + '/part-*') - # These functions transform the renamed data to be used finally for training. -# Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-data-prep for more details # Split the pickup and dropoff date further into the day of the week, day of the month, and month values. # To get the day of the week value, use the derive_column_by_example() function. @@ -27,62 +29,46 @@ # use the drop_columns() function to delete the original fields as the newly generated features are preferred. # Rename the rest of the fields to use meaningful descriptions. -transformed_features_df = (renamed_df - .derive_column_by_example( - source_columns="pickup_date", - new_column_name="pickup_weekday", - example_data=[("2009-01-04", "Sunday"), ("2013-08-22", "Thursday")]) - .derive_column_by_example( - source_columns="dropoff_date", - new_column_name="dropoff_weekday", - example_data=[("2013-08-22", "Thursday"), ("2013-11-03", "Sunday")]) - - .split_column_by_example(source_column="pickup_time") - .split_column_by_example(source_column="dropoff_time") - - .split_column_by_example(source_column="pickup_time_1") - .split_column_by_example(source_column="dropoff_time_1") - .drop_columns(columns=[ - "pickup_date", "pickup_time", "dropoff_date", "dropoff_time", - "pickup_date_1", "dropoff_date_1", "pickup_time_1", "dropoff_time_1"]) - - .rename_columns(column_pairs={ - "pickup_date_2": "pickup_month", - "pickup_date_3": "pickup_monthday", - "pickup_time_1_1": "pickup_hour", - "pickup_time_1_2": "pickup_minute", - "pickup_time_2": "pickup_second", - "dropoff_date_2": "dropoff_month", - "dropoff_date_3": "dropoff_monthday", - "dropoff_time_1_1": "dropoff_hour", - "dropoff_time_1_2": "dropoff_minute", - "dropoff_time_2": "dropoff_second"})) - -# Drop the pickup_datetime and dropoff_datetime columns because they're -# no longer needed (granular time features like hour, -# minute and second are more useful for model training). -processed_df = transformed_features_df.drop_columns(columns=["pickup_datetime", "dropoff_datetime"]) +normalized_df = normalized_df.astype({"pickup_date": 'datetime64', "dropoff_date": 'datetime64', + "pickup_time": 'datetime64', "dropoff_time": 'datetime64', + "distance": 'float64', "cost": 'float64'}) -# Use the type inference functionality to automatically check the data type of each field, -# and display the inference results. -type_infer = processed_df.builders.set_column_types() -type_infer.learn() +normalized_df["pickup_weekday"] = normalized_df["pickup_date"].dt.dayofweek +normalized_df["pickup_month"] = normalized_df["pickup_date"].dt.month +normalized_df["pickup_monthday"] = normalized_df["pickup_date"].dt.day -# The inference results look correct based on the data. Now apply the type conversions to the dataflow. 
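A portability note on the conversions above: unit-less `astype('datetime64')` is rejected by recent pandas releases, so the date parsing is more robustly expressed with `pd.to_datetime` before deriving the `.dt` features (a sketch on a toy frame):

```python
import pandas as pd

toy = pd.DataFrame({"pickup_date": ["2013-08-22", "2013-11-03"]})

# Explicit parse instead of astype('datetime64'); errors='coerce' maps
# unparseable values to NaT instead of raising.
toy["pickup_date"] = pd.to_datetime(toy["pickup_date"], errors="coerce")
toy["pickup_weekday"] = toy["pickup_date"].dt.dayofweek
toy["pickup_month"] = toy["pickup_date"].dt.month
toy["pickup_monthday"] = toy["pickup_date"].dt.day
```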
-type_converted_df = type_infer.to_dataflow() +normalized_df["dropoff_weekday"] = normalized_df["dropoff_date"].dt.dayofweek +normalized_df["dropoff_month"] = normalized_df["dropoff_date"].dt.month +normalized_df["dropoff_monthday"] = normalized_df["dropoff_date"].dt.day + +normalized_df["pickup_hour"] = normalized_df["pickup_time"].dt.hour +normalized_df["pickup_minute"] = normalized_df["pickup_time"].dt.minute +normalized_df["pickup_second"] = normalized_df["pickup_time"].dt.second + +normalized_df["dropoff_hour"] = normalized_df["dropoff_time"].dt.hour +normalized_df["dropoff_minute"] = normalized_df["dropoff_time"].dt.minute +normalized_df["dropoff_second"] = normalized_df["dropoff_time"].dt.second + +# Drop the pickup_date, dropoff_date, pickup_time, dropoff_time columns because they're +# no longer needed (granular time features like hour, +# minute and second are more useful for model training). +del normalized_df["pickup_date"] +del normalized_df["dropoff_date"] +del normalized_df["pickup_time"] +del normalized_df["dropoff_time"] -# Before you package the dataflow, run two final filters on the data set. +# Before you package the dataset, run two final filters on the dataset. # To eliminate incorrectly captured data points, -# filter the dataflow on records where both the cost and distance variable values are greater than zero. +# filter the dataset on records where both the cost and distance variable values are greater than zero. # This step will significantly improve machine learning model accuracy, # because data points with a zero cost or distance represent major outliers that throw off prediction accuracy. -final_df = type_converted_df.filter(dprep.col("distance") > 0) -final_df = final_df.filter(dprep.col("cost") > 0) +final_df = normalized_df[(normalized_df.distance > 0) & (normalized_df.cost > 0)] +final_df.reset_index(inplace=True, drop=True) # Writing the final dataframe to use for training in the following steps if not (args.output_transform is None): os.makedirs(args.output_transform, exist_ok=True) print("%s created" % args.output_transform) - write_df = final_df.write_to_csv(directory_path=dprep.LocalFileOutput(args.output_transform)) - write_df.run_local() + path = args.output_transform + "/processed.parquet" + write_df = final_df.to_parquet(path) diff --git a/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/trainmodel/featurization.py b/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/trainmodel/featurization.py deleted file mode 100644 index bcf2338af..000000000 --- a/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/trainmodel/featurization.py +++ /dev/null @@ -1,31 +0,0 @@ -import argparse -import os -import azureml.dataprep as dprep -import azureml.core - -print("Extracts important features from prepared data") - -parser = argparse.ArgumentParser("featurization") -parser.add_argument("--input_featurization", type=str, help="input featurization") -parser.add_argument("--useful_columns", type=str, help="columns to use") -parser.add_argument("--output_featurization", type=str, help="output featurization") - -args = parser.parse_args() - -print("Argument 1(input training data path): %s" % args.input_featurization) -print("Argument 2(column features to use): %s" % str(args.useful_columns.strip("[]").split("\;"))) -print("Argument 3:(output featurized training data path) %s" % args.output_featurization) - -dflow_prepared = 
dprep.read_csv(args.input_featurization + '/part-*') - -# These functions extracts useful features for training -# Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-auto-train-models for more detail - -useful_columns = [s.strip().strip("'") for s in args.useful_columns.strip("[]").split("\;")] -dflow = dflow_prepared.keep_columns(useful_columns) - -if not (args.output_featurization is None): - os.makedirs(args.output_featurization, exist_ok=True) - print("%s created" % args.output_featurization) - write_df = dflow.write_to_csv(directory_path=dprep.LocalFileOutput(args.output_featurization)) - write_df.run_local() diff --git a/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/trainmodel/get_data.py b/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/trainmodel/get_data.py deleted file mode 100644 index 6472e46a2..000000000 --- a/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/trainmodel/get_data.py +++ /dev/null @@ -1,12 +0,0 @@ - -import os -import pandas as pd - - -def get_data(): - print("In get_data") - print(os.environ['AZUREML_DATAREFERENCE_output_split_train_x']) - X_train = pd.read_csv(os.environ['AZUREML_DATAREFERENCE_output_split_train_x'] + "/part-00000", header=0) - y_train = pd.read_csv(os.environ['AZUREML_DATAREFERENCE_output_split_train_y'] + "/part-00000", header=0) - - return {"X": X_train.values, "y": y_train.values.flatten()} diff --git a/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/trainmodel/train_test_split.py b/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/trainmodel/train_test_split.py index cdc80b619..48571e64f 100644 --- a/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/trainmodel/train_test_split.py +++ b/how-to-use-azureml/machine-learning-pipelines/nyc-taxi-data-regression-model-building/scripts/trainmodel/train_test_split.py @@ -1,48 +1,38 @@ import argparse import os -import azureml.dataprep as dprep import azureml.core +from azureml.core import Run from sklearn.model_selection import train_test_split def write_output(df, path): os.makedirs(path, exist_ok=True) print("%s created" % path) - df.to_csv(path + "/part-00000", index=False) + df.to_parquet(path + "/processed.parquet") print("Split the data into train and test") +run = Run.get_context() +transformed_data = run.input_datasets['transformed_data'] +transformed_df = transformed_data.to_pandas_dataframe() parser = argparse.ArgumentParser("split") -parser.add_argument("--input_split_features", type=str, help="input split features") -parser.add_argument("--input_split_labels", type=str, help="input split labels") -parser.add_argument("--output_split_train_x", type=str, help="output split train features") -parser.add_argument("--output_split_train_y", type=str, help="output split train labels") -parser.add_argument("--output_split_test_x", type=str, help="output split test features") -parser.add_argument("--output_split_test_y", type=str, help="output split test labels") +parser.add_argument("--output_split_train", type=str, help="output split train data") +parser.add_argument("--output_split_test", type=str, help="output split test data") args = parser.parse_args() -print("Argument 1(input taxi data features path): %s" % args.input_split_features) -print("Argument 2(input taxi data labels path): %s" % 
args.input_split_labels) -print("Argument 3(output training features split path): %s" % args.output_split_train_x) -print("Argument 4(output training labels split path): %s" % args.output_split_train_y) -print("Argument 5(output test features split path): %s" % args.output_split_test_x) -print("Argument 6(output test labels split path): %s" % args.output_split_test_y) - -x_df = dprep.read_csv(path=args.input_split_features, header=dprep.PromoteHeadersMode.GROUPED).to_pandas_dataframe() -y_df = dprep.read_csv(path=args.input_split_labels, header=dprep.PromoteHeadersMode.GROUPED).to_pandas_dataframe() +print("Argument 1(output training data split path): %s" % args.output_split_train) +print("Argument 2(output test data split path): %s" % args.output_split_test) # These functions splits the input features and labels into test and train data # Visit https://docs.microsoft.com/en-us/azure/machine-learning/service/tutorial-auto-train-models for more detail -x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=223) +output_split_train, output_split_test = train_test_split(transformed_df, test_size=0.2, random_state=223) +output_split_train.reset_index(inplace=True, drop=True) +output_split_test.reset_index(inplace=True, drop=True) -if not (args.output_split_train_x is None and - args.output_split_test_x is None and - args.output_split_train_y is None and - args.output_split_test_y is None): - write_output(x_train, args.output_split_train_x) - write_output(y_train, args.output_split_train_y) - write_output(x_test, args.output_split_test_x) - write_output(y_test, args.output_split_test_y) +if not (args.output_split_train is None and + args.output_split_test is None): + write_output(output_split_train, args.output_split_train) + write_output(output_split_test, args.output_split_test) diff --git a/how-to-use-azureml/machine-learning-pipelines/pipeline-style-transfer/pipeline-style-transfer.ipynb b/how-to-use-azureml/machine-learning-pipelines/pipeline-style-transfer/pipeline-style-transfer.ipynb index 59cf1c123..0643b8a9d 100644 --- a/how-to-use-azureml/machine-learning-pipelines/pipeline-style-transfer/pipeline-style-transfer.ipynb +++ b/how-to-use-azureml/machine-learning-pipelines/pipeline-style-transfer/pipeline-style-transfer.ipynb @@ -314,7 +314,7 @@ "cd = CondaDependencies()\n", "\n", "cd.add_channel(\"conda-forge\")\n", - "cd.add_conda_package(\"ffmpeg\")\n", + "cd.add_conda_package(\"ffmpeg==4.0.2\")\n", "\n", "# Runconfig\n", "amlcompute_run_config = RunConfiguration(conda_dependencies=cd)\n", @@ -334,8 +334,7 @@ "\n", "ffmpeg_images_ds_name = \"ffmpeg_images_data\"\n", "ffmpeg_images = PipelineData(name=\"ffmpeg_images\", datastore=default_datastore)\n", - "ffmpeg_images_file_dataset = ffmpeg_images.as_dataset()\n", - "ffmpeg_images_named_file_dataset = ffmpeg_images_file_dataset.as_named_input(ffmpeg_images_ds_name)" + "ffmpeg_images_file_dataset = ffmpeg_images.as_dataset()" ] }, { @@ -371,11 +370,11 @@ " script_name=\"process_video.py\",\n", " arguments=[\"--input_video\", orangutan_video,\n", " \"--output_audio\", ffmpeg_audio,\n", - " \"--output_images\", ffmpeg_images,\n", + " \"--output_images\", ffmpeg_images_file_dataset,\n", " ],\n", " compute_target=cpu_cluster,\n", " inputs=[orangutan_video],\n", - " outputs=[ffmpeg_images, ffmpeg_audio],\n", + " outputs=[ffmpeg_images_file_dataset, ffmpeg_audio],\n", " runconfig=amlcompute_run_config,\n", " source_directory=scripts_folder\n", ")\n", @@ -415,6 +414,7 @@ 
"parallel_cd.add_channel(\"pytorch\")\n", "parallel_cd.add_conda_package(\"pytorch\")\n", "parallel_cd.add_conda_package(\"torchvision\")\n", + "parallel_cd.add_conda_package(\"pillow<7\") # needed for torchvision==0.4.0\n", "\n", "styleenvironment = Environment(name=\"styleenvironment\")\n", "styleenvironment.python.conda_dependencies=parallel_cd\n", @@ -453,7 +453,7 @@ "\n", "distributed_style_transfer_step = ParallelRunStep(\n", " name=parallel_step_name,\n", - " inputs=[ffmpeg_images_named_file_dataset], # Input file share/blob container/file dataset\n", + " inputs=[ffmpeg_images_file_dataset], # Input file share/blob container/file dataset\n", " output=processed_images, # Output file share/blob container\n", " models=[mosaic_model, candy_model],\n", " tags = {'scenario': \"batch inference\", 'type': \"demo\"},\n", diff --git a/how-to-use-azureml/ml-frameworks/chainer/deployment/train-hyperparameter-tune-deploy-with-chainer/train-hyperparameter-tune-deploy-with-chainer.ipynb b/how-to-use-azureml/ml-frameworks/chainer/deployment/train-hyperparameter-tune-deploy-with-chainer/train-hyperparameter-tune-deploy-with-chainer.ipynb index a5ad0e1a1..94ce595ef 100644 --- a/how-to-use-azureml/ml-frameworks/chainer/deployment/train-hyperparameter-tune-deploy-with-chainer/train-hyperparameter-tune-deploy-with-chainer.ipynb +++ b/how-to-use-azureml/ml-frameworks/chainer/deployment/train-hyperparameter-tune-deploy-with-chainer/train-hyperparameter-tune-deploy-with-chainer.ipynb @@ -418,6 +418,15 @@ "hyperdrive_run.wait_for_completion(show_output=True)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert(hyperdrive_run.get_status() == \"Completed\")" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/how-to-use-azureml/ml-frameworks/pytorch/deployment/train-hyperparameter-tune-deploy-with-pytorch/train-hyperparameter-tune-deploy-with-pytorch.ipynb b/how-to-use-azureml/ml-frameworks/pytorch/deployment/train-hyperparameter-tune-deploy-with-pytorch/train-hyperparameter-tune-deploy-with-pytorch.ipynb index 4e621cfe4..936fb6273 100644 --- a/how-to-use-azureml/ml-frameworks/pytorch/deployment/train-hyperparameter-tune-deploy-with-pytorch/train-hyperparameter-tune-deploy-with-pytorch.ipynb +++ b/how-to-use-azureml/ml-frameworks/pytorch/deployment/train-hyperparameter-tune-deploy-with-pytorch/train-hyperparameter-tune-deploy-with-pytorch.ipynb @@ -440,6 +440,15 @@ "hyperdrive_run.wait_for_completion(show_output=True)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert(hyperdrive_run.get_status() == \"Completed\")" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/coco_eval.py b/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/coco_eval.py new file mode 100644 index 000000000..2f032132a --- /dev/null +++ b/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/coco_eval.py @@ -0,0 +1,350 @@ +import json +import tempfile + +import numpy as np +import copy +import time +import torch +import torch._six + +from pycocotools.cocoeval import COCOeval +from pycocotools.coco import COCO +import pycocotools.mask as mask_util + +from collections import defaultdict + +import utils + + +class CocoEvaluator(object): + def __init__(self, coco_gt, iou_types): + assert isinstance(iou_types, (list, tuple)) + coco_gt = copy.deepcopy(coco_gt) + 
self.coco_gt = coco_gt + + self.iou_types = iou_types + self.coco_eval = {} + for iou_type in iou_types: + self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type) + + self.img_ids = [] + self.eval_imgs = {k: [] for k in iou_types} + + def update(self, predictions): + img_ids = list(np.unique(list(predictions.keys()))) + self.img_ids.extend(img_ids) + + for iou_type in self.iou_types: + results = self.prepare(predictions, iou_type) + coco_dt = loadRes(self.coco_gt, results) if results else COCO() + coco_eval = self.coco_eval[iou_type] + + coco_eval.cocoDt = coco_dt + coco_eval.params.imgIds = list(img_ids) + img_ids, eval_imgs = evaluate(coco_eval) + + self.eval_imgs[iou_type].append(eval_imgs) + + def synchronize_between_processes(self): + for iou_type in self.iou_types: + self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2) + create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type]) + + def accumulate(self): + for coco_eval in self.coco_eval.values(): + coco_eval.accumulate() + + def summarize(self): + for iou_type, coco_eval in self.coco_eval.items(): + print("IoU metric: {}".format(iou_type)) + coco_eval.summarize() + + def prepare(self, predictions, iou_type): + if iou_type == "bbox": + return self.prepare_for_coco_detection(predictions) + elif iou_type == "segm": + return self.prepare_for_coco_segmentation(predictions) + elif iou_type == "keypoints": + return self.prepare_for_coco_keypoint(predictions) + else: + raise ValueError("Unknown iou type {}".format(iou_type)) + + def prepare_for_coco_detection(self, predictions): + coco_results = [] + for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + boxes = prediction["boxes"] + boxes = convert_to_xywh(boxes).tolist() + scores = prediction["scores"].tolist() + labels = prediction["labels"].tolist() + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": labels[k], + "bbox": box, + "score": scores[k], + } + for k, box in enumerate(boxes) + ] + ) + return coco_results + + def prepare_for_coco_segmentation(self, predictions): + coco_results = [] + for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + scores = prediction["scores"] + labels = prediction["labels"] + masks = prediction["masks"] + + masks = masks > 0.5 + + scores = prediction["scores"].tolist() + labels = prediction["labels"].tolist() + + rles = [ + mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0] + for mask in masks + ] + for rle in rles: + rle["counts"] = rle["counts"].decode("utf-8") + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": labels[k], + "segmentation": rle, + "score": scores[k], + } + for k, rle in enumerate(rles) + ] + ) + return coco_results + + def prepare_for_coco_keypoint(self, predictions): + coco_results = [] + for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + boxes = prediction["boxes"] + boxes = convert_to_xywh(boxes).tolist() + scores = prediction["scores"].tolist() + labels = prediction["labels"].tolist() + keypoints = prediction["keypoints"] + keypoints = keypoints.flatten(start_dim=1).tolist() + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": labels[k], + 'keypoints': keypoint, + "score": scores[k], + } + for k, keypoint in enumerate(keypoints) + ] + ) + return coco_results + + +def convert_to_xywh(boxes): + xmin, ymin, xmax, ymax = boxes.unbind(1) + return 
torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1) + + +def merge(img_ids, eval_imgs): + all_img_ids = utils.all_gather(img_ids) + all_eval_imgs = utils.all_gather(eval_imgs) + + merged_img_ids = [] + for p in all_img_ids: + merged_img_ids.extend(p) + + merged_eval_imgs = [] + for p in all_eval_imgs: + merged_eval_imgs.append(p) + + merged_img_ids = np.array(merged_img_ids) + merged_eval_imgs = np.concatenate(merged_eval_imgs, 2) + + # keep only unique (and in sorted order) images + merged_img_ids, idx = np.unique(merged_img_ids, return_index=True) + merged_eval_imgs = merged_eval_imgs[..., idx] + + return merged_img_ids, merged_eval_imgs + + +def create_common_coco_eval(coco_eval, img_ids, eval_imgs): + img_ids, eval_imgs = merge(img_ids, eval_imgs) + img_ids = list(img_ids) + eval_imgs = list(eval_imgs.flatten()) + + coco_eval.evalImgs = eval_imgs + coco_eval.params.imgIds = img_ids + coco_eval._paramsEval = copy.deepcopy(coco_eval.params) + + +################################################################# +# From pycocotools, just removed the prints and fixed +# a Python3 bug about unicode not defined +################################################################# + +# Ideally, pycocotools wouldn't have hard-coded prints +# so that we could avoid copy-pasting those two functions + +def createIndex(self): + # create index + # print('creating index...') + anns, cats, imgs = {}, {}, {} + imgToAnns, catToImgs = defaultdict(list), defaultdict(list) + if 'annotations' in self.dataset: + for ann in self.dataset['annotations']: + imgToAnns[ann['image_id']].append(ann) + anns[ann['id']] = ann + + if 'images' in self.dataset: + for img in self.dataset['images']: + imgs[img['id']] = img + + if 'categories' in self.dataset: + for cat in self.dataset['categories']: + cats[cat['id']] = cat + + if 'annotations' in self.dataset and 'categories' in self.dataset: + for ann in self.dataset['annotations']: + catToImgs[ann['category_id']].append(ann['image_id']) + + # print('index created!') + + # create class members + self.anns = anns + self.imgToAnns = imgToAnns + self.catToImgs = catToImgs + self.imgs = imgs + self.cats = cats + + +maskUtils = mask_util + + +def loadRes(self, resFile): + """ + Load result file and return a result api object. 
+ :param resFile (str) : file name of result file + :return: res (obj) : result api object + """ + res = COCO() + res.dataset['images'] = [img for img in self.dataset['images']] + + # print('Loading and preparing results...') + # tic = time.time() + if isinstance(resFile, torch._six.string_classes): + anns = json.load(open(resFile)) + elif type(resFile) == np.ndarray: + anns = self.loadNumpyAnnotations(resFile) + else: + anns = resFile + assert type(anns) == list, 'results in not an array of objects' + annsImgIds = [ann['image_id'] for ann in anns] + assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \ + 'Results do not correspond to current coco set' + if 'caption' in anns[0]: + imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns]) + res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds] + for id, ann in enumerate(anns): + ann['id'] = id + 1 + elif 'bbox' in anns[0] and not anns[0]['bbox'] == []: + res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + for id, ann in enumerate(anns): + bb = ann['bbox'] + x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]] + if 'segmentation' not in ann: + ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]] + ann['area'] = bb[2] * bb[3] + ann['id'] = id + 1 + ann['iscrowd'] = 0 + elif 'segmentation' in anns[0]: + res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + for id, ann in enumerate(anns): + # now only support compressed RLE format as segmentation results + ann['area'] = maskUtils.area(ann['segmentation']) + if 'bbox' not in ann: + ann['bbox'] = maskUtils.toBbox(ann['segmentation']) + ann['id'] = id + 1 + ann['iscrowd'] = 0 + elif 'keypoints' in anns[0]: + res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + for id, ann in enumerate(anns): + s = ann['keypoints'] + x = s[0::3] + y = s[1::3] + x1, x2, y1, y2 = np.min(x), np.max(x), np.min(y), np.max(y) + ann['area'] = (x2 - x1) * (y2 - y1) + ann['id'] = id + 1 + ann['bbox'] = [x1, y1, x2 - x1, y2 - y1] + # print('DONE (t={:0.2f}s)'.format(time.time()- tic)) + + res.dataset['annotations'] = anns + createIndex(res) + return res + + +def evaluate(self): + ''' + Run per image evaluation on given images and store results (a list of dict) in self.evalImgs + :return: None + ''' + # tic = time.time() + # print('Running per image evaluation...') + p = self.params + # add backward compatibility if useSegm is specified in params + if p.useSegm is not None: + p.iouType = 'segm' if p.useSegm == 1 else 'bbox' + print('useSegm (deprecated) is not None. 
Running {} evaluation'.format(p.iouType)) + # print('Evaluate annotation type *{}*'.format(p.iouType)) + p.imgIds = list(np.unique(p.imgIds)) + if p.useCats: + p.catIds = list(np.unique(p.catIds)) + p.maxDets = sorted(p.maxDets) + self.params = p + + self._prepare() + # loop through images, area range, max detection number + catIds = p.catIds if p.useCats else [-1] + + if p.iouType == 'segm' or p.iouType == 'bbox': + computeIoU = self.computeIoU + elif p.iouType == 'keypoints': + computeIoU = self.computeOks + self.ious = { + (imgId, catId): computeIoU(imgId, catId) + for imgId in p.imgIds + for catId in catIds} + + evaluateImg = self.evaluateImg + maxDet = p.maxDets[-1] + evalImgs = [ + evaluateImg(imgId, catId, areaRng, maxDet) + for catId in catIds + for areaRng in p.areaRng + for imgId in p.imgIds + ] + # this is NOT in the pycocotools code, but could be done outside + evalImgs = np.asarray(evalImgs).reshape( + len(catIds), len(p.areaRng), len(p.imgIds)) + self._paramsEval = copy.deepcopy(self.params) + # toc = time.time() + # print('DONE (t={:0.2f}s).'.format(toc-tic)) + return p.imgIds, evalImgs + +################################################################# +# end of straight copy from pycocotools, just removing the prints +################################################################# diff --git a/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/coco_utils.py b/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/coco_utils.py new file mode 100644 index 000000000..26701a2cb --- /dev/null +++ b/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/coco_utils.py @@ -0,0 +1,252 @@ +import copy +import os +from PIL import Image + +import torch +import torch.utils.data +import torchvision + +from pycocotools import mask as coco_mask +from pycocotools.coco import COCO + +import transforms as T + + +class FilterAndRemapCocoCategories(object): + def __init__(self, categories, remap=True): + self.categories = categories + self.remap = remap + + def __call__(self, image, target): + anno = target["annotations"] + anno = [obj for obj in anno if obj["category_id"] in self.categories] + if not self.remap: + target["annotations"] = anno + return image, target + anno = copy.deepcopy(anno) + for obj in anno: + obj["category_id"] = self.categories.index(obj["category_id"]) + target["annotations"] = anno + return image, target + + +def convert_coco_poly_to_mask(segmentations, height, width): + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = torch.as_tensor(mask, dtype=torch.uint8) + mask = mask.any(dim=2) + masks.append(mask) + if masks: + masks = torch.stack(masks, dim=0) + else: + masks = torch.zeros((0, height, width), dtype=torch.uint8) + return masks + + +class ConvertCocoPolysToMask(object): + def __call__(self, image, target): + w, h = image.size + + image_id = target["image_id"] + image_id = torch.tensor([image_id]) + + anno = target["annotations"] + + anno = [obj for obj in anno if obj['iscrowd'] == 0] + + boxes = [obj["bbox"] for obj in anno] + # guard against no boxes via resizing + boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + + classes = [obj["category_id"] for obj in anno] + classes = torch.tensor(classes, dtype=torch.int64) + + 
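# note: each COCO annotation stores its instance mask as a list of polygon vertices;
+        # convert_coco_poly_to_mask collapses them into a single [N, H, W] uint8 tensor
+        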
segmentations = [obj["segmentation"] for obj in anno] + masks = convert_coco_poly_to_mask(segmentations, h, w) + + keypoints = None + if anno and "keypoints" in anno[0]: + keypoints = [obj["keypoints"] for obj in anno] + keypoints = torch.as_tensor(keypoints, dtype=torch.float32) + num_keypoints = keypoints.shape[0] + if num_keypoints: + keypoints = keypoints.view(num_keypoints, -1, 3) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + boxes = boxes[keep] + classes = classes[keep] + masks = masks[keep] + if keypoints is not None: + keypoints = keypoints[keep] + + target = {} + target["boxes"] = boxes + target["labels"] = classes + target["masks"] = masks + target["image_id"] = image_id + if keypoints is not None: + target["keypoints"] = keypoints + + # for conversion to coco api + area = torch.tensor([obj["area"] for obj in anno]) + iscrowd = torch.tensor([obj["iscrowd"] for obj in anno]) + target["area"] = area + target["iscrowd"] = iscrowd + + return image, target + + +def _coco_remove_images_without_annotations(dataset, cat_list=None): + def _has_only_empty_bbox(anno): + return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno) + + def _count_visible_keypoints(anno): + return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno) + + min_keypoints_per_image = 10 + + def _has_valid_annotation(anno): + # if it's empty, there is no annotation + if len(anno) == 0: + return False + # if all boxes have close to zero area, there is no annotation + if _has_only_empty_bbox(anno): + return False + # keypoints task have a slight different critera for considering + # if an annotation is valid + if "keypoints" not in anno[0]: + return True + # for keypoint detection tasks, only consider valid images those + # containing at least min_keypoints_per_image + if _count_visible_keypoints(anno) >= min_keypoints_per_image: + return True + return False + + assert isinstance(dataset, torchvision.datasets.CocoDetection) + ids = [] + for ds_idx, img_id in enumerate(dataset.ids): + ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None) + anno = dataset.coco.loadAnns(ann_ids) + if cat_list: + anno = [obj for obj in anno if obj["category_id"] in cat_list] + if _has_valid_annotation(anno): + ids.append(ds_idx) + + dataset = torch.utils.data.Subset(dataset, ids) + return dataset + + +def convert_to_coco_api(ds): + coco_ds = COCO() + # annotation IDs need to start at 1, not 0, see torchvision issue #1530 + ann_id = 1 + dataset = {'images': [], 'categories': [], 'annotations': []} + categories = set() + for img_idx in range(len(ds)): + # find better way to get target + # targets = ds.get_annotations(img_idx) + img, targets = ds[img_idx] + image_id = targets["image_id"].item() + img_dict = {} + img_dict['id'] = image_id + img_dict['height'] = img.shape[-2] + img_dict['width'] = img.shape[-1] + dataset['images'].append(img_dict) + bboxes = targets["boxes"] + bboxes[:, 2:] -= bboxes[:, :2] + bboxes = bboxes.tolist() + labels = targets['labels'].tolist() + areas = targets['area'].tolist() + iscrowd = targets['iscrowd'].tolist() + if 'masks' in targets: + masks = targets['masks'] + # make masks Fortran contiguous for coco_mask + masks = masks.permute(0, 2, 1).contiguous().permute(0, 2, 1) + if 'keypoints' in targets: + keypoints = targets['keypoints'] + keypoints = keypoints.reshape(keypoints.shape[0], -1).tolist() + num_objs = len(bboxes) + for i in range(num_objs): + ann = {} + ann['image_id'] = image_id + ann['bbox'] = bboxes[i] + ann['category_id'] = labels[i] + 
categories.add(labels[i]) + ann['area'] = areas[i] + ann['iscrowd'] = iscrowd[i] + ann['id'] = ann_id + if 'masks' in targets: + ann["segmentation"] = coco_mask.encode(masks[i].numpy()) + if 'keypoints' in targets: + ann['keypoints'] = keypoints[i] + ann['num_keypoints'] = sum(k != 0 for k in keypoints[i][2::3]) + dataset['annotations'].append(ann) + ann_id += 1 + dataset['categories'] = [{'id': i} for i in sorted(categories)] + coco_ds.dataset = dataset + coco_ds.createIndex() + return coco_ds + + +def get_coco_api_from_dataset(dataset): + for _ in range(10): + if isinstance(dataset, torchvision.datasets.CocoDetection): + break + if isinstance(dataset, torch.utils.data.Subset): + dataset = dataset.dataset + if isinstance(dataset, torchvision.datasets.CocoDetection): + return dataset.coco + return convert_to_coco_api(dataset) + + +class CocoDetection(torchvision.datasets.CocoDetection): + def __init__(self, img_folder, ann_file, transforms): + super(CocoDetection, self).__init__(img_folder, ann_file) + self._transforms = transforms + + def __getitem__(self, idx): + img, target = super(CocoDetection, self).__getitem__(idx) + image_id = self.ids[idx] + target = dict(image_id=image_id, annotations=target) + if self._transforms is not None: + img, target = self._transforms(img, target) + return img, target + + +def get_coco(root, image_set, transforms, mode='instances'): + anno_file_template = "{}_{}2017.json" + PATHS = { + "train": ("train2017", os.path.join("annotations", anno_file_template.format(mode, "train"))), + "val": ("val2017", os.path.join("annotations", anno_file_template.format(mode, "val"))), + # "train": ("val2017", os.path.join("annotations", anno_file_template.format(mode, "val"))) + } + + t = [ConvertCocoPolysToMask()] + + if transforms is not None: + t.append(transforms) + transforms = T.Compose(t) + + img_folder, ann_file = PATHS[image_set] + img_folder = os.path.join(root, img_folder) + ann_file = os.path.join(root, ann_file) + + dataset = CocoDetection(img_folder, ann_file, transforms=transforms) + + if image_set == "train": + dataset = _coco_remove_images_without_annotations(dataset) + + # dataset = torch.utils.data.Subset(dataset, [i for i in range(500)]) + + return dataset + + +def get_coco_kp(root, image_set, transforms): + return get_coco(root, image_set, transforms, mode="person_keypoints") diff --git a/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/data.py b/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/data.py new file mode 100644 index 000000000..6b8ee4929 --- /dev/null +++ b/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/data.py @@ -0,0 +1,77 @@ +import numpy as np +import os +import torch.utils.data + +from azureml.core import Run +from PIL import Image + + +class PennFudanDataset(torch.utils.data.Dataset): + def __init__(self, root, transforms=None): + self.root = root + self.transforms = transforms + + # load all image files, sorting them to ensure that they are aligned + self.img_dir = os.path.join(root, "PNGImages") + self.mask_dir = os.path.join(root, "PedMasks") + + self.imgs = list(sorted(os.listdir(self.img_dir))) + self.masks = list(sorted(os.listdir(self.mask_dir))) + + def __getitem__(self, idx): + # load images ad masks + img_path = os.path.join(self.img_dir, self.imgs[idx]) + mask_path = os.path.join(self.mask_dir, self.masks[idx]) + + img = Image.open(img_path).convert("RGB") + # note that we haven't converted the mask to RGB, + # because each color 
corresponds to a different instance + # with 0 being background + mask = Image.open(mask_path) + + mask = np.array(mask) + # instances are encoded as different colors + obj_ids = np.unique(mask) + # first id is the background, so remove it + obj_ids = obj_ids[1:] + + # split the color-encoded mask into a set + # of binary masks + masks = mask == obj_ids[:, None, None] + + # get bounding box coordinates for each mask + num_objs = len(obj_ids) + boxes = [] + for i in range(num_objs): + pos = np.where(masks[i]) + xmin = np.min(pos[1]) + xmax = np.max(pos[1]) + ymin = np.min(pos[0]) + ymax = np.max(pos[0]) + boxes.append([xmin, ymin, xmax, ymax]) + + boxes = torch.as_tensor(boxes, dtype=torch.float32) + # there is only one class + labels = torch.ones((num_objs,), dtype=torch.int64) + masks = torch.as_tensor(masks, dtype=torch.uint8) + + image_id = torch.tensor([idx]) + area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]) + # suppose all instances are not crowd + iscrowd = torch.zeros((num_objs,), dtype=torch.int64) + + target = {} + target["boxes"] = boxes + target["labels"] = labels + target["masks"] = masks + target["image_id"] = image_id + target["area"] = area + target["iscrowd"] = iscrowd + + if self.transforms is not None: + img, target = self.transforms(img, target) + + return img, target + + def __len__(self): + return len(self.imgs) diff --git a/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/dockerfiles/Dockerfile b/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/dockerfiles/Dockerfile new file mode 100644 index 000000000..9b76f6001 --- /dev/null +++ b/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/dockerfiles/Dockerfile @@ -0,0 +1,16 @@ +# From https://github.com/microsoft/AzureML-BERT/blob/master/finetune/PyTorch/dockerfile + +FROM mcr.microsoft.com/azureml/base-gpu:openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04 + +RUN apt update && apt install git -y && rm -rf /var/lib/apt/lists/* + +RUN /opt/miniconda/bin/conda update -n base -c defaults conda +RUN /opt/miniconda/bin/conda install -y cython=0.29.15 numpy=1.18.1 +RUN /opt/miniconda/bin/conda install -y pytorch=1.4 torchvision=0.5.0 -c pytorch + +# Install cocoapi, required for drawing bounding boxes +RUN git clone https://github.com/cocodataset/cocoapi.git && cd cocoapi/PythonAPI && python setup.py build_ext install + +RUN pip install azureml-defaults +RUN pip install "azureml-dataprep[fuse]" +RUN pip install pandas pyarrow diff --git a/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/engine.py b/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/engine.py new file mode 100644 index 000000000..68c39a4fc --- /dev/null +++ b/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/engine.py @@ -0,0 +1,108 @@ +import math +import sys +import time +import torch + +import torchvision.models.detection.mask_rcnn + +from coco_utils import get_coco_api_from_dataset +from coco_eval import CocoEvaluator +import utils + + +def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq): + model.train() + metric_logger = utils.MetricLogger(delimiter=" ") + metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) + header = 'Epoch: [{}]'.format(epoch) + + lr_scheduler = None + if epoch == 0: + warmup_factor = 1. 
/ 1000 + warmup_iters = min(1000, len(data_loader) - 1) + + lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor) + + for images, targets in metric_logger.log_every(data_loader, print_freq, header): + images = list(image.to(device) for image in images) + targets = [{k: v.to(device) for k, v in t.items()} for t in targets] + + loss_dict = model(images, targets) + + losses = sum(loss for loss in loss_dict.values()) + + # reduce losses over all GPUs for logging purposes + loss_dict_reduced = utils.reduce_dict(loss_dict) + losses_reduced = sum(loss for loss in loss_dict_reduced.values()) + + loss_value = losses_reduced.item() + + if not math.isfinite(loss_value): + print("Loss is {}, stopping training".format(loss_value)) + print(loss_dict_reduced) + sys.exit(1) + + optimizer.zero_grad() + losses.backward() + optimizer.step() + + if lr_scheduler is not None: + lr_scheduler.step() + + metric_logger.update(loss=losses_reduced, **loss_dict_reduced) + metric_logger.update(lr=optimizer.param_groups[0]["lr"]) + + +def _get_iou_types(model): + model_without_ddp = model + if isinstance(model, torch.nn.parallel.DistributedDataParallel): + model_without_ddp = model.module + iou_types = ["bbox"] + if isinstance(model_without_ddp, torchvision.models.detection.MaskRCNN): + iou_types.append("segm") + if isinstance(model_without_ddp, torchvision.models.detection.KeypointRCNN): + iou_types.append("keypoints") + return iou_types + + +@torch.no_grad() +def evaluate(model, data_loader, device): + n_threads = torch.get_num_threads() + # FIXME remove this and make paste_masks_in_image run on the GPU + torch.set_num_threads(1) + cpu_device = torch.device("cpu") + model.eval() + metric_logger = utils.MetricLogger(delimiter=" ") + header = 'Test:' + + coco = get_coco_api_from_dataset(data_loader.dataset) + iou_types = _get_iou_types(model) + coco_evaluator = CocoEvaluator(coco, iou_types) + + for image, targets in metric_logger.log_every(data_loader, 100, header): + image = list(img.to(device) for img in image) + targets = [{k: v.to(device) for k, v in t.items()} for t in targets] + + torch.cuda.synchronize() + model_time = time.time() + outputs = model(image) + + outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs] + model_time = time.time() - model_time + + res = {target["image_id"].item(): output for target, output in zip(targets, outputs)} + evaluator_time = time.time() + coco_evaluator.update(res) + evaluator_time = time.time() - evaluator_time + metric_logger.update(model_time=model_time, evaluator_time=evaluator_time) + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + coco_evaluator.synchronize_between_processes() + + # accumulate predictions from all images + coco_evaluator.accumulate() + coco_evaluator.summarize() + torch.set_num_threads(n_threads) + return coco_evaluator diff --git a/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/model.py b/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/model.py new file mode 100644 index 000000000..12e32effa --- /dev/null +++ b/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/model.py @@ -0,0 +1,23 @@ +import torchvision + +from torchvision.models.detection.faster_rcnn import FastRCNNPredictor +from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor + + +def get_instance_segmentation_model(num_classes): + # load an instance segmentation model 
pre-trained on COCO
+    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
+
+    # get the number of input features for the classifier
+    in_features = model.roi_heads.box_predictor.cls_score.in_features
+    # replace the pre-trained head with a new one
+    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
+
+    # now get the number of input features for the mask classifier
+    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
+    hidden_layer = 256
+    # and replace the mask predictor with a new one
+    model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask,
+                                                       hidden_layer,
+                                                       num_classes)
+    return model
diff --git a/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/pytorch-mask-rcnn.ipynb b/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/pytorch-mask-rcnn.ipynb
new file mode 100644
index 000000000..e21a40aba
--- /dev/null
+++ b/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/pytorch-mask-rcnn.ipynb
@@ -0,0 +1,544 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Copyright (c) Microsoft Corporation. All rights reserved.\n",
+    "\n",
+    "Licensed under the MIT License."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/MachineLearningNotebooks/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/pytorch-mask-rcnn.png)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Object detection with PyTorch, Mask R-CNN, and a custom Dockerfile\n",
+    "\n",
+    "In this tutorial, you will fine-tune a pre-trained [Mask R-CNN](https://arxiv.org/abs/1703.06870) model on images from the [Penn-Fudan Database for Pedestrian Detection and Segmentation](https://www.cis.upenn.edu/~jshi/ped_html/). The dataset has 170 images with 345 instances of pedestrians. After running this tutorial, you will have a model that can outline the silhouettes of all pedestrians within an image.\n",
+    "\n",
+    "You'll use Azure Machine Learning to:\n",
+    "\n",
+    "- Initialize a workspace\n",
+    "- Create a compute cluster\n",
+    "- Define a training environment\n",
+    "- Train a model remotely\n",
+    "- Register your model\n",
+    "- Generate predictions locally\n",
+    "\n",
+    "## Prerequisites\n",
+    "\n",
+    "- If you are using an Azure Machine Learning Notebook VM, your environment already meets these prerequisites. Otherwise, go through the [configuration notebook](../../../../../configuration.ipynb) to install the Azure Machine Learning Python SDK and [create an Azure ML Workspace](https://docs.microsoft.com/azure/machine-learning/how-to-manage-workspace#create-a-workspace). You also need matplotlib 3.2, pycocotools 2.0.0, torchvision >= 0.5.0, and torch >= 1.4.0.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check core SDK version number, check other dependencies\n",
+    "import azureml.core\n",
+    "import matplotlib\n",
+    "import pycocotools\n",
+    "import torch\n",
+    "import torchvision\n",
+    "\n",
+    "print(\"SDK version:\", azureml.core.VERSION)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Diagnostics\n",
+    "\n",
+    "Opt-in diagnostics for better experience, quality, and security in future releases."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from azureml.telemetry import set_diagnostics_collection\n",
+    "\n",
+    "set_diagnostics_collection(send_diagnostics=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Initialize a workspace\n",
+    "\n",
+    "Initialize a [workspace](https://docs.microsoft.com/en-us/azure/machine-learning/concept-workspace) object from the existing workspace you created in the Prerequisites step. `Workspace.from_config()` creates a workspace object from the details stored in `config.json`, using the [from_config()](https://docs.microsoft.com/python/api/azureml-core/azureml.core.workspace(class)?view=azure-ml-py#from-config-path-none--auth-none---logger-none---file-name-none-) method."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from azureml.core.workspace import Workspace\n",
+    "\n",
+    "ws = Workspace.from_config()\n",
+    "print('Workspace name: ' + ws.name, \n",
+    "      'Azure region: ' + ws.location, \n",
+    "      'Subscription id: ' + ws.subscription_id, \n",
+    "      'Resource group: ' + ws.resource_group, sep='\\n')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create or attach existing Azure ML Managed Compute\n",
+    "\n",
+    "You will need to create a [compute target](https://docs.microsoft.com/azure/machine-learning/concept-compute-target) for training your model. In this tutorial, we use [Azure ML managed compute](https://docs.microsoft.com/azure/machine-learning/how-to-set-up-training-targets#amlcompute) for our remote training compute resource. Specifically, the below code creates a `STANDARD_NC6` GPU cluster that autoscales from 0 to 4 nodes.\n",
+    "\n",
+    "**Creation of Compute takes approximately 5 minutes.** If an Azure ML Compute cluster with that name is already in your workspace, this code will skip the creation process. \n",
+    "\n",
+    "As with other Azure services, there are limits on certain resources associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/azure/machine-learning/how-to-manage-quotas) on the default limits and how to request more quota.\n",
+    "\n",
+    "> Note that the below code creates GPU compute. If you instead want to create CPU compute, provide a different VM size to the `vm_size` parameter, such as `STANDARD_D2_V2`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from azureml.core.compute import ComputeTarget, AmlCompute\n",
+    "from azureml.core.compute_target import ComputeTargetException\n",
+    "\n",
+    "\n",
+    "# choose a name for your cluster\n",
+    "cluster_name = 'gpu-cluster'\n",
+    "\n",
+    "try:\n",
+    "    compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n",
+    "    print('Found existing compute target.')\n",
+    "except ComputeTargetException:\n",
+    "    print('Creating a new compute target...')\n",
+    "    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', \n",
+    "                                                           max_nodes=4)\n",
+    "\n",
+    "    # create the cluster\n",
+    "    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)\n",
+    "\n",
+    "    compute_target.wait_for_completion(show_output=True)\n",
+    "\n",
+    "# use get_status() to get a detailed status for the current cluster.\n",
\n", + "print(compute_target.get_status().serialize())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define a training environment\n", + "\n", + "### Create a project directory\n", + "Create a directory that will contain all the code from your local machine that you will need access to on the remote resource. This includes the training script an any additional files your training script depends on." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "project_folder = './pytorch-peds'\n", + "\n", + "try:\n", + " os.makedirs(project_folder, exist_ok=False)\n", + "except FileExistsError:\n", + " print('project folder {} exists, moving on...'.format(project_folder))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Copy training script and dependencies into project directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import shutil\n", + "\n", + "files_to_copy = ['data', 'model', 'script', 'utils', 'transforms', 'coco_eval', 'engine', 'coco_utils']\n", + "for file in files_to_copy:\n", + " shutil.copy(os.path.join(os.getcwd(), (file + '.py')), project_folder)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create an experiment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Experiment\n", + "\n", + "experiment_name = 'pytorch-peds'\n", + "experiment = Experiment(ws, name=experiment_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Specify dependencies with a custom Dockerfile\n", + "\n", + "There are a number of ways to [use environments](https://docs.microsoft.com/azure/machine-learning/how-to-use-environments) for specifying dependencies during model training. In this case, we use a custom Dockerfile." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Environment\n", + "\n", + "my_env = Environment(name='maskr-docker')\n", + "my_env.docker.enabled = True\n", + "with open(\"dockerfiles/Dockerfile\", \"r\") as f:\n", + " dockerfile_contents=f.read()\n", + "my_env.docker.base_dockerfile=dockerfile_contents\n", + "my_env.docker.base_image = None\n", + "my_env.python.interpreter_path = '/opt/miniconda/bin/python'\n", + "my_env.python.user_managed_dependencies = True\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a ScriptRunConfig\n", + "\n", + "Use the [ScriptRunConfig](https://docs.microsoft.com/python/api/azureml-core/azureml.core.scriptrunconfig?view=azure-ml-py) class to define your run. Specify the source directory, compute target, and environment." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.train.dnn import PyTorch\n", + "from azureml.core import ScriptRunConfig\n", + "\n", + "model_name = 'pytorch-peds'\n", + "output_dir = './outputs/'\n", + "n_epochs = 2\n", + "\n", + "script_args = [\n", + " '--model_name', model_name,\n", + " '--output_dir', output_dir,\n", + " '--n_epochs', n_epochs,\n", + "]\n", + "# Add training script to run config\n", + "runconfig = ScriptRunConfig(\n", + " source_directory=project_folder,\n", + " script=\"script.py\",\n", + " arguments=script_args)\n", + "\n", + "# Attach compute target to run config\n", + "runconfig.run_config.target = cluster_name\n", + "\n", + "# Uncomment the line below if you want to try this locally first\n", + "#runconfig.run_config.target = \"local\"\n", + "\n", + "# Attach environment to run config\n", + "runconfig.run_config.environment = my_env" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train remotely\n", + "\n", + "### Submit your run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Submit run \n", + "run = experiment.submit(runconfig)\n", + "\n", + "# to get more details of your run\n", + "print(run.get_details())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Monitor your run\n", + "\n", + "Use a widget to keep track of your run. You can also view the status of the run within the [Azure Machine Learning service portal](https://ml.azure.com)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.widgets import RunDetails\n", + "\n", + "RunDetails(run).show()\n", + "run.wait_for_completion(show_output=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test your model\n", + "\n", + "Now that we are done training, let's see how well this model actually performs.\n", + "\n", + "### Get your latest run\n", + "First, pull the latest run using `experiment.get_runs()`, which lists runs from `experiment` in reverse chronological order." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Run\n", + "\n", + "last_run = next(experiment.get_runs())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Register your model\n", + "Next, [register the model](https://docs.microsoft.com/azure/machine-learning/concept-model-management-and-deployment#register-package-and-deploy-models-from-anywhere) from your run. Registering your model assigns it a version and helps you with auditability." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "last_run.register_model(model_name=model_name, model_path=os.path.join(output_dir, model_name))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Download your model\n", + "Next, download this registered model. Notice how we can initialize the `Model` object with the name of the registered model, rather than a path to the file itself." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from azureml.core import Model\n",
+    "\n",
+    "model = Model(workspace=ws, name=model_name)\n",
+    "path = model.download(target_dir='model', exist_ok=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Use your model to make a prediction\n",
+    "\n",
+    "Run inferencing on a single test image and display the results."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from azureml.core import Dataset\n",
+    "from data import PennFudanDataset\n",
+    "from script import get_transform, download_data, NUM_CLASSES\n",
+    "from model import get_instance_segmentation_model\n",
+    "\n",
+    "if torch.cuda.is_available():\n",
+    "    device = torch.device('cuda')\n",
+    "else:\n",
+    "    device = torch.device('cpu')\n",
+    "\n",
+    "# Instantiate model with correct weights, cast to correct device, place in evaluation mode\n",
+    "predict_model = get_instance_segmentation_model(NUM_CLASSES)\n",
+    "predict_model.to(device)\n",
+    "predict_model.load_state_dict(torch.load(path, map_location=device))\n",
+    "predict_model.eval()\n",
+    "\n",
+    "# Load dataset\n",
+    "root_dir=download_data()\n",
+    "dataset_test = PennFudanDataset(root=root_dir, transforms=get_transform(train=False))\n",
+    "\n",
+    "# pick one image from the test set\n",
+    "img, _ = dataset_test[0]\n",
+    "\n",
+    "with torch.no_grad():\n",
+    "    prediction = predict_model([img.to(device)])\n",
+    "\n",
+    "# model = torch.load(path)\n",
+    "#torch.load(model.get_model_path(model_name='outputs/model.pt'))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Display the input image\n",
+    "\n",
+    "While tensors are great for computers, a tensor of RGB values doesn't mean much to a human. Let's display the input image in a way that a human could understand."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from PIL import Image\n",
+    "\n",
+    "\n",
+    "Image.fromarray(img.mul(255).permute(1, 2, 0).byte().numpy())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Display the predicted masks\n",
+    "\n",
+    "The prediction consists of masks outlining the pedestrians in the image. Let's take a look at the first two masks, below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Image.fromarray(prediction[0]['masks'][0, 0].mul(255).byte().cpu().numpy())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Image.fromarray(prediction[0]['masks'][1, 0].mul(255).byte().cpu().numpy())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Next steps\n",
+    "\n",
+    "Congratulations! You just trained a Mask R-CNN model with PyTorch in Azure Machine Learning. As next steps, consider:\n",
+    "1. Learn more about using PyTorch in Azure Machine Learning service by checking out the [README](./README.md)\n",
+    "2. Try exporting your model to [ONNX](https://docs.microsoft.com/azure/machine-learning/concept-onnx) for accelerated inferencing."
+ ] + } + ], + "metadata": { + "authors": [ + { + "name": "gopalv" + } + ], + "category": "training", + "compute": [ + "AML Compute" + ], + "datasets": [ + "Custom" + ], + "deployment": [ + "None" + ], + "exclude_from_index": false, + "framework": [ + "PyTorch" + ], + "friendly_name": "PyTorch object detection", + "index_order": 1, + "kernel_info": { + "name": "python3" + }, + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5-final" + }, + "nteract": { + "version": "nteract-front-end@1.0.0" + }, + "tags": [ + "remote run", + "docker" + ], + "task": "Fine-tune PyTorch object detection model with a custom dockerfile" + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/pytorch-mask-rcnn.yml b/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/pytorch-mask-rcnn.yml new file mode 100644 index 000000000..4302c3493 --- /dev/null +++ b/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/pytorch-mask-rcnn.yml @@ -0,0 +1,14 @@ +name: pytorch-mask-rcnn +dependencies: +- cython +- pytorch -c pytorch +- torchvision -c pytorch +- pip: + - azureml-sdk + - azureml-widgets + - azureml-dataprep + - fuse + - pandas + - matplotlib + - pillow==7.0.0 + - git+https://github.com/philferriere/cocoapi.git#subdirectory=PythonAPI diff --git a/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/script.py b/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/script.py new file mode 100644 index 000000000..5851cffaf --- /dev/null +++ b/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/script.py @@ -0,0 +1,117 @@ +import argparse +import os +import torch +import torchvision +import transforms as T +import urllib.request +import utils + +from azureml.core import Dataset, Run +from data import PennFudanDataset +from engine import train_one_epoch, evaluate +from model import get_instance_segmentation_model +from zipfile import ZipFile + +NUM_CLASSES = 2 + + +def download_data(): + data_file = 'PennFudanPed.zip' + ds_path = 'PennFudanPed/' + urllib.request.urlretrieve('https://www.cis.upenn.edu/~jshi/ped_html/PennFudanPed.zip', data_file) + zip = ZipFile(file=data_file) + zip.extractall(path=ds_path) + return os.path.join(ds_path, zip.namelist()[0]) + + +def get_transform(train): + transforms = [] + # converts the image, a PIL image, into a PyTorch Tensor + transforms.append(T.ToTensor()) + if train: + # during training, randomly flip the training images + # and ground-truth for data augmentation + transforms.append(T.RandomHorizontalFlip(0.5)) + return T.Compose(transforms) + + +def main(): + print("Torch version:", torch.__version__) + # get command-line arguments + parser = argparse.ArgumentParser() + parser.add_argument('--model_name', type=str, default="pytorch-peds.pt", + help='name with which to register your model') + parser.add_argument('--output_dir', default="local-outputs", + type=str, help='output directory') + parser.add_argument('--n_epochs', type=int, + default=10, help='number of epochs') + args = parser.parse_args() + + # In case user inputs a nested output directory + 
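# (exist_ok=True also keeps repeated runs from failing when the directory already exists)
+    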
os.makedirs(name=args.output_dir, exist_ok=True) + + # Get a dataset by name + root_dir = download_data() + + # use our dataset and defined transformations + dataset = PennFudanDataset(root=root_dir, transforms=get_transform(train=True)) + dataset_test = PennFudanDataset(root=root_dir, transforms=get_transform(train=False)) + + # split the dataset in train and test set + torch.manual_seed(1) + indices = torch.randperm(len(dataset)).tolist() + dataset = torch.utils.data.Subset(dataset, indices[:-50]) + dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:]) + + # define training and validation data loaders + data_loader = torch.utils.data.DataLoader( + dataset, batch_size=2, shuffle=True, num_workers=4, + collate_fn=utils.collate_fn) + + data_loader_test = torch.utils.data.DataLoader( + dataset_test, batch_size=1, shuffle=False, num_workers=4, + collate_fn=utils.collate_fn) + + if torch.cuda.is_available(): + print('Using GPU') + device = torch.device('cuda') + else: + print('Using CPU') + device = torch.device('cpu') + + # our dataset has two classes only - background and person + num_classes = NUM_CLASSES + + # get the model using our helper function + model = get_instance_segmentation_model(num_classes) + + # move model to the right device + model.to(device) + + # construct an optimizer + params = [p for p in model.parameters() if p.requires_grad] + optimizer = torch.optim.SGD(params, lr=0.005, + momentum=0.9, weight_decay=0.0005) + + # and a learning rate scheduler which decreases the learning rate by + # 10x every 3 epochs + lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, + step_size=3, + gamma=0.1) + + for epoch in range(args.n_epochs): + # train for one epoch, printing every 10 iterations + train_one_epoch( + model, optimizer, data_loader, device, epoch, print_freq=10) + # update the learning rate + lr_scheduler.step() + # evaluate on the test dataset + evaluate(model, data_loader_test, device=device) + + # Saving the state dict is recommended method, per + # https://pytorch.org/tutorials/beginner/saving_loading_models.html + torch.save(model.state_dict(), os.path.join(args.output_dir, args.model_name)) + + +if __name__ == '__main__': + main() diff --git a/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/transforms.py b/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/transforms.py new file mode 100644 index 000000000..73efc92bd --- /dev/null +++ b/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/transforms.py @@ -0,0 +1,50 @@ +import random +import torch + +from torchvision.transforms import functional as F + + +def _flip_coco_person_keypoints(kps, width): + flip_inds = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15] + flipped_data = kps[:, flip_inds] + flipped_data[..., 0] = width - flipped_data[..., 0] + # Maintain COCO convention that if visibility == 0, then x, y = 0 + inds = flipped_data[..., 2] == 0 + flipped_data[inds] = 0 + return flipped_data + + +class Compose(object): + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, image, target): + for t in self.transforms: + image, target = t(image, target) + return image, target + + +class RandomHorizontalFlip(object): + def __init__(self, prob): + self.prob = prob + + def __call__(self, image, target): + if random.random() < self.prob: + height, width = image.shape[-2:] + image = image.flip(-1) + bbox = target["boxes"] + bbox[:, [0, 2]] = width - bbox[:, [2, 0]] + target["boxes"] 
= bbox + if "masks" in target: + target["masks"] = target["masks"].flip(-1) + if "keypoints" in target: + keypoints = target["keypoints"] + keypoints = _flip_coco_person_keypoints(keypoints, width) + target["keypoints"] = keypoints + return image, target + + +class ToTensor(object): + def __call__(self, image, target): + image = F.to_tensor(image) + return image, target diff --git a/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/utils.py b/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/utils.py new file mode 100644 index 000000000..0e8e85601 --- /dev/null +++ b/how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/utils.py @@ -0,0 +1,326 @@ +from __future__ import print_function + +from collections import defaultdict, deque +import datetime +import pickle +import time + +import torch +import torch.distributed as dist + +import errno +import os + + +class SmoothedValue(object): + """Track a series of values and provide access to smoothed values over a + window or the global series average. + """ + + def __init__(self, window_size=20, fmt=None): + if fmt is None: + fmt = "{median:.4f} ({global_avg:.4f})" + self.deque = deque(maxlen=window_size) + self.total = 0.0 + self.count = 0 + self.fmt = fmt + + def update(self, value, n=1): + self.deque.append(value) + self.count += n + self.total += value * n + + def synchronize_between_processes(self): + """ + Warning: does not synchronize the deque! + """ + if not is_dist_avail_and_initialized(): + return + t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') + dist.barrier() + dist.all_reduce(t) + t = t.tolist() + self.count = int(t[0]) + self.total = t[1] + + @property + def median(self): + d = torch.tensor(list(self.deque)) + return d.median().item() + + @property + def avg(self): + d = torch.tensor(list(self.deque), dtype=torch.float32) + return d.mean().item() + + @property + def global_avg(self): + return self.total / self.count + + @property + def max(self): + return max(self.deque) + + @property + def value(self): + return self.deque[-1] + + def __str__(self): + return self.fmt.format( + median=self.median, + avg=self.avg, + global_avg=self.global_avg, + max=self.max, + value=self.value) + + +def all_gather(data): + """ + Run all_gather on arbitrary picklable data (not necessarily tensors) + Args: + data: any picklable object + Returns: + list[data]: list of data gathered from each rank + """ + world_size = get_world_size() + if world_size == 1: + return [data] + + # serialized to a Tensor + buffer = pickle.dumps(data) + storage = torch.ByteStorage.from_buffer(buffer) + tensor = torch.ByteTensor(storage).to("cuda") + + # obtain Tensor size of each rank + local_size = torch.tensor([tensor.numel()], device="cuda") + size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)] + dist.all_gather(size_list, local_size) + size_list = [int(size.item()) for size in size_list] + max_size = max(size_list) + + # receiving Tensor from all ranks + # we pad the tensor because torch all_gather does not support + # gathering tensors of different shapes + tensor_list = [] + for _ in size_list: + tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda")) + if local_size != max_size: + padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda") + tensor = torch.cat((tensor, padding), dim=0) + dist.all_gather(tensor_list, tensor) + + data_list = [] + for size, tensor in zip(size_list, 
tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + + return data_list + + +def reduce_dict(input_dict, average=True): + """ + Args: + input_dict (dict): all the values will be reduced + average (bool): whether to do average or sum + Reduce the values in the dictionary from all processes so that all processes + have the averaged results. Returns a dict with the same fields as + input_dict, after reduction. + """ + world_size = get_world_size() + if world_size < 2: + return input_dict + with torch.no_grad(): + names = [] + values = [] + # sort the keys so that they are consistent across processes + for k in sorted(input_dict.keys()): + names.append(k) + values.append(input_dict[k]) + values = torch.stack(values, dim=0) + dist.all_reduce(values) + if average: + values /= world_size + reduced_dict = {k: v for k, v in zip(names, values)} + return reduced_dict + + +class MetricLogger(object): + def __init__(self, delimiter="\t"): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + + def update(self, **kwargs): + for k, v in kwargs.items(): + if isinstance(v, torch.Tensor): + v = v.item() + assert isinstance(v, (float, int)) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError("'{}' object has no attribute '{}'".format( + type(self).__name__, attr)) + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append( + "{}: {}".format(name, str(meter)) + ) + return self.delimiter.join(loss_str) + + def synchronize_between_processes(self): + for meter in self.meters.values(): + meter.synchronize_between_processes() + + def add_meter(self, name, meter): + self.meters[name] = meter + + def log_every(self, iterable, print_freq, header=None): + i = 0 + if not header: + header = '' + start_time = time.time() + end = time.time() + iter_time = SmoothedValue(fmt='{avg:.4f}') + data_time = SmoothedValue(fmt='{avg:.4f}') + space_fmt = ':' + str(len(str(len(iterable)))) + 'd' + if torch.cuda.is_available(): + log_msg = self.delimiter.join([ + header, + '[{0' + space_fmt + '}/{1}]', + 'eta: {eta}', + '{meters}', + 'time: {time}', + 'data: {data}', + 'max mem: {memory:.0f}' + ]) + else: + log_msg = self.delimiter.join([ + header, + '[{0' + space_fmt + '}/{1}]', + 'eta: {eta}', + '{meters}', + 'time: {time}', + 'data: {data}' + ]) + MB = 1024.0 * 1024.0 + for obj in iterable: + data_time.update(time.time() - end) + yield obj + iter_time.update(time.time() - end) + if i % print_freq == 0 or i == len(iterable) - 1: + eta_seconds = iter_time.global_avg * (len(iterable) - i) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + if torch.cuda.is_available(): + print(log_msg.format( + i, len(iterable), eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time), + memory=torch.cuda.max_memory_allocated() / MB)) + else: + print(log_msg.format( + i, len(iterable), eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time))) + i += 1 + end = time.time() + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('{} Total time: {} ({:.4f} s / it)'.format( + header, total_time_str, total_time / len(iterable))) + + +def collate_fn(batch): + return tuple(zip(*batch)) + + +def warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor): + + def f(x): + if x >= warmup_iters: 
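+            # once warmup is over, LambdaLR scales the base learning rate by 1.0 (unchanged)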
+ return 1 + alpha = float(x) / warmup_iters + return warmup_factor * (1 - alpha) + alpha + + return torch.optim.lr_scheduler.LambdaLR(optimizer, f) + + +def mkdir(path): + try: + os.makedirs(path) + except OSError as e: + if e.errno != errno.EEXIST: + raise + + +def setup_for_distributed(is_master): + """ + This function disables printing when not in master process + """ + import builtins as __builtin__ + builtin_print = __builtin__.print + + def print(*args, **kwargs): + force = kwargs.pop('force', False) + if is_master or force: + builtin_print(*args, **kwargs) + + __builtin__.print = print + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + + +def get_world_size(): + if not is_dist_avail_and_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + + +def is_main_process(): + return get_rank() == 0 + + +def save_on_master(*args, **kwargs): + if is_main_process(): + torch.save(*args, **kwargs) + + +def init_distributed_mode(args): + if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: + args.rank = int(os.environ["RANK"]) + args.world_size = int(os.environ['WORLD_SIZE']) + args.gpu = int(os.environ['LOCAL_RANK']) + elif 'SLURM_PROCID' in os.environ: + args.rank = int(os.environ['SLURM_PROCID']) + args.gpu = args.rank % torch.cuda.device_count() + else: + print('Not using distributed mode') + args.distributed = False + return + + args.distributed = True + + torch.cuda.set_device(args.gpu) + args.dist_backend = 'nccl' + print('| distributed init (rank {}): {}'.format( + args.rank, args.dist_url), flush=True) + torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + torch.distributed.barrier() + setup_for_distributed(args.rank == 0) diff --git a/how-to-use-azureml/ml-frameworks/scikit-learn/training/train-hyperparameter-tune-deploy-with-sklearn/train-hyperparameter-tune-deploy-with-sklearn.ipynb b/how-to-use-azureml/ml-frameworks/scikit-learn/training/train-hyperparameter-tune-deploy-with-sklearn/train-hyperparameter-tune-deploy-with-sklearn.ipynb index a501e22fc..281864b32 100644 --- a/how-to-use-azureml/ml-frameworks/scikit-learn/training/train-hyperparameter-tune-deploy-with-sklearn/train-hyperparameter-tune-deploy-with-sklearn.ipynb +++ b/how-to-use-azureml/ml-frameworks/scikit-learn/training/train-hyperparameter-tune-deploy-with-sklearn/train-hyperparameter-tune-deploy-with-sklearn.ipynb @@ -487,6 +487,15 @@ "hyperdrive_run.wait_for_completion(show_output=True)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert(hyperdrive_run.get_status() == \"Completed\")" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.ipynb b/how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.ipynb index 1d4f2de7d..314161c1c 100644 --- a/how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.ipynb +++ 
b/how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.ipynb @@ -255,7 +255,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You may want to regiester datasets using the register() method to your workspace so they can be shared with others, reused across various experiments, and referred to by name in your training script." + "You may want to regiester datasets using the register() method to your workspace so they can be shared with others, reused across various experiments, and referred to by name in your training script.\n", + "You can try get the dataset first to see if it's already registered." ] }, { @@ -264,10 +265,18 @@ "metadata": {}, "outputs": [], "source": [ - "dataset = dataset.register(workspace = ws,\n", - " name = 'mnist dataset',\n", - " description='training and test dataset',\n", - " create_new_version=True)\n", + "dataset_registered = False\n", + "try:\n", + " temp = Dataset.get_by_name(workspace = ws, name = 'mnist-dataset')\n", + " dataset_registered = True\n", + "except:\n", + " print(\"The dataset mnist-dataset is not registered in workspace yet.\")\n", + "\n", + "if not dataset_registered:\n", + " dataset = dataset.register(workspace = ws,\n", + " name = 'mnist-dataset',\n", + " description='training and test dataset',\n", + " create_new_version=True)\n", "# list the files referenced by dataset\n", "dataset.to_path()" ] @@ -823,6 +832,15 @@ "htr.wait_for_completion(show_output=True)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert(htr.get_status() == \"Completed\")" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/how-to-use-azureml/ml-frameworks/tensorflow/training/hyperparameter-tune-and-warm-start-with-tensorflow/hyperparameter-tune-and-warm-start-with-tensorflow.ipynb b/how-to-use-azureml/ml-frameworks/tensorflow/training/hyperparameter-tune-and-warm-start-with-tensorflow/hyperparameter-tune-and-warm-start-with-tensorflow.ipynb index cddc5d421..aabcacc51 100644 --- a/how-to-use-azureml/ml-frameworks/tensorflow/training/hyperparameter-tune-and-warm-start-with-tensorflow/hyperparameter-tune-and-warm-start-with-tensorflow.ipynb +++ b/how-to-use-azureml/ml-frameworks/tensorflow/training/hyperparameter-tune-and-warm-start-with-tensorflow/hyperparameter-tune-and-warm-start-with-tensorflow.ipynb @@ -255,7 +255,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Use the register() method to register datasets to your workspace so they can be shared with others, reused across various experiments, and referred to by name in your training script." + "Use the register() method to register datasets to your workspace so they can be shared with others, reused across various experiments, and referred to by name in your training script.\n", + "You can try get the dataset first to see if it's already registered." 
] }, { @@ -264,10 +265,18 @@ "metadata": {}, "outputs": [], "source": [ - "dataset = dataset.register(workspace = ws,\n", - " name = 'mnist dataset',\n", - " description='training and test dataset',\n", - " create_new_version=True)" + "dataset_registered = False\n", + "try:\n", + " temp = Dataset.get_by_name(workspace = ws, name = 'mnist-dataset')\n", + " dataset_registered = True\n", + "except:\n", + " print(\"The dataset mnist-dataset is not registered in workspace yet.\")\n", + "\n", + "if not dataset_registered:\n", + " dataset = dataset.register(workspace = ws,\n", + " name = 'mnist-dataset',\n", + " description='training and test dataset',\n", + " create_new_version=True)" ] }, { @@ -634,6 +643,15 @@ "htr.wait_for_completion(show_output=True)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert(htr.get_status() == \"Completed\")" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/how-to-use-azureml/ml-frameworks/tensorflow/training/train-tensorflow-resume-training/train-tensorflow-resume-training.ipynb b/how-to-use-azureml/ml-frameworks/tensorflow/training/train-tensorflow-resume-training/train-tensorflow-resume-training.ipynb index be6851feb..71f1e7ec3 100644 --- a/how-to-use-azureml/ml-frameworks/tensorflow/training/train-tensorflow-resume-training/train-tensorflow-resume-training.ipynb +++ b/how-to-use-azureml/ml-frameworks/tensorflow/training/train-tensorflow-resume-training/train-tensorflow-resume-training.ipynb @@ -170,7 +170,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "you may want to register datasets using the register() method to your workspace so they can be shared with others, reused across various experiments, and referred to by name in your training script." + "you may want to register datasets using the register() method to your workspace so they can be shared with others, reused across various experiments, and referred to by name in your training script.\n", + "You can try get the dataset first to see if it's already registered." 
] }, { @@ -179,11 +180,19 @@ "metadata": {}, "outputs": [], "source": [ - "#register dataset to workspace\n", - "dataset = dataset.register(workspace = ws,\n", - " name = 'mnist dataset',\n", - " description='training and test dataset',\n", - " create_new_version=True)" + "dataset_registered = False\n", + "try:\n", + " temp = Dataset.get_by_name(workspace = ws, name = 'mnist-dataset')\n", + " dataset_registered = True\n", + "except:\n", + " print(\"The dataset mnist-dataset is not registered in workspace yet.\")\n", + "\n", + "if not dataset_registered:\n", + " #register dataset to workspace\n", + " dataset = dataset.register(workspace = ws,\n", + " name = 'mnist-dataset',\n", + " description='training and test dataset',\n", + " create_new_version=True)" ] }, { diff --git a/how-to-use-azureml/track-and-monitor-experiments/logging-api/logging-api.ipynb b/how-to-use-azureml/track-and-monitor-experiments/logging-api/logging-api.ipynb index e1a16f212..fbc0d5065 100644 --- a/how-to-use-azureml/track-and-monitor-experiments/logging-api/logging-api.ipynb +++ b/how-to-use-azureml/track-and-monitor-experiments/logging-api/logging-api.ipynb @@ -100,7 +100,7 @@ "\n", "# Check core SDK version number\n", "\n", - "print(\"This notebook was created using SDK version 1.2.0, you are currently running version\", azureml.core.VERSION)" + "print(\"This notebook was created using SDK version 1.3.0, you are currently running version\", azureml.core.VERSION)" ] }, { diff --git a/how-to-use-azureml/training-with-deep-learning/train-hyperparameter-tune-deploy-with-keras/train-hyperparameter-tune-deploy-with-keras.ipynb b/how-to-use-azureml/training-with-deep-learning/train-hyperparameter-tune-deploy-with-keras/train-hyperparameter-tune-deploy-with-keras.ipynb index a8e3bdfa4..ed1519f29 100644 --- a/how-to-use-azureml/training-with-deep-learning/train-hyperparameter-tune-deploy-with-keras/train-hyperparameter-tune-deploy-with-keras.ipynb +++ b/how-to-use-azureml/training-with-deep-learning/train-hyperparameter-tune-deploy-with-keras/train-hyperparameter-tune-deploy-with-keras.ipynb @@ -243,7 +243,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Use the `register()` method to register datasets to your workspace so they can be shared with others, reused across various experiments, and referred to by name in your training script." + "Use the `register()` method to register datasets to your workspace so they can be shared with others, reused across various experiments, and referred to by name in your training script.\n", + "You can try get the dataset first to see if it's already registered." 
] }, { @@ -252,10 +253,18 @@ "metadata": {}, "outputs": [], "source": [ - "dataset = dataset.register(workspace = ws,\n", - " name = 'mnist dataset',\n", - " description='training and test dataset',\n", - " create_new_version=True)" + "dataset_registered = False\n", + "try:\n", + " temp = Dataset.get_by_name(workspace = ws, name = 'mnist-dataset')\n", + " dataset_registered = True\n", + "except:\n", + " print(\"The dataset mnist-dataset is not registered in workspace yet.\")\n", + "\n", + "if not dataset_registered:\n", + " dataset = dataset.register(workspace = ws,\n", + " name = 'mnist-dataset',\n", + " description='training and test dataset',\n", + " create_new_version=True)" ] }, { @@ -413,7 +422,7 @@ "metadata": {}, "outputs": [], "source": [ - "dataset = Dataset.get_by_name(ws, 'mnist dataset')\n", + "dataset = Dataset.get_by_name(ws, 'mnist-dataset')\n", "\n", "# list the files referenced by mnist dataset\n", "dataset.to_path()" @@ -801,6 +810,15 @@ "hdr.wait_for_completion(show_output=True)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert(hdr.get_status() == \"Completed\")" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/how-to-use-azureml/training/train-in-spark/train-in-spark.ipynb b/how-to-use-azureml/training/train-in-spark/train-in-spark.ipynb index 7fc51604d..3cacab4b9 100644 --- a/how-to-use-azureml/training/train-in-spark/train-in-spark.ipynb +++ b/how-to-use-azureml/training/train-in-spark/train-in-spark.ipynb @@ -286,7 +286,7 @@ "metadata": { "authors": [ { - "name": "aashishb" + "name": "sanpil" } ], "category": "training", diff --git a/index.md b/index.md index 9a180c610..061dfb381 100644 --- a/index.md +++ b/index.md @@ -31,7 +31,7 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an | [Forecasting away from training data](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/forecasting-high-frequency/auto-ml-forecasting-function.ipynb) | Forecasting | None | Remote | None | Azure ML AutoML | Forecasting, Confidence Intervals | | [Automated ML run with basic edition features.](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/classification-bank-marketing-all-features/auto-ml-classification-bank-marketing-all-features.ipynb) | Classification | Bankmarketing | AML | ACI | None | featurization, explainability, remote_run, AutomatedML | | [Classification of credit card fraudulent transactions using Automated ML](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/classification-credit-card-fraud/auto-ml-classification-credit-card-fraud.ipynb) | Classification | Creditcard | AML Compute | None | None | remote_run, AutomatedML | -| [Automated ML run with featurization and model explainability.](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/regression-hardware-performance-explanation-and-featurization/auto-ml-regression-hardware-performance-explanation-and-featurization.ipynb) | Regression | MachineData | AML | ACI | None | featurization, explainability, remote_run, AutomatedML | +| [Automated ML run with featurization and model 
explainability.](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/automated-machine-learning/regression-explanation-featurization/auto-ml-regression-explanation-featurization.ipynb) | Regression | MachineData | AML | ACI | None | featurization, explainability, remote_run, AutomatedML | | :star:[Azure Machine Learning Pipeline with DataTranferStep](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-data-transfer.ipynb) | Demonstrates the use of DataTranferStep | Custom | ADF | None | Azure ML | None | | [Getting Started with Azure Machine Learning Pipelines](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-getting-started.ipynb) | Getting Started notebook for ANML Pipelines | Custom | AML Compute | None | Azure ML | None | | [Azure Machine Learning Pipeline with AzureBatchStep](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-how-to-use-azurebatch-to-run-a-windows-executable.ipynb) | Demonstrates the use of AzureBatchStep | Custom | Azure Batch | None | Azure ML | None | @@ -58,6 +58,7 @@ Machine Learning notebook samples and encourage efficient retrieval of topics an | [Training with hyperparameter tuning using PyTorch](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/pytorch/deployment/train-hyperparameter-tune-deploy-with-pytorch/train-hyperparameter-tune-deploy-with-pytorch.ipynb) | Train an image classification model using transfer learning with the PyTorch estimator | ImageNet | AML Compute | Azure Container Instance | PyTorch | None | | [Distributed PyTorch](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/pytorch/training/distributed-pytorch-with-horovod/distributed-pytorch-with-horovod.ipynb) | Train a model using the distributed training via Horovod | MNIST | AML Compute | None | PyTorch | None | | [Distributed training with PyTorch](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/pytorch/training/distributed-pytorch-with-nccl-gloo/distributed-pytorch-with-nccl-gloo.ipynb) | Train a model using distributed training via Nccl/Gloo | MNIST | AML Compute | None | PyTorch | None | +| [PyTorch object detection](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/pytorch/training/mask-rcnn-object-detection/pytorch-mask-rcnn.ipynb) | Fine-tune PyTorch object detection model with a custom dockerfile | Custom | AML Compute | None | PyTorch | remote run, docker | | [Training and hyperparameter tuning with Scikit-learn](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/scikit-learn/training/train-hyperparameter-tune-deploy-with-sklearn/train-hyperparameter-tune-deploy-with-sklearn.ipynb) | Train a support vector machine (SVM) to perform classification | Iris | AML Compute | None | Scikit-learn | None | | [Training and hyperparameter tuning using the TensorFlow estimator](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/deployment/train-hyperparameter-tune-deploy-with-tensorflow/train-hyperparameter-tune-deploy-with-tensorflow.ipynb) | Train a deep neural network | MNIST | AML Compute | Azure Container 
Instance | TensorFlow | None | | [Distributed training using TensorFlow with Horovod](https://github.com/Azure/MachineLearningNotebooks/blob/master//how-to-use-azureml/ml-frameworks/tensorflow/training/distributed-tensorflow-with-horovod/distributed-tensorflow-with-horovod.ipynb) | Use the TensorFlow estimator to train a word2vec model | None | AML Compute | None | TensorFlow | None | diff --git a/setup-environment/configuration.ipynb b/setup-environment/configuration.ipynb index 7a7312c6d..9ce0a661d 100644 --- a/setup-environment/configuration.ipynb +++ b/setup-environment/configuration.ipynb @@ -102,7 +102,7 @@ "source": [ "import azureml.core\n", "\n", - "print(\"This notebook was created using version 1.2.0 of the Azure ML SDK\")\n", + "print(\"This notebook was created using version 1.3.0 of the Azure ML SDK\")\n", "print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")" ] }, diff --git a/tutorials/create-first-ml-experiment/imgs/experiment_main.png b/tutorials/create-first-ml-experiment/imgs/experiment_main.png index bb3e51af7..2419855bb 100644 Binary files a/tutorials/create-first-ml-experiment/imgs/experiment_main.png and b/tutorials/create-first-ml-experiment/imgs/experiment_main.png differ diff --git a/tutorials/create-first-ml-experiment/imgs/model_download.png b/tutorials/create-first-ml-experiment/imgs/model_download.png index e07fc1db6..adcdf70ec 100644 Binary files a/tutorials/create-first-ml-experiment/imgs/model_download.png and b/tutorials/create-first-ml-experiment/imgs/model_download.png differ diff --git a/tutorials/create-first-ml-experiment/tutorial-1st-experiment-sdk-train.ipynb b/tutorials/create-first-ml-experiment/tutorial-1st-experiment-sdk-train.ipynb index 167a935d8..09ac2ca5b 100644 --- a/tutorials/create-first-ml-experiment/tutorial-1st-experiment-sdk-train.ipynb +++ b/tutorials/create-first-ml-experiment/tutorial-1st-experiment-sdk-train.ipynb @@ -31,7 +31,7 @@ "\n", "> * Connect your workspace and create an experiment \n", "> * Load data and train a scikit-learn model\n", - "> * View training results in the portal\n", + "> * View training results in the studio\n", "> * Retrieve the best model" ] }, @@ -74,7 +74,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now create an experiment in your workspace. An experiment is another foundational cloud resource that represents a collection of trials (individual model runs). In this tutorial you use the experiment to create runs and track your model training in the Azure Portal. Parameters include your workspace reference, and a string name for the experiment." + "Now create an experiment in your workspace. An experiment is another foundational cloud resource that represents a collection of trials (individual model runs). In this tutorial you use the experiment to create runs and track your model training in the Azure Machine Learning studio. Parameters include your workspace reference, and a string name for the experiment." ] }, { @@ -171,7 +171,7 @@ "\n", "1. For each alpha hyperparameter value in the `alphas` array, a new run is created within the experiment. The alpha value is logged to differentiate between each run.\n", "1. In each run, a Ridge model is instantiated, trained, and used to run predictions. The root-mean-squared-error is calculated for the actual versus predicted values, and then logged to the run. At this point the run has metadata attached for both the alpha value and the rmse accuracy.\n", - "1. 
Next, the model for each run is serialized and uploaded to the run. This allows you to download the model file from the run in the portal.\n", + "1. Next, the model for each run is serialized and uploaded to the run. This allows you to download the model file from the run in the studio.\n", "1. At the end of each iteration the run is completed by calling `run.complete()`.\n", "\n" ] @@ -180,7 +180,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "After the training has completed, call the `experiment` variable to fetch a link to the experiment in the portal." + "After the training has completed, call the `experiment` variable to fetch a link to the experiment in the studio." ] }, { @@ -196,14 +196,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## View training results in portal" + "## View training results in studio" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Following the **Link to Azure Portal** takes you to the main experiment page. Here you see all the individual runs in the experiment. Any custom-logged values (`alpha_value` and `rmse`, in this case) become fields for each run, and also become available for the charts and tiles at the top of the experiment page. To add a logged metric to a chart or tile, hover over it, click the edit button, and find your custom-logged metric.\n", + "Following the **Link to Azure Machine Learning studio** takes you to the main experiment page. Here you see all the individual runs in the experiment. Any custom-logged values (`alpha_value` and `rmse`, in this case) become fields for each run, and also become available for the charts and tiles at the top of the experiment page. To add a logged metric to a chart or tile, hover over it, click the edit button, and find your custom-logged metric.\n", "\n", "When training models at scale over hundreds and thousands of runs, this page makes it easy to see every model you trained, specifically how they were trained, and how your unique metrics have changed over time." ] @@ -212,21 +212,21 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "![Main Experiment page in Portal](imgs/experiment_main.png)" + "![Main Experiment page in the studio](../imgs/experiment_main.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Clicking on a run number link in the `RUN NUMBER` column takes you to the page for each individual run. The default tab **Details** shows you more-detailed information on each run. Navigate to the **Outputs** tab, and you see the `.pkl` file for the model that was uploaded to the run during each training iteration. Here you can download the model file, rather than having to retrain it manually." + "Select a run number link in the `RUN NUMBER` column to see the page for an individual run. The default tab **Details** shows you more-detailed information on each run. Navigate to the **Outputs + logs** tab, and you see the `.pkl` file for the model that was uploaded to the run during each training iteration. Here you can download the model file, rather than having to retrain it manually." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "![Run details page in Portal](imgs/model_download.png)" + "![Run details page in the studio](../imgs/model_download.png)" ] }, { @@ -240,7 +240,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In addition to being able to download model files from the experiment in the portal, you can also download them programmatically. 
The following code iterates through each run in the experiment, and accesses both the logged run metrics and the run details (which contains the run_id). This keeps track of the best run, in this case the run with the lowest root-mean-squared-error." + "In addition to being able to download model files from the experiment in the studio, you can also download them programmatically. The following code iterates through each run in the experiment, and accesses both the logged run metrics and the run details (which contains the run_id). This keeps track of the best run, in this case the run with the lowest root-mean-squared-error." ] }, { @@ -352,7 +352,7 @@ "\n", "> * Connected your workspace and created an experiment\n", "> * Loaded data and trained scikit-learn models\n", - "> * Viewed training results in the portal and retrieved models\n", + "> * Viewed training results in the studio and retrieved models\n", "\n", "[Deploy your model](https://docs.microsoft.com/azure/machine-learning/service/tutorial-deploy-models-with-aml) with Azure Machine Learning.\n", "Learn how to develop [automated machine learning](https://docs.microsoft.com/azure/machine-learning/service/tutorial-auto-train-models) experiments." diff --git a/tutorials/image-classification-mnist-data/img-classification-part2-deploy.ipynb b/tutorials/image-classification-mnist-data/img-classification-part2-deploy.ipynb index de8090f39..f1d54c4c2 100644 --- a/tutorials/image-classification-mnist-data/img-classification-part2-deploy.ipynb +++ b/tutorials/image-classification-mnist-data/img-classification-part2-deploy.ipynb @@ -39,11 +39,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "tags": [ - "register model from file" - ] - }, + "metadata": {}, "outputs": [], "source": [ "# If you did NOT complete the tutorial, you can instead run this cell \n", @@ -62,7 +58,19 @@ " model_name=model_name,\n", " tags={\"data\": \"mnist\", \"model\": \"classification\"},\n", " description=\"Mnist handwriting recognition\",\n", - " workspace=ws)" + " workspace=ws)\n", + "\n", + "from azureml.core.environment import Environment\n", + "from azureml.core.conda_dependencies import CondaDependencies\n", + "\n", + "# to install required packages\n", + "env = Environment('tutorial-env')\n", + "cd = CondaDependencies.create(pip_packages=['azureml-dataprep[pandas,fuse]>=1.1.14', 'azureml-defaults'], conda_packages = ['scikit-learn==0.22.1'])\n", + "\n", + "env.python.conda_dependencies = cd\n", + "\n", + "# Register environment to re-use later\n", + "env.register(workspace = ws)" ] }, { @@ -98,190 +106,16 @@ "print(\"Azure ML SDK Version: \", azureml.core.VERSION)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Retrieve the model\n", - "\n", - "You registered a model in your workspace in the previous tutorial. Now, load this workspace and download the model to your local directory." 
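The tutorial paragraph above describes looping over an experiment's runs and keeping the one with the lowest root-mean-squared-error. A condensed sketch of that loop, assuming each run logged an `rmse` metric and uploaded a `.pkl` model file, and with the experiment name purely illustrative:

    # Condensed sketch: pick the run with the lowest logged 'rmse' and download
    # its uploaded model file. The experiment name here is illustrative.
    from azureml.core import Experiment, Workspace

    ws = Workspace.from_config()
    experiment = Experiment(workspace=ws, name='diabetes-experiment')

    best_run, best_rmse = None, None
    for run in experiment.get_runs():
        metrics = run.get_metrics()
        if 'rmse' in metrics and (best_rmse is None or metrics['rmse'] < best_rmse):
            best_run, best_rmse = run, metrics['rmse']

    if best_run is not None:
        print("Best run:", best_run.id, "rmse:", best_rmse)
        # Grab the first .pkl file attached to the best run.
        pkl_files = [f for f in best_run.get_file_names() if f.endswith('.pkl')]
        if pkl_files:
            best_run.download_file(name=pkl_files[0], output_file_path='best_model.pkl')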
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "load workspace", - "download model" - ] - }, - "outputs": [], - "source": [ - "from azureml.core import Workspace\n", - "from azureml.core.model import Model\n", - "import os \n", - "ws = Workspace.from_config()\n", - "model=Model(ws, 'sklearn_mnist')\n", - "\n", - "model.download(target_dir=os.getcwd(), exist_ok=True)\n", - "\n", - "# verify the downloaded model file\n", - "file_path = os.path.join(os.getcwd(), \"sklearn_mnist_model.pkl\")\n", - "\n", - "os.stat(file_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test model locally\n", - "\n", - "Before deploying, make sure your model is working locally by:\n", - "* Downloading the test data if you haven't already\n", - "* Loading test data\n", - "* Predicting test data\n", - "* Examining the confusion matrix" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Download test data\n", - "If you haven't already, download the test data to the **./data/** directory" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core import Dataset\n", - "from azureml.opendatasets import MNIST\n", - "\n", - "data_folder = os.path.join(os.getcwd(), 'data')\n", - "os.makedirs(data_folder, exist_ok=True)\n", - "\n", - "mnist_file_dataset = MNIST.get_file_dataset()\n", - "mnist_file_dataset.download(data_folder, overwrite=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load test data\n", - "\n", - "Load the test data from the **./data/** directory created during the training tutorial." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from utils import load_data\n", - "import os\n", - "\n", - "data_folder = os.path.join(os.getcwd(), 'data')\n", - "# note we also shrink the intensity values (X) from 0-255 to 0-1. This helps the neural network converge faster\n", - "X_test = load_data(os.path.join(data_folder, 't10k-images-idx3-ubyte.gz'), False) / 255.0\n", - "y_test = load_data(os.path.join(data_folder, 't10k-labels-idx1-ubyte.gz'), True).reshape(-1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Predict test data\n", - "\n", - "Feed the test dataset to the model to get predictions." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pickle\n", - "import joblib\n", - "\n", - "clf = joblib.load( os.path.join(os.getcwd(), 'sklearn_mnist_model.pkl'))\n", - "y_hat = clf.predict(X_test)\n", - "print(y_hat)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Examine the confusion matrix\n", - "\n", - "Generate a confusion matrix to see how many samples from the test set are classified correctly. Notice the mis-classified value for the incorrect predictions." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.metrics import confusion_matrix\n", - "\n", - "conf_mx = confusion_matrix(y_test, y_hat)\n", - "print(conf_mx)\n", - "print('Overall accuracy:', np.average(y_hat == y_test))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Use `matplotlib` to display the confusion matrix as a graph. In this graph, the X axis represents the actual values, and the Y axis represents the predicted values. 
The color in each grid represents the error rate. The lighter the color, the higher the error rate is. For example, many 5's are mis-classified as 3's. Hence you see a bright grid at (5,3)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# normalize the diagonal cells so that they don't overpower the rest of the cells when visualized\n", - "row_sums = conf_mx.sum(axis=1, keepdims=True)\n", - "norm_conf_mx = conf_mx / row_sums\n", - "np.fill_diagonal(norm_conf_mx, 0)\n", - "\n", - "fig = plt.figure(figsize=(8,5))\n", - "ax = fig.add_subplot(111)\n", - "cax = ax.matshow(norm_conf_mx, cmap=plt.cm.bone)\n", - "ticks = np.arange(0, 10, 1)\n", - "ax.set_xticks(ticks)\n", - "ax.set_yticks(ticks)\n", - "ax.set_xticklabels(ticks)\n", - "ax.set_yticklabels(ticks)\n", - "fig.colorbar(cax)\n", - "plt.ylabel('true labels', fontsize=14)\n", - "plt.xlabel('predicted values', fontsize=14)\n", - "plt.savefig('conf.png')\n", - "plt.show()" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Deploy as web service\n", "\n", - "Once you've tested the model and are satisfied with the results, deploy the model as a web service hosted in ACI. \n", + "Deploy the model as a web service hosted in ACI. \n", "\n", "To build the correct environment for ACI, provide the following:\n", "* A scoring script to show how to use the model\n", - "* An environment file to show what packages need to be installed\n", "* A configuration file to build the ACI\n", "* The model you trained before\n", "\n", @@ -324,52 +158,6 @@ " return y_hat.tolist()" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create environment file\n", - "\n", - "Next, create an environment file, called myenv.yml, that specifies all of the script's package dependencies. This file is used to ensure that all of those dependencies are installed in the Docker image. This model needs `scikit-learn` and `azureml-sdk`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "set conda dependencies" - ] - }, - "outputs": [], - "source": [ - "from azureml.core.conda_dependencies import CondaDependencies \n", - "\n", - "myenv = CondaDependencies()\n", - "myenv.add_conda_package(\"scikit-learn==0.22.1\")\n", - "myenv.add_pip_package(\"azureml-defaults\")\n", - "\n", - "with open(\"myenv.yml\",\"w\") as f:\n", - " f.write(myenv.serialize_to_string())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Review the content of the `myenv.yml` file." 
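The deployment checklist above asks for a scoring script, but the diff only shows its final line (`return y_hat.tolist()`). For context, a typical `init()`/`run()` entry script for this model has roughly the following shape; this is a sketch, and the tutorial's actual score.py may differ in detail:

    # Sketch of a typical init()/run() scoring script (score.py) for this model.
    import json

    import joblib
    import numpy as np
    from azureml.core.model import Model


    def init():
        global model
        # Resolve the path of the registered model inside the service container.
        model_path = Model.get_model_path('sklearn_mnist')
        model = joblib.load(model_path)


    def run(raw_data):
        # Deserialize the incoming JSON payload and return predictions as a list.
        data = np.array(json.loads(raw_data)['data'])
        y_hat = model.predict(data)
        return y_hat.tolist()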
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open(\"myenv.yml\",\"r\") as f:\n", - " print(f.read())" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -432,6 +220,11 @@ "from azureml.core.webservice import Webservice\n", "from azureml.core.model import InferenceConfig\n", "from azureml.core.environment import Environment\n", + "from azureml.core import Workspace\n", + "from azureml.core.model import Model\n", + "\n", + "ws = Workspace.from_config()\n", + "model = Model(ws, 'sklearn_mnist')\n", "\n", "\n", "myenv = Environment.get(workspace=ws, name=\"tutorial-env\", version=\"1\")\n", @@ -470,14 +263,148 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Test deployed service\n", + "## Test the model\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Download test data\n", + "Download the test data to the **./data/** directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from azureml.core import Dataset\n", + "from azureml.opendatasets import MNIST\n", + "\n", + "data_folder = os.path.join(os.getcwd(), 'data')\n", + "os.makedirs(data_folder, exist_ok=True)\n", + "\n", + "mnist_file_dataset = MNIST.get_file_dataset()\n", + "mnist_file_dataset.download(data_folder, overwrite=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load test data\n", + "\n", + "Load the test data from the **./data/** directory created during the training tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from utils import load_data\n", + "import os\n", + "\n", + "data_folder = os.path.join(os.getcwd(), 'data')\n", + "# note we also shrink the intensity values (X) from 0-255 to 0-1. This helps the neural network converge faster\n", + "X_test = load_data(os.path.join(data_folder, 't10k-images-idx3-ubyte.gz'), False) / 255.0\n", + "y_test = load_data(os.path.join(data_folder, 't10k-labels-idx1-ubyte.gz'), True).reshape(-1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Predict test data\n", + "\n", + "Feed the test dataset to the model to get predictions.\n", "\n", - "Earlier you scored all the test data with the local version of the model. Now, you can test the deployed model with a random sample of 30 images from the test data. \n", "\n", "The following code goes through these steps:\n", "1. Send the data as a JSON array to the web service hosted in ACI. \n", "\n", - "1. Use the SDK's `run` API to invoke the service. You can also make raw calls using any HTTP tool such as curl.\n", + "1. Use the SDK's `run` API to invoke the service. You can also make raw calls using any HTTP tool such as curl." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "test = json.dumps({\"data\": X_test.tolist()})\n", + "test = bytes(test, encoding='utf8')\n", + "y_hat = service.run(input_data=test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Examine the confusion matrix\n", + "\n", + "Generate a confusion matrix to see how many samples from the test set are classified correctly. Notice the mis-classified value for the incorrect predictions." 
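The step list above notes that the service can also be called with any HTTP tool rather than the SDK's `run` API. A sketch of that raw call using `requests`, assuming `service` and `X_test` from the earlier cells and no key authentication on the ACI endpoint:

    # Sketch of calling the scoring endpoint over raw HTTP with `requests`,
    # assuming `service` and `X_test` exist and key auth is disabled (the
    # tutorial's default for ACI).
    import json

    import requests

    headers = {'Content-Type': 'application/json'}
    payload = json.dumps({"data": X_test[:5].tolist()})  # score a small sample

    resp = requests.post(service.scoring_uri, data=payload, headers=headers)
    print(resp.status_code, resp.json())

If key authentication were enabled, the request would also need an `Authorization: Bearer <key>` header, with the key retrieved via `service.get_keys()`.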
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import confusion_matrix\n", + "\n", + "conf_mx = confusion_matrix(y_test, y_hat)\n", + "print(conf_mx)\n", + "print('Overall accuracy:', np.average(y_hat == y_test))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use `matplotlib` to display the confusion matrix as a graph. In this graph, the X axis represents the actual values, and the Y axis represents the predicted values. The color in each grid represents the error rate. The lighter the color, the higher the error rate is. For example, many 5's are mis-classified as 3's. Hence you see a bright grid at (5,3)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# normalize the diagonal cells so that they don't overpower the rest of the cells when visualized\n", + "row_sums = conf_mx.sum(axis=1, keepdims=True)\n", + "norm_conf_mx = conf_mx / row_sums\n", + "np.fill_diagonal(norm_conf_mx, 0)\n", + "\n", + "fig = plt.figure(figsize=(8,5))\n", + "ax = fig.add_subplot(111)\n", + "cax = ax.matshow(norm_conf_mx, cmap=plt.cm.bone)\n", + "ticks = np.arange(0, 10, 1)\n", + "ax.set_xticks(ticks)\n", + "ax.set_yticks(ticks)\n", + "ax.set_xticklabels(ticks)\n", + "ax.set_yticklabels(ticks)\n", + "fig.colorbar(cax)\n", + "plt.ylabel('true labels', fontsize=14)\n", + "plt.xlabel('predicted values', fontsize=14)\n", + "plt.savefig('conf.png')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Show predictions\n", + "\n", + "Test the deployed model with a random sample of 30 images from the test data. \n", + "\n", "\n", "1. Print the returned predictions and plot them along with the input images. Red font and inverse image (white on black) is used to highlight the misclassified samples. \n", "\n",
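The final cells described above score a random sample of 30 test images and highlight the misclassified ones in the plot. A text-only sketch of the same check, assuming `service`, `X_test`, and `y_test` from the earlier cells, prints the mismatches instead of plotting them:

    # Text-only sketch: score 30 random test images via the deployed service and
    # flag the misclassified ones (assumes `service`, `X_test`, `y_test`).
    import json

    import numpy as np

    n = 30
    sample_idx = np.random.permutation(X_test.shape[0])[:n]

    payload = bytes(json.dumps({"data": X_test[sample_idx].tolist()}), encoding='utf8')
    y_pred = np.array(service.run(input_data=payload))

    for i, idx in enumerate(sample_idx):
        flag = "" if y_pred[i] == y_test[idx] else "  <-- misclassified"
        print("sample", idx, "predicted", y_pred[i], "actual", y_test[idx], flag)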