From 2b3ac0a589ab595b8d3cc72cf5a6077cbcf0785c Mon Sep 17 00:00:00 2001 From: Samuel Kemp Date: Tue, 18 Apr 2023 17:23:28 +0100 Subject: [PATCH] delta example and up-version mltable (#2205) --- ...-delta-lake-example-delta-lake-example.yml | 75 +++++ sdk/python/README.md | 1 + sdk/python/using-mltable/README.md | 1 + .../job-env/conda_dependencies.yml | 4 +- .../delta-lake-example.ipynb | 290 ++++++++++++++++++ .../job-env/conda_dependencies.yml | 6 + .../src/train.py | 0 .../job-env/conda_dependencies.yml | 4 +- .../using-mltable/mltable-requirements.txt | 4 +- .../quickstart/job-env/conda_dependencies.yml | 4 +- 10 files changed, 381 insertions(+), 8 deletions(-) create mode 100644 .github/workflows/sdk-using-mltable-delta-lake-example-delta-lake-example.yml create mode 100644 sdk/python/using-mltable/delta-lake-example/delta-lake-example.ipynb create mode 100644 sdk/python/using-mltable/delta-lake-example/job-env/conda_dependencies.yml rename sdk/python/using-mltable/{local-to-cloud/job-env => delta-lake-example}/src/train.py (100%) diff --git a/.github/workflows/sdk-using-mltable-delta-lake-example-delta-lake-example.yml b/.github/workflows/sdk-using-mltable-delta-lake-example-delta-lake-example.yml new file mode 100644 index 0000000000..c5e8a19488 --- /dev/null +++ b/.github/workflows/sdk-using-mltable-delta-lake-example-delta-lake-example.yml @@ -0,0 +1,75 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. + +name: sdk-using-mltable-delta-lake-example-delta-lake-example +# This file is created by sdk/python/readme.py. +# Please do not edit directly. 
+on: + workflow_dispatch: + schedule: + - cron: "23 8/12 * * *" + pull_request: + branches: + - main + paths: + - sdk/python/using-mltable/delta-lake-example/** + - .github/workflows/sdk-using-mltable-delta-lake-example-delta-lake-example.yml + - sdk/python/dev-requirements.txt + - infra/** + - sdk/python/setup.sh +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: check out repo + uses: actions/checkout@v2 + - name: setup python + uses: actions/setup-python@v2 + with: + python-version: "3.8" + - name: pip install notebook reqs + run: pip install -r sdk/python/dev-requirements.txt + - name: azure login + uses: azure/login@v1 + with: + creds: ${{secrets.AZUREML_CREDENTIALS}} + - name: bootstrap resources + run: | + echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'; + bash bootstrap.sh + working-directory: infra + continue-on-error: false + - name: setup SDK + run: | + source "${{ github.workspace }}/infra/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/init_environment.sh"; + bash setup.sh + working-directory: sdk/python + continue-on-error: true + - name: setup-cli + run: | + source "${{ github.workspace }}/infra/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/init_environment.sh"; + bash setup.sh + working-directory: cli + continue-on-error: true + - name: run using-mltable/delta-lake-example/delta-lake-example.ipynb + run: | + source "${{ github.workspace }}/infra/sdk_helpers.sh"; + source "${{ github.workspace }}/infra/init_environment.sh"; + bash "${{ github.workspace }}/infra/sdk_helpers.sh" generate_workspace_config "../../.azureml/config.json"; + bash "${{ github.workspace }}/infra/sdk_helpers.sh" replace_template_values "delta-lake-example.ipynb"; + [ -f "../../.azureml/config" ] && cat "../../.azureml/config"; + papermill -k python delta-lake-example.ipynb 
delta-lake-example.output.ipynb + working-directory: sdk/python/using-mltable/delta-lake-example + - name: upload notebook's working folder as an artifact + if: ${{ always() }} + uses: actions/upload-artifact@v2 + with: + name: delta-lake-example + path: sdk/python/using-mltable/delta-lake-example diff --git a/sdk/python/README.md b/sdk/python/README.md index 419209f2e1..805aec46c4 100644 --- a/sdk/python/README.md +++ b/sdk/python/README.md @@ -166,6 +166,7 @@ Test Status is for branch - **_main_** |using-mlflow|train-and-log|[xgboost_service_principal](using-mlflow/train-and-log/xgboost_service_principal.ipynb)|*no description* - _This sample is excluded from automated tests_|[![xgboost_service_principal](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mlflow-train-and-log-xgboost_service_principal.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mlflow-train-and-log-xgboost_service_principal.yml)| |using-mlflow|using-rest-api|[using_mlflow_rest_api](using-mlflow/using-rest-api/using_mlflow_rest_api.ipynb)|*no description* - _This sample is excluded from automated tests_|[![using_mlflow_rest_api](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mlflow-using-rest-api-using_mlflow_rest_api.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mlflow-using-rest-api-using_mlflow_rest_api.yml)| |using-mltable|delimited-files-example|[delimited-files-example](using-mltable/delimited-files-example/delimited-files-example.ipynb)|*no description*|[![delimited-files-example](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-delimited-files-example-delimited-files-example.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-delimited-files-example-delimited-files-example.yml)| 
+|using-mltable|delta-lake-example|[delta-lake-example](using-mltable/delta-lake-example/delta-lake-example.ipynb)|*no description*|[![delta-lake-example](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-delta-lake-example-delta-lake-example.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-delta-lake-example-delta-lake-example.yml)| |using-mltable|from-paths-example|[from-paths-example](using-mltable/from-paths-example/from-paths-example.ipynb)|*no description*|[![from-paths-example](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-from-paths-example-from-paths-example.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-from-paths-example-from-paths-example.yml)| |using-mltable|local-to-cloud|[mltable-local-to-cloud](using-mltable/local-to-cloud/mltable-local-to-cloud.ipynb)|*no description*|[![mltable-local-to-cloud](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-local-to-cloud-mltable-local-to-cloud.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-local-to-cloud-mltable-local-to-cloud.yml)| |using-mltable|quickstart|[mltable-quickstart](using-mltable/quickstart/mltable-quickstart.ipynb)|*no description*|[![mltable-quickstart](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-quickstart-mltable-quickstart.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-using-mltable-quickstart-mltable-quickstart.yml)| diff --git a/sdk/python/using-mltable/README.md b/sdk/python/using-mltable/README.md index 8250957ed1..327898c07b 100644 --- a/sdk/python/using-mltable/README.md +++ b/sdk/python/using-mltable/README.md @@ -21,4 +21,5 @@ For more information on Azure ML Tables, read [Working with tables in Azure ML]( | [Azure ML Tables 
Quickstart](./quickstart/mltable-quickstart.ipynb) | *Demonstrates an end-to-end example of using MLTable, including asset creation, loading into both interactive sessions and jobs. The data is in parquet format.* | | [Azure ML Tables Local-to-Cloud](./local-to-cloud/mltable-local-to-cloud.ipynb) | *Demonstrates how to work with data and tables locally and upload to the cloud as a data asset for improved sharing and reproducibility.* | | [Create an Azure ML Table from Delimited Text Files (CSV)](./delimited-files-example/delimited-files-example.ipynb) | *Demonstrates creating an MLTable from delimited files (CSV).* | +| [Create an Azure ML Table from a Delta Lake table](./delta-lake-example/delta-lake-example.ipynb) | *Demonstrates creating an MLTable from a Delta Lake table on Azure storage.* | | [Create an Azure ML Table of paths](./from-paths-example/from-paths-example.ipynb) | *Demonstrates creating a Table of paths on cloud storage that can then be streamed into a Python session.* | diff --git a/sdk/python/using-mltable/delimited-files-example/job-env/conda_dependencies.yml b/sdk/python/using-mltable/delimited-files-example/job-env/conda_dependencies.yml index b765e79061..2f335badb8 100644 --- a/sdk/python/using-mltable/delimited-files-example/job-env/conda_dependencies.yml +++ b/sdk/python/using-mltable/delimited-files-example/job-env/conda_dependencies.yml @@ -2,5 +2,5 @@ dependencies: - python=3.10 - pip=21.2.4 - pip: - - mltable==1.2.0 - - azureml-dataprep[pandas]==4.9.5 \ No newline at end of file + - mltable==1.3.0 + - azureml-dataprep[pandas]==4.10.6 \ No newline at end of file diff --git a/sdk/python/using-mltable/delta-lake-example/delta-lake-example.ipynb b/sdk/python/using-mltable/delta-lake-example/delta-lake-example.ipynb new file mode 100644 index 0000000000..e0bde714e8 --- /dev/null +++ b/sdk/python/using-mltable/delta-lake-example/delta-lake-example.ipynb @@ -0,0 +1,290 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "# Create a Table from Delta Lake\n", + "\n", + "In this example notebook you will create an AzureML Table from a Delta Table." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 📦 Install dependencies\n", + "\n", + "Ensure you have the latest MLTable library and dependencies." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -r ../mltable-requirements.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 🐍 Create an MLTable using the Python SDK\n", + "\n", + "Here you build your data loading steps using the `mltable` Python SDK. The `show()` method allows you to see the effect of the data loading transformation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mltable\n", + "\n", + "# create paths to the data files\n", + "delta_table_uri = \"wasbs://data@azuremlexampledata.blob.core.windows.net/COVID-19_NYT\"\n", + "\n", + "# create an MLTable from the data files\n", + "tbl = mltable.from_delta_lake(delta_table_uri, timestamp_as_of=\"2022-10-01T00:00:00Z\")\n", + "\n", + "# show the first 5 records\n", + "tbl.show(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 🐼 Load into a Pandas data frame\n", + "\n", + "You can load your Azure ML Table into Pandas using:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = tbl.to_pandas_dataframe()\n", + "df.head(5)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 💾 Save data loading steps \n", + "Next, you'll save all your data loading steps into an `MLTable` file. This allows you to *reproduce* your Pandas data frame at a later point in time without having to redefine the data loading steps in your code." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# save the data loading steps in an MLTable file\n", + "tbl.save(\"./covid\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 🔍 View the saved file\n", + "\n", + "In the next code cell, we show you the `MLTable` file so you can understand how the data loading steps are serialized into a file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"./covid/MLTable\", \"r\") as f:\n", + " print(f.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ♻️ Reproduce data loading steps\n", + "\n", + "Now that the data loading steps have been serialized into a file, you can reproduce them at any point in time using the `load()` method. This means you do not need to redefine your data loading steps in code and makes it easier to share with others." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mltable\n", + "\n", + "# load the previously saved MLTable file\n", + "tbl = mltable.load(\"./covid/\")\n", + "df = tbl.to_pandas_dataframe()\n", + "df.head(5)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 🤝 Create a data asset to aid sharing and reproducibility\n", + "\n", + "You'll now create a data asset, which will automatically upload the `MLTable` to cloud storage (the default AzureML datastore) so that others can use it easily." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "subscription_id = \"\"\n", + "resource_group = \"\"\n", + "workspace = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "from azure.ai.ml import MLClient\n", + "from azure.ai.ml.entities import Data\n", + "from azure.ai.ml.constants import AssetTypes\n", + "from azure.identity import DefaultAzureCredential\n", + "\n", + "# set the version number of the data asset to the current UTC time\n", + "VERSION = time.strftime(\"%Y.%m.%d.%H%M%S\", time.gmtime())\n", + "\n", + "# connect to the AzureML workspace\n", + "ml_client = MLClient(\n", + " DefaultAzureCredential(), subscription_id, resource_group, workspace\n", + ")\n", + "\n", + "my_data = Data(\n", + " path=\"./covid\",\n", + " type=AssetTypes.MLTABLE,\n", + " description=\"COVID-19 dataset.\",\n", + " name=\"covid-delta-example\",\n", + " version=VERSION,\n", + ")\n", + "\n", + "ml_client.data.create_or_update(my_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 📖 Read the data asset in an interactive session\n", + "\n", + "Now you have your MLTable stored in the cloud, you and your team members can access it using a friendly name in an interactive session (for example, a notebook)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mltable\n", + "from azure.ai.ml import MLClient\n", + "from azure.identity import DefaultAzureCredential\n", + "\n", + "# connect to the AzureML workspace\n", + "ml_client = MLClient(\n", + " DefaultAzureCredential(), subscription_id, resource_group, workspace\n", + ")\n", + "\n", + "# get the latest version of the data asset\n", + "# Note: The version was set in the previous code cell.\n", + "data_asset = ml_client.data.get(name=\"covid-delta-example\", version=VERSION)\n", + "\n", + "# create a table\n", + "tbl = mltable.load(f\"azureml:/{data_asset.id}\")\n", + "\n", + "# load into pandas\n", + "df = tbl.to_pandas_dataframe()\n", + "df.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 📖 Read the data asset in a job\n", + "\n", + "You can also access your Table in a job, using:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.ml import MLClient, command, Input\n", + "from azure.ai.ml.entities import Environment\n", + "from azure.identity import DefaultAzureCredential\n", + "\n", + "# connect to the AzureML workspace\n", + "ml_client = MLClient(\n", + " DefaultAzureCredential(), subscription_id, resource_group, workspace\n", + ")\n", + "\n", + "# get the latest version of the data asset\n", + "# Note: the VERSION was set in a previous cell.\n", + "data_asset = ml_client.data.get(name=\"covid-delta-example\", version=VERSION)\n", + "\n", + "job = command(\n", + " command=\"python train.py --input ${{inputs.titanic}}\",\n", + " inputs={\"titanic\": Input(type=\"mltable\", path=data_asset.id)},\n", + " compute=\"cpu-cluster\",\n", + " environment=Environment(\n", + " image=\"mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04\",\n", + " conda_file=\"./job-env/conda_dependencies.yml\",\n", + " ),\n", + " code=\"./src\",\n", + ")\n", 
+ "\n", + "ml_client.jobs.create_or_update(job)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10 - SDK V2", + "language": "python", + "name": "python310-sdkv2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/sdk/python/using-mltable/delta-lake-example/job-env/conda_dependencies.yml b/sdk/python/using-mltable/delta-lake-example/job-env/conda_dependencies.yml new file mode 100644 index 0000000000..2f335badb8 --- /dev/null +++ b/sdk/python/using-mltable/delta-lake-example/job-env/conda_dependencies.yml @@ -0,0 +1,6 @@ +dependencies: + - python=3.10 + - pip=21.2.4 + - pip: + - mltable==1.3.0 + - azureml-dataprep[pandas]==4.10.6 \ No newline at end of file diff --git a/sdk/python/using-mltable/local-to-cloud/job-env/src/train.py b/sdk/python/using-mltable/delta-lake-example/src/train.py similarity index 100% rename from sdk/python/using-mltable/local-to-cloud/job-env/src/train.py rename to sdk/python/using-mltable/delta-lake-example/src/train.py diff --git a/sdk/python/using-mltable/local-to-cloud/job-env/conda_dependencies.yml b/sdk/python/using-mltable/local-to-cloud/job-env/conda_dependencies.yml index b765e79061..2f335badb8 100644 --- a/sdk/python/using-mltable/local-to-cloud/job-env/conda_dependencies.yml +++ b/sdk/python/using-mltable/local-to-cloud/job-env/conda_dependencies.yml @@ -2,5 +2,5 @@ dependencies: - python=3.10 - pip=21.2.4 - pip: - - mltable==1.2.0 - - azureml-dataprep[pandas]==4.9.5 \ No newline at end of file + - mltable==1.3.0 + - azureml-dataprep[pandas]==4.10.6 \ No newline at end of file diff --git a/sdk/python/using-mltable/mltable-requirements.txt b/sdk/python/using-mltable/mltable-requirements.txt index 918dfdb799..5f9af1e4b9 100644 --- 
a/sdk/python/using-mltable/mltable-requirements.txt +++ b/sdk/python/using-mltable/mltable-requirements.txt @@ -1,2 +1,2 @@ -mltable==1.2.0 -azureml-dataprep[pandas]==4.9.5 \ No newline at end of file +mltable==1.3.0 +azureml-dataprep[pandas]==4.10.6 \ No newline at end of file diff --git a/sdk/python/using-mltable/quickstart/job-env/conda_dependencies.yml b/sdk/python/using-mltable/quickstart/job-env/conda_dependencies.yml index 58f40dd2e9..2f335badb8 100644 --- a/sdk/python/using-mltable/quickstart/job-env/conda_dependencies.yml +++ b/sdk/python/using-mltable/quickstart/job-env/conda_dependencies.yml @@ -2,5 +2,5 @@ dependencies: - python=3.10 - pip=21.2.4 - pip: - - mltable==1.1.0 - - azureml-dataprep[pandas] \ No newline at end of file + - mltable==1.3.0 + - azureml-dataprep[pandas]==4.10.6 \ No newline at end of file