diff --git a/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk_parallel/crack_and_chunk_with_doc_intel_parallel_component.ipynb b/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk_parallel/crack_and_chunk_with_doc_intel_parallel_component.ipynb index 8428faac50..489772e45f 100644 --- a/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk_parallel/crack_and_chunk_with_doc_intel_parallel_component.ipynb +++ b/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk_parallel/crack_and_chunk_with_doc_intel_parallel_component.ipynb @@ -17,9 +17,10 @@ }, "outputs": [], "source": [ - "%pip install -U azure-ai-ml>=1.10\n", + "%pip install -U azure-ai-ml==1.16.0b1 # versions may change later\n", "%pip install azure-identity\n", - "%pip install -U 'azureml-rag[azure,cognitive_search]==0.2.28'" + "%pip install -U 'azureml-rag[azure,cognitive_search]>=0.2.28'\n", + "%pip install promptflow-rag==0.1.0" ] }, { diff --git a/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk_parallel/d_parallel_crack_pdfs_with_azure_document_intelligence.ipynb b/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk_parallel/d_parallel_crack_pdfs_with_azure_document_intelligence.ipynb index 3dbf0c7403..91956f001a 100644 --- a/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk_parallel/d_parallel_crack_pdfs_with_azure_document_intelligence.ipynb +++ b/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk_parallel/d_parallel_crack_pdfs_with_azure_document_intelligence.ipynb @@ -2,7 +2,6 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, "source": [ "# Create an Index with custom cracking and chunking (in Parallel) using Azure Document Intelligence\n", "\n", @@ -11,25 +10,35 @@ "This notebook does crack_and_chunk with parallel processing. It runs on a compute cluster with multiple nodes and each has multiple processes, the input files are divided into mini batches and assigned to each process. All processes do the crack_and_chunk in parallel.\n", "\n", "This version works with generate_embeddings serial component, with set_automatic_compoute() function." - ] + ], + "metadata": {} }, { "cell_type": "code", + "source": [ + "%pip install -U azure-ai-ml==1.16.0b1 # versions may change later\n", + "%pip install azure-identity\n", + "%pip install -U 'azureml-rag[azure,cognitive_search]>=0.2.28'\n", + "%pip install promptflow-rag==0.1.0" + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { "logged": 1712725306438 } - }, - "outputs": [], - "source": [ - "%pip install -U azure-ai-ml>=1.10\n", - "%pip install azure-identity\n", - "%pip install -U 'azureml-rag[azure,cognitive_search]==0.2.28'" - ] + } }, { "cell_type": "code", + "source": [ + "from pathlib import Path\n", + "from azure.ai.ml import Input, Output, command, load_component\n", + "from azure.ai.ml.entities import BuildContext, Environment\n", + "from azure.ai.ml.constants import AssetTypes, InputOutputModes\n", + "from azure.ai.ml.parallel import parallel_run_function, RunFunction" + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { @@ -44,25 +53,10 @@ "deleting": false } } - }, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "from azure.ai.ml import Input, Output, command, load_component\n", - "from azure.ai.ml.entities import BuildContext, Environment\n", - "from azure.ai.ml.constants import AssetTypes, InputOutputModes\n", - "from azure.ai.ml.parallel import parallel_run_function, RunFunction" - ] + } }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "microsoft": { - "language": "plaintext" - } - }, - "outputs": [], "source": [ "%%writefile config.json\n", "{\n", @@ -70,17 +64,17 @@ " \"resource_group\": \"\",\n", " \"workspace_name\": \"\"\n", "}" - ] - }, - { - "cell_type": "code", + ], + "outputs": [], "execution_count": null, "metadata": { - "gather": { - "logged": 1714025250362 + "microsoft": { + "language": "plaintext" } - }, - "outputs": [], + } + }, + { + "cell_type": "code", "source": [ "from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential\n", "from azure.ai.ml import MLClient\n", @@ -95,26 +89,17 @@ " credential = InteractiveBrowserCredential()\n", "\n", "ml_client = MLClient.from_config(credential=credential)" - ] - }, - { - "cell_type": "code", + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { - "logged": 1714025250575 - }, - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } + "logged": 1714025250362 } - }, - "outputs": [], + } + }, + { + "cell_type": "code", "source": [ "from pathlib import Path\n", "from azure.ai.ml.entities import BuildContext, Environment\n", @@ -129,34 +114,43 @@ "llm_rag_embeddings_doc_intel_environment = ml_client.environments.get(\n", " name=\"llm_rag_embeddings_doc_intel\", version=\"6\"\n", ")" - ] - }, - { - "cell_type": "markdown", + ], + "outputs": [], + "execution_count": null, "metadata": { + "gather": { + "logged": 1714025250575 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, "nteract": { "transient": { "deleting": false } } - }, + } + }, + { + "cell_type": "markdown", "source": [ "Define the crack_and_chunk_with_doc_intel_component_parallel which can be used in place of the crack_and_chunk_parallel Component in Vector Index creation Pipelines.\n", "\n", "Please reference this article for parallel job setup of ML pipeline. https://learn.microsoft.com/en-us/azure/machine-learning/how-to-use-parallel-job-in-pipeline?view=azureml-api-2&tabs=python \n", "\n", "Please reference this article for setting up optimum parameters of parallel job https://microsoft.github.io/azureml-ops-accelerator/4-Migrate/3-PerformanceTunePRS.html" - ] + ], + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + } }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "gather": { - "logged": 1714025250701 - } - }, - "outputs": [], "source": [ "crack_and_chunk_with_doc_intel_component_parallel = parallel_run_function(\n", " # version=\"0.0.1\",\n", @@ -245,24 +239,24 @@ " environment=llm_rag_embeddings_doc_intel_environment,\n", " ),\n", ")" - ] + ], + "outputs": [], + "execution_count": null, + "metadata": { + "gather": { + "logged": 1714025250701 + } + } }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Define pipeline using the custom `crack_and_chunk_with_doc_intel_component_parallel along with the AzureML provided Components to embed and index your data." - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "gather": { - "logged": 1714025252217 - } - }, - "outputs": [], "source": [ "ml_registry = MLClient(credential=ml_client._credential, registry_name=\"azureml\")\n", "\n", @@ -278,17 +272,17 @@ "register_mlindex_asset_component = ml_registry.components.get(\n", " \"llm_rag_register_mlindex_asset\", label=\"latest\"\n", ")" - ] - }, - { - "cell_type": "code", + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { - "logged": 1714025252508 + "logged": 1714025252217 } - }, - "outputs": [], + } + }, + { + "cell_type": "code", "source": [ "from azure.ai.ml import Input, Output\n", "from azure.ai.ml.dsl import pipeline\n", @@ -382,32 +376,38 @@ " \"mlindex_asset_uri\": update_acs_index.outputs.index,\n", " \"mlindex_asset_id\": register_mlindex.outputs.asset_id,\n", " }" - ] + ], + "outputs": [], + "execution_count": null, + "metadata": { + "gather": { + "logged": 1714025252508 + } + } }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Get the connections to Azure OpenAI (for embeddings with `text-embedding-ada-002`) and Azure Cognitive Search." - ] + ], + "metadata": {} }, { "cell_type": "code", + "source": [ + "aoai_connection = ml_client.connections.get(\"AOAI-westus\")\n", + "acs_connection = ml_client.connections.get(\"cog-serch-westus\")" + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { "logged": 1714025253221 } - }, - "outputs": [], - "source": [ - "aoai_connection = ml_client.connections.get(\"AOAI-westus\")\n", - "acs_connection = ml_client.connections.get(\"cog-serch-westus\")" - ] + } }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Create a Custom Connection with details for an Azure AI Document Intelligence Service.\n", "[Setup instructions for Azure AI Document Intelligence](https://learn.microsoft.com/azure/ai-services/document-intelligence/create-document-intelligence-resource?view=doc-intel-3.1.0)\n", @@ -415,33 +415,27 @@ "Use the Connections UI in an AzureML Workspace, under the Promptflow tab, to create a connection with these fields: ![custom_doc_intel_connection.png](./assets/custom_doc_intel_connection.png)\n", "\n", "It's not yet supported to create/retrieve Custom Connections using SDK, so you will need to create it using the UI and we'll use string replacement below to get the ID for this custom connection to pass to our pipeline." - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "gather": { - "logged": 1714025253367 - } - }, - "outputs": [], "source": [ "document_intelligence_connection_id = aoai_connection.id.replace(\n", " \"AOAI-westus\", \"doc-intelligence\"\n", ")\n", "document_intelligence_connection_id" - ] - }, - { - "cell_type": "code", + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { - "logged": 1714025253534 + "logged": 1714025253367 } - }, - "outputs": [], + } + }, + { + "cell_type": "code", "source": [ "import json\n", "from azure.ai.ml import Input\n", @@ -452,7 +446,7 @@ "\n", "data_source = \"wasbs://doc-intelligence-container@storageaccount.blob.core.windows.net\"\n", "# data_source = Path.cwd() / \"pdfs\"\n", - "asset_name = f\"beilei-index-simple\"\n", + "asset_name = f\"my-index-simple\"\n", "\n", "pipeline_job = uri_to_acs_parallel(\n", " input_data=Input(type=\"uri_folder\", path=str(data_source)),\n", @@ -482,17 +476,17 @@ "pipeline_job.properties[\"azureml.mlIndexAssetName\"] = asset_name\n", "pipeline_job.properties[\"azureml.mlIndexAssetKind\"] = \"acs\"\n", "pipeline_job.properties[\"azureml.mlIndexAssetSource\"] = \"AzureML Data\"" - ] - }, - { - "cell_type": "code", + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { - "logged": 1714025258052 + "logged": 1714025253534 } - }, - "outputs": [], + } + }, + { + "cell_type": "code", "source": [ "submission_name = f\"crack-parallel-embedding-serial\"\n", "print(f\"Submitting pipeline job to experiment: {submission_name}\")\n", @@ -501,42 +495,40 @@ ")\n", "\n", "print(f\"Submitted run, url: {running_pipeline_job.studio_url}\")" - ] - }, - { - "cell_type": "code", + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { - "logged": 1714025789174 + "logged": 1714025258052 } - }, - "outputs": [], - "source": [ - "ml_client.jobs.stream(running_pipeline_job.name)" - ] + } }, { "cell_type": "code", + "source": [ + "ml_client.jobs.stream(running_pipeline_job.name)" + ], + "outputs": [], "execution_count": null, "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } + "gather": { + "logged": 1714025789174 } - }, - "outputs": [], - "source": [ - "%pip install azure-search-documents==11.4.0b8" - ] + } }, { "cell_type": "code", + "source": [ + "from promptflow.rag import get_langchain_retriever_from_index\n", + "\n", + "question = \"What are the employees benefits\"\n", + "\n", + "my_index = ml_client.data.get(name=asset_name, label=\"latest\")\n", + "index_langchain_retriever = get_langchain_retriever_from_index(my_index.path)\n", + "index_langchain_retriever.get_relevant_documents(question)" + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { @@ -551,18 +543,7 @@ "deleting": false } } - }, - "outputs": [], - "source": [ - "from azureml.rag.mlindex import MLIndex\n", - "\n", - "question = \"What are the employees benefits\"\n", - "\n", - "retriever = MLIndex(\n", - " ml_client.data.get(asset_name, label=\"latest\")\n", - ").as_langchain_retriever()\n", - "retriever.get_relevant_documents(question)" - ] + } } ], "metadata": { @@ -578,21 +559,21 @@ "name": "python3" }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "name": "python3", "language": "python", - "name": "python3" + "display_name": "Python 3 (ipykernel)" }, "language_info": { + "name": "python", + "version": "3.8.5", + "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", "version": 3 }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "nbconvert_exporter": "python", + "file_extension": ".py" }, "microsoft": { "host": { diff --git a/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk_parallel/e_parallel_crack_pdfs_with_azure_document_intelligence.ipynb b/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk_parallel/e_parallel_crack_pdfs_with_azure_document_intelligence.ipynb index 72aaa61a96..aeae68d802 100644 --- a/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk_parallel/e_parallel_crack_pdfs_with_azure_document_intelligence.ipynb +++ b/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk_parallel/e_parallel_crack_pdfs_with_azure_document_intelligence.ipynb @@ -2,7 +2,6 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, "source": [ "# Create an Index with custom cracking and chunking (in Parallel) and generate embeddings (in Parallel) using Azure Document Intelligence\n", "\n", @@ -11,25 +10,35 @@ "This notebook has two components do parallel processing:\n", "- crack_and_chunk_with_doc_intel_parallel. This component is created for custom crack_and_chunk. It runs on a compute cluster with multiple nodes and each has multiple processes, the input files are divided into mini batches and assigned to each process. All processes do the crack_and_chunk in parallel.\n", "- generate_embeddings_parallel. This is a build-in component provided by Azure Designer. User can set parallel parameters, like instance count, max_concurrency_per_instance, mini_batch_size, etc." - ] + ], + "metadata": {} }, { "cell_type": "code", + "source": [ + "%pip install -U azure-ai-ml==1.16.0b1 # versions may change later\n", + "%pip install azure-identity\n", + "%pip install -U 'azureml-rag[azure,cognitive_search]>=0.2.28'\n", + "%pip install promptflow-rag==0.1.0" + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { "logged": 1712725306438 } - }, - "outputs": [], - "source": [ - "%pip install -U azure-ai-ml>=1.10\n", - "%pip install azure-identity\n", - "%pip install -U 'azureml-rag[azure,cognitive_search]==0.2.28'" - ] + } }, { "cell_type": "code", + "source": [ + "from pathlib import Path\n", + "from azure.ai.ml import Input, Output, command, load_component\n", + "from azure.ai.ml.entities import BuildContext, Environment\n", + "from azure.ai.ml.constants import AssetTypes, InputOutputModes\n", + "from azure.ai.ml.parallel import parallel_run_function, RunFunction" + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { @@ -44,25 +53,10 @@ "deleting": false } } - }, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "from azure.ai.ml import Input, Output, command, load_component\n", - "from azure.ai.ml.entities import BuildContext, Environment\n", - "from azure.ai.ml.constants import AssetTypes, InputOutputModes\n", - "from azure.ai.ml.parallel import parallel_run_function, RunFunction" - ] + } }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "microsoft": { - "language": "plaintext" - } - }, - "outputs": [], "source": [ "%%writefile config.json\n", "{\n", @@ -70,17 +64,17 @@ " \"resource_group\": \"\",\n", " \"workspace_name\": \"\"\n", "}" - ] - }, - { - "cell_type": "code", + ], + "outputs": [], "execution_count": null, "metadata": { - "gather": { - "logged": 1714024787236 + "microsoft": { + "language": "plaintext" } - }, - "outputs": [], + } + }, + { + "cell_type": "code", "source": [ "from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential\n", "from azure.ai.ml import MLClient\n", @@ -95,26 +89,17 @@ " credential = InteractiveBrowserCredential()\n", "\n", "ml_client = MLClient.from_config(credential=credential)" - ] - }, - { - "cell_type": "code", + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { - "logged": 1714024787436 - }, - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } + "logged": 1714024787236 } - }, - "outputs": [], + } + }, + { + "cell_type": "code", "source": [ "from pathlib import Path\n", "from azure.ai.ml.entities import BuildContext, Environment\n", @@ -129,34 +114,43 @@ "llm_rag_embeddings_doc_intel_environment = ml_client.environments.get(\n", " name=\"llm_rag_embeddings_doc_intel\", version=\"6\"\n", ")" - ] - }, - { - "cell_type": "markdown", + ], + "outputs": [], + "execution_count": null, "metadata": { + "gather": { + "logged": 1714024787436 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, "nteract": { "transient": { "deleting": false } } - }, + } + }, + { + "cell_type": "markdown", "source": [ "Define the crack_and_chunk_with_doc_intel_component_parallel which can be used in place of the crack_and_chunk_parallel Component in Vector Index creation Pipelines.\n", "\n", "Please reference this article for parallel job setup of ML pipeline. https://learn.microsoft.com/en-us/azure/machine-learning/how-to-use-parallel-job-in-pipeline?view=azureml-api-2&tabs=python \n", "\n", "Please reference this article for setting up optimum parameters of parallel job https://microsoft.github.io/azureml-ops-accelerator/4-Migrate/3-PerformanceTunePRS.html" - ] + ], + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + } }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "gather": { - "logged": 1714024787580 - } - }, - "outputs": [], "source": [ "crack_and_chunk_with_doc_intel_component_parallel = parallel_run_function(\n", " # version=\"0.0.1\",\n", @@ -245,26 +239,26 @@ " environment=llm_rag_embeddings_doc_intel_environment,\n", " ),\n", ")" - ] + ], + "outputs": [], + "execution_count": null, + "metadata": { + "gather": { + "logged": 1714024787580 + } + } }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Define pipeline using the custom `crack_and_chunk_with_doc_intel_component_parallel along with the AzureML provided Components to embed and index your data.\n", "\n", "The llm_rag_generate_embeddings_parallel does parallel processing job." - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "gather": { - "logged": 1714024794202 - } - }, - "outputs": [], "source": [ "ml_registry = MLClient(credential=ml_client._credential, registry_name=\"azureml\")\n", "\n", @@ -280,30 +274,30 @@ "register_mlindex_asset_component = ml_registry.components.get(\n", " \"llm_rag_register_mlindex_asset\", label=\"latest\"\n", ")" - ] + ], + "outputs": [], + "execution_count": null, + "metadata": { + "gather": { + "logged": 1714024794202 + } + } }, { "cell_type": "markdown", + "source": [ + "Setup parallel parameters for component llm_rag_generate_embeddings_parallel." + ], "metadata": { "nteract": { "transient": { "deleting": false } } - }, - "source": [ - "Setup parallel parameters for component llm_rag_generate_embeddings_parallel." - ] + } }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "gather": { - "logged": 1714024794501 - } - }, - "outputs": [], "source": [ "from azure.ai.ml import Input, Output\n", "from azure.ai.ml.dsl import pipeline\n", @@ -401,32 +395,38 @@ " \"mlindex_asset_uri\": update_acs_index.outputs.index,\n", " \"mlindex_asset_id\": register_mlindex.outputs.asset_id,\n", " }" - ] + ], + "outputs": [], + "execution_count": null, + "metadata": { + "gather": { + "logged": 1714024794501 + } + } }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Get the connections to Azure OpenAI (for embeddings with `text-embedding-ada-002`) and Azure Cognitive Search." - ] + ], + "metadata": {} }, { "cell_type": "code", + "source": [ + "aoai_connection = ml_client.connections.get(\"AOAI-westus\")\n", + "acs_connection = ml_client.connections.get(\"cog-serch-westus\")" + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { "logged": 1714024794907 } - }, - "outputs": [], - "source": [ - "aoai_connection = ml_client.connections.get(\"AOAI-westus\")\n", - "acs_connection = ml_client.connections.get(\"cog-serch-westus\")" - ] + } }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Create a Custom Connection with details for an Azure AI Document Intelligence Service.\n", "[Setup instructions for Azure AI Document Intelligence](https://learn.microsoft.com/azure/ai-services/document-intelligence/create-document-intelligence-resource?view=doc-intel-3.1.0)\n", @@ -434,33 +434,27 @@ "Use the Connections UI in an AzureML Workspace, under the Promptflow tab, to create a connection with these fields: ![custom_doc_intel_connection.png](./assets/custom_doc_intel_connection.png)\n", "\n", "It's not yet supported to create/retrieve Custom Connections using SDK, so you will need to create it using the UI and we'll use string replacement below to get the ID for this custom connection to pass to our pipeline." - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "gather": { - "logged": 1714024795053 - } - }, - "outputs": [], "source": [ "document_intelligence_connection_id = aoai_connection.id.replace(\n", " \"AOAI-westus\", \"doc-intelligence\"\n", ")\n", "document_intelligence_connection_id" - ] - }, - { - "cell_type": "code", + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { - "logged": 1714024795190 + "logged": 1714024795053 } - }, - "outputs": [], + } + }, + { + "cell_type": "code", "source": [ "import json\n", "from azure.ai.ml import Input\n", @@ -471,7 +465,7 @@ "\n", "data_source = \"wasbs://doc-intelligence-container@storageaccount.blob.core.windows.net\"\n", "# data_source = Path.cwd() / \"pdfs\"\n", - "asset_name = f\"beilei-index-parallel\"\n", + "asset_name = f\"my-index-parallel\"\n", "\n", "pipeline_job = uri_to_acs_parallel(\n", " input_data=Input(type=\"uri_folder\", path=str(data_source)),\n", @@ -501,17 +495,17 @@ "pipeline_job.properties[\"azureml.mlIndexAssetName\"] = asset_name\n", "pipeline_job.properties[\"azureml.mlIndexAssetKind\"] = \"acs\"\n", "pipeline_job.properties[\"azureml.mlIndexAssetSource\"] = \"AzureML Data\"" - ] - }, - { - "cell_type": "code", + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { - "logged": 1714024800470 + "logged": 1714024795190 } - }, - "outputs": [], + } + }, + { + "cell_type": "code", "source": [ "# submission_name = f\"embedding-parallel-no-auto\"\n", "submission_name = f\"embedding-parallel-auto1\"\n", @@ -521,42 +515,40 @@ ")\n", "\n", "print(f\"Submitted run, url: {running_pipeline_job.studio_url}\")" - ] - }, - { - "cell_type": "code", + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { - "logged": 1714025810556 + "logged": 1714024800470 } - }, - "outputs": [], - "source": [ - "ml_client.jobs.stream(running_pipeline_job.name)" - ] + } }, { "cell_type": "code", + "source": [ + "ml_client.jobs.stream(running_pipeline_job.name)" + ], + "outputs": [], "execution_count": null, "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } + "gather": { + "logged": 1714025810556 } - }, - "outputs": [], - "source": [ - "%pip install azure-search-documents==11.4.0b8" - ] + } }, { "cell_type": "code", + "source": [ + "from promptflow.rag import get_langchain_retriever_from_index\n", + "\n", + "question = \"What are the employees benefits\"\n", + "\n", + "my_index = ml_client.data.get(name=asset_name, label=\"latest\")\n", + "index_langchain_retriever = get_langchain_retriever_from_index(my_index.path)\n", + "index_langchain_retriever.get_relevant_documents(question)" + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { @@ -571,18 +563,7 @@ "deleting": false } } - }, - "outputs": [], - "source": [ - "from azureml.rag.mlindex import MLIndex\n", - "\n", - "question = \"What are the employees benefits\"\n", - "\n", - "retriever = MLIndex(\n", - " ml_client.data.get(asset_name, label=\"latest\")\n", - ").as_langchain_retriever()\n", - "retriever.get_relevant_documents(question)" - ] + } } ], "metadata": { @@ -598,21 +579,21 @@ "name": "python3" }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "name": "python3", "language": "python", - "name": "python3" + "display_name": "Python 3 (ipykernel)" }, "language_info": { + "name": "python", + "version": "3.8.5", + "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", "version": 3 }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "nbconvert_exporter": "python", + "file_extension": ".py" }, "microsoft": { "host": { diff --git a/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk_parallel/f_parallel_crack_pdfs_with_azure_document_intelligence.ipynb b/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk_parallel/f_parallel_crack_pdfs_with_azure_document_intelligence.ipynb index 3a33e821a7..380cd4e2d6 100644 --- a/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk_parallel/f_parallel_crack_pdfs_with_azure_document_intelligence.ipynb +++ b/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk_parallel/f_parallel_crack_pdfs_with_azure_document_intelligence.ipynb @@ -2,7 +2,6 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, "source": [ "# Create an Index with custom cracking and chunking (in Parallel) and generate embeddings (in Parallel) using Azure Document Intelligence\n", "\n", @@ -15,10 +14,19 @@ "The difference from file e_parrale_crack_pdf_with_azure_document_intelligence.ipynb is that this one use DataAsset as input.\n", "\n", "The benefit of using DataAsset is that we can change files in the same blob container, while updating the DataAsset version, each version has a metadata pointing to the original blob container. This is good for testing re-indexing. " - ] + ], + "metadata": {} }, { "cell_type": "code", + "source": [ + "asset_name = f\"parallel-index-test\"\n", + "submission_name = f\"embedding-parallel-100\"\n", + "cpu_compute_target = \"cpu-cluster2\"\n", + "data_asset_name = \"Doc-intel-parallel-120\"\n", + "data_asset_version = \"2\"" + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { @@ -33,33 +41,37 @@ "deleting": false } } - }, - "outputs": [], - "source": [ - "asset_name = f\"parallel-index-test\"\n", - "submission_name = f\"embedding-parallel-100\"\n", - "cpu_compute_target = \"cpu-cluster2\"\n", - "data_asset_name = \"Doc-intel-parallel-120\"\n", - "data_asset_version = \"2\"" - ] + } }, { "cell_type": "code", + "source": [ + "%pip install -U azure-ai-ml==1.16.0b1 # versions may change later\n", + "%pip install azure-identity\n", + "%pip install -U 'azureml-rag[azure,cognitive_search]>=0.2.28'\n", + "%pip install promptflow-rag==0.1.0" + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { "logged": 1712725306438 } - }, - "outputs": [], - "source": [ - "%pip install -U azure-ai-ml>=1.10\n", - "%pip install azure-identity\n", - "%pip install -U 'azureml-rag[azure,cognitive_search]==0.2.28'" - ] + } }, { "cell_type": "code", + "source": [ + "from pathlib import Path\n", + "from azure.ai.ml import Input, Output, command, load_component\n", + "from azure.ai.ml.entities import BuildContext, Environment\n", + "from azure.ai.ml.constants import AssetTypes, InputOutputModes\n", + "from azure.ai.ml.parallel import parallel_run_function, RunFunction\n", + "import pandas as pd\n", + "import os\n", + "from azureml.core import Workspace, Dataset, Datastore" + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { @@ -74,28 +86,10 @@ "deleting": false } } - }, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "from azure.ai.ml import Input, Output, command, load_component\n", - "from azure.ai.ml.entities import BuildContext, Environment\n", - "from azure.ai.ml.constants import AssetTypes, InputOutputModes\n", - "from azure.ai.ml.parallel import parallel_run_function, RunFunction\n", - "import pandas as pd\n", - "import os\n", - "from azureml.core import Workspace, Dataset, Datastore" - ] + } }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "microsoft": { - "language": "plaintext" - } - }, - "outputs": [], "source": [ "%%writefile config.json\n", "{\n", @@ -103,17 +97,17 @@ " \"resource_group\": \"\",\n", " \"workspace_name\": \"\"\n", "}" - ] - }, - { - "cell_type": "code", + ], + "outputs": [], "execution_count": null, "metadata": { - "gather": { - "logged": 1714023785220 + "microsoft": { + "language": "plaintext" } - }, - "outputs": [], + } + }, + { + "cell_type": "code", "source": [ "from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential\n", "from azure.ai.ml import MLClient\n", @@ -128,39 +122,28 @@ " credential = InteractiveBrowserCredential()\n", "\n", "ml_client = MLClient.from_config(credential=credential)" - ] - }, - { - "cell_type": "code", + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { - "logged": 1714023785384 - }, - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } + "logged": 1714023785220 } - }, - "outputs": [], + } + }, + { + "cell_type": "code", "source": [ "# data_source=ml_client.data.get(\"Doc-intel-parallel-120\", version=\"2\")\n", "data_source = ml_client.data.get(data_asset_name, version=data_asset_version)\n", "\n", "print(f\"Data asset URI: {data_source.path}\")" - ] - }, - { - "cell_type": "code", + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { - "logged": 1714023785524 + "logged": 1714023785384 }, "jupyter": { "outputs_hidden": false, @@ -171,8 +154,10 @@ "deleting": false } } - }, - "outputs": [], + } + }, + { + "cell_type": "code", "source": [ "from pathlib import Path\n", "from azure.ai.ml.entities import BuildContext, Environment\n", @@ -187,34 +172,43 @@ "llm_rag_embeddings_doc_intel_environment = ml_client.environments.get(\n", " name=\"llm_rag_embeddings_doc_intel\", version=\"6\"\n", ")" - ] - }, - { - "cell_type": "markdown", + ], + "outputs": [], + "execution_count": null, "metadata": { + "gather": { + "logged": 1714023785524 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, "nteract": { "transient": { "deleting": false } } - }, + } + }, + { + "cell_type": "markdown", "source": [ "Define the crack_and_chunk_with_doc_intel_component_parallel which can be used in place of the crack_and_chunk_parallel Component in Vector Index creation Pipelines.\n", "\n", "Please reference this article for parallel job setup of ML pipeline. https://learn.microsoft.com/en-us/azure/machine-learning/how-to-use-parallel-job-in-pipeline?view=azureml-api-2&tabs=python \n", "\n", "Please reference this article for setting up optimum parameters of parallel job https://microsoft.github.io/azureml-ops-accelerator/4-Migrate/3-PerformanceTunePRS.html" - ] + ], + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + } }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "gather": { - "logged": 1714023785626 - } - }, - "outputs": [], "source": [ "crack_and_chunk_with_doc_intel_component_parallel = parallel_run_function(\n", " # version=\"0.0.1\",\n", @@ -303,26 +297,26 @@ " environment=llm_rag_embeddings_doc_intel_environment,\n", " ),\n", ")" - ] + ], + "outputs": [], + "execution_count": null, + "metadata": { + "gather": { + "logged": 1714023785626 + } + } }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Define pipeline using the custom `crack_and_chunk_with_doc_intel_component_parallel along with the AzureML provided Components to embed and index your data.\n", "\n", "The llm_rag_generate_embeddings_parallel does parallel processing job." - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "gather": { - "logged": 1714023787240 - } - }, - "outputs": [], "source": [ "ml_registry = MLClient(credential=ml_client._credential, registry_name=\"azureml\")\n", "\n", @@ -338,30 +332,30 @@ "register_mlindex_asset_component = ml_registry.components.get(\n", " \"llm_rag_register_mlindex_asset\", label=\"latest\"\n", ")" - ] + ], + "outputs": [], + "execution_count": null, + "metadata": { + "gather": { + "logged": 1714023787240 + } + } }, { "cell_type": "markdown", + "source": [ + "Setup parallel parameters for component llm_rag_generate_embeddings_parallel." + ], "metadata": { "nteract": { "transient": { "deleting": false } } - }, - "source": [ - "Setup parallel parameters for component llm_rag_generate_embeddings_parallel." - ] + } }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "gather": { - "logged": 1714023787551 - } - }, - "outputs": [], "source": [ "from azure.ai.ml import Input, Output\n", "from azure.ai.ml.dsl import pipeline\n", @@ -459,32 +453,38 @@ " \"mlindex_asset_uri\": update_acs_index.outputs.index,\n", " \"mlindex_asset_id\": register_mlindex.outputs.asset_id,\n", " }" - ] + ], + "outputs": [], + "execution_count": null, + "metadata": { + "gather": { + "logged": 1714023787551 + } + } }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Get the connections to Azure OpenAI (for embeddings with `text-embedding-ada-002`) and Azure Cognitive Search." - ] + ], + "metadata": {} }, { "cell_type": "code", + "source": [ + "aoai_connection = ml_client.connections.get(\"AOAI-westus\")\n", + "acs_connection = ml_client.connections.get(\"cog-serch-westus\")" + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { "logged": 1714023787907 } - }, - "outputs": [], - "source": [ - "aoai_connection = ml_client.connections.get(\"AOAI-westus\")\n", - "acs_connection = ml_client.connections.get(\"cog-serch-westus\")" - ] + } }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Create a Custom Connection with details for an Azure AI Document Intelligence Service.\n", "[Setup instructions for Azure AI Document Intelligence](https://learn.microsoft.com/azure/ai-services/document-intelligence/create-document-intelligence-resource?view=doc-intel-3.1.0)\n", @@ -492,33 +492,27 @@ "Use the Connections UI in an AzureML Workspace, under the Promptflow tab, to create a connection with these fields: ![custom_doc_intel_connection.png](./assets/custom_doc_intel_connection.png)\n", "\n", "It's not yet supported to create/retrieve Custom Connections using SDK, so you will need to create it using the UI and we'll use string replacement below to get the ID for this custom connection to pass to our pipeline." - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "gather": { - "logged": 1714023788139 - } - }, - "outputs": [], "source": [ "document_intelligence_connection_id = aoai_connection.id.replace(\n", " \"AOAI-westus\", \"doc-intelligence\"\n", ")\n", "document_intelligence_connection_id" - ] - }, - { - "cell_type": "code", + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { - "logged": 1714023788283 + "logged": 1714023788139 } - }, - "outputs": [], + } + }, + { + "cell_type": "code", "source": [ "import json\n", "from azure.ai.ml import Input\n", @@ -555,17 +549,17 @@ "pipeline_job.properties[\"azureml.mlIndexAssetName\"] = asset_name\n", "pipeline_job.properties[\"azureml.mlIndexAssetKind\"] = \"acs\"\n", "pipeline_job.properties[\"azureml.mlIndexAssetSource\"] = \"AzureML Data\"" - ] - }, - { - "cell_type": "code", + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { - "logged": 1714023793511 + "logged": 1714023788283 } - }, - "outputs": [], + } + }, + { + "cell_type": "code", "source": [ "# submission_name = f\"embedding-parallel-no-auto\"\n", "# submission_name = f\"embedding-parallel-auto\"\n", @@ -575,42 +569,40 @@ ")\n", "\n", "print(f\"Submitted run, url: {running_pipeline_job.studio_url}\")" - ] - }, - { - "cell_type": "code", + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { - "logged": 1714024712464 + "logged": 1714023793511 } - }, - "outputs": [], - "source": [ - "ml_client.jobs.stream(running_pipeline_job.name)" - ] + } }, { "cell_type": "code", + "source": [ + "ml_client.jobs.stream(running_pipeline_job.name)" + ], + "outputs": [], "execution_count": null, "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "nteract": { - "transient": { - "deleting": false - } + "gather": { + "logged": 1714024712464 } - }, - "outputs": [], - "source": [ - "%pip install azure-search-documents==11.4.0b8" - ] + } }, { "cell_type": "code", + "source": [ + "from promptflow.rag import get_langchain_retriever_from_index\n", + "\n", + "question = \"What are the employees benefits\"\n", + "\n", + "my_index = ml_client.data.get(name=asset_name, label=\"latest\")\n", + "index_langchain_retriever = get_langchain_retriever_from_index(my_index.path)\n", + "index_langchain_retriever.get_relevant_documents(question)" + ], + "outputs": [], "execution_count": null, "metadata": { "gather": { @@ -625,18 +617,7 @@ "deleting": false } } - }, - "outputs": [], - "source": [ - "from azureml.rag.mlindex import MLIndex\n", - "\n", - "question = \"What are the employees benefits\"\n", - "\n", - "retriever = MLIndex(\n", - " ml_client.data.get(asset_name, label=\"latest\")\n", - ").as_langchain_retriever()\n", - "retriever.get_relevant_documents(question)" - ] + } } ], "metadata": { @@ -652,21 +633,21 @@ "name": "python3" }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "name": "python3", "language": "python", - "name": "python3" + "display_name": "Python 3 (ipykernel)" }, "language_info": { + "name": "python", + "version": "3.8.5", + "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", "version": 3 }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "nbconvert_exporter": "python", + "file_extension": ".py" }, "microsoft": { "host": { diff --git a/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk_parallel/local_parallel_crack_pdfs_with_azure_document_intelligence.ipynb b/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk_parallel/local_parallel_crack_pdfs_with_azure_document_intelligence.ipynb new file mode 100644 index 0000000000..bcbf6ceca6 --- /dev/null +++ b/sdk/python/generative-ai/rag/notebooks/custom_crack_and_chunk_parallel/local_parallel_crack_pdfs_with_azure_document_intelligence.ipynb @@ -0,0 +1,623 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Create an Index with custom cracking and chunking (in Parallel) and generate embeddings (in Parallel) using Azure Document Intelligence\n", + "\n", + "Create an index with custom cracking and chunking using the Azure Document Intelligence aka Azure Form Recognizer.\n", + "\n", + "This notebook has two components do parallel processing:\n", + "- crack_and_chunk_with_doc_intel_parallel. This component is created for custom crack_and_chunk. It runs on a compute cluster with multiple nodes and each has multiple processes, the input files are divided into mini batches and assigned to each process. All processes do the crack_and_chunk in parallel.\n", + "- generate_embeddings_parallel. This is a build-in component provided by Azure Designer. User can set parallel parameters, like instance count, max_concurrency_per_instance, mini_batch_size, etc.\n", + "\n", + "This notebook is almost the same as e_parallel_crack_pdfs_with_azure_document_intelligence.ipynb, the difference is that this notebook can be run locally, i.e. VS code, and all pdf files are stored in local folder (window example)." + ], + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "!az login --tenant \"XXXXX\"" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "%pip install -U azure-ai-ml==1.16.0b1 # versions may change later\n", + "%pip install azure-identity\n", + "%pip install -U 'azureml-rag[azure,cognitive_search]>=0.2.28'\n", + "%pip install promptflow-rag==0.1.0" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "gather": { + "logged": 1712725306438 + } + } + }, + { + "cell_type": "code", + "source": [ + "from pathlib import Path\n", + "from azure.ai.ml import Input, Output, command, load_component\n", + "from azure.ai.ml.entities import BuildContext, Environment\n", + "from azure.ai.ml.constants import AssetTypes, InputOutputModes\n", + "from azure.ai.ml.parallel import parallel_run_function, RunFunction" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "gather": { + "logged": 1714024786990 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + } + }, + { + "cell_type": "code", + "source": [ + "%%writefile config.json\n", + "{\n", + " \"subscription_id\": \"\",\n", + " \"resource_group\": \"\",\n", + " \"workspace_name\": \"\"\n", + "}" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "microsoft": { + "language": "plaintext" + } + } + }, + { + "cell_type": "code", + "source": [ + "from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential\n", + "from azure.ai.ml import MLClient\n", + "\n", + "identity = None\n", + "try:\n", + " credential = DefaultAzureCredential()\n", + " # Check if given credential can get token successfully.\n", + " credential.get_token(\"https://management.azure.com/.default\")\n", + "except Exception as ex:\n", + " # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential not work\n", + " credential = InteractiveBrowserCredential()\n", + "\n", + "ml_client = MLClient.from_config(credential=credential)" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "gather": { + "logged": 1714024787236 + } + } + }, + { + "cell_type": "code", + "source": [ + "from pathlib import Path\n", + "from azure.ai.ml.entities import BuildContext, Environment\n", + "\n", + "\"\"\"\n", + "llm_rag_embeddings_doc_intel_environment = Environment(\n", + " name=\"llm_rag_embeddings_doc_intel\",\n", + " description=\"AzureML RAGs base crack_and_chunk environment with azure-ai-formrecognizer installed.\",\n", + " build=BuildContext(path=Path.cwd() / \"doc_intel_env\"),\n", + ")\n", + "\n", + "\"\"\"\n", + "llm_rag_embeddings_doc_intel_environment = ml_client.environments.get(\n", + " name=\"llm_rag_embeddings_doc_intel\", version=\"6\"\n", + ")" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "gather": { + "logged": 1714024787436 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Define the crack_and_chunk_with_doc_intel_component_parallel which can be used in place of the crack_and_chunk_parallel Component in Vector Index creation Pipelines.\n", + "\n", + "Please reference this article for parallel job setup of ML pipeline. https://learn.microsoft.com/en-us/azure/machine-learning/how-to-use-parallel-job-in-pipeline?view=azureml-api-2&tabs=python \n", + "\n", + "Please reference this article for setting up optimum parameters of parallel job https://microsoft.github.io/azureml-ops-accelerator/4-Migrate/3-PerformanceTunePRS.html" + ], + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + } + }, + { + "cell_type": "code", + "source": [ + "crack_and_chunk_with_doc_intel_component_parallel = parallel_run_function(\n", + " # version=\"0.0.1\",\n", + " name=\"crack_and_chunk_with_doc_intel_parallel\",\n", + " display_name=\"Crack and Chunk Data leveraging Azure AI Document Intelligence for PDFs in parallel\",\n", + " description=\"\"\"Creates chunks from source data leveraging Azure AI Document Intelligence for PDFs in parallel.\n", + "\n", + " Supported formats: md, txt, html/htm, pdf, ppt(x), doc(x), xls(x), py\"\"\",\n", + " inputs={\n", + " # Input AzureML Data\n", + " \"input_data\": Input(type=\"uri_folder\", mode=\"ro_mount\"),\n", + " # Files to handle from source\n", + " \"input_glob\": Input(\n", + " type=\"string\",\n", + " default=\"/**/*\",\n", + " description=\"Limit files opened from `input_data`, defaults to '**/*'\",\n", + " ),\n", + " \"allowed_extensions\": Input(\n", + " type=\"string\",\n", + " optional=True,\n", + " description=\"Comma separated list of extensions to include, if not provided the default list of supported extensions will be used. e.g. '.md,.txt,.html,.py,.pdf'\",\n", + " ),\n", + " # Chunking options\n", + " \"chunk_size\": Input(\n", + " type=\"integer\",\n", + " default=768,\n", + " description=\"Maximum number of tokens per chunk.\",\n", + " ),\n", + " \"chunk_overlap\": Input(\n", + " type=\"integer\",\n", + " default=0,\n", + " description=\"Number of tokens to overlap between chunks.\",\n", + " ),\n", + " \"use_rcts\": Input(\n", + " type=\"boolean\",\n", + " default=True,\n", + " description=\"Use langchain RecursiveTextSplitter to split chunks.\",\n", + " ),\n", + " # Augmentation options\n", + " \"data_source_url\": Input(\n", + " type=\"string\",\n", + " optional=True,\n", + " description=\"Base URL to join with file paths to create full source file URL for chunk metadata.\",\n", + " ),\n", + " \"document_path_replacement_regex\": Input(\n", + " type=\"string\",\n", + " optional=True,\n", + " description=\"A JSON string with two fields, 'match_pattern' and 'replacement_pattern' to be used with re.sub on the source url. e.g. '{\\\"match_pattern\\\": \\\"(.*)/articles/(.*)\\\", \\\"replacement_pattern\\\": \\\"\\\\1/\\\\2\\\"}' would remove '/articles' from the middle of the url.\",\n", + " ),\n", + " \"doc_intel_connection_id\": Input(\n", + " type=\"string\",\n", + " description=\"AzureML Connection ID for Custom Workspace Connection containing the `endpoint` key and `api_key` secret for an Azure AI Document Intelligence Service.\",\n", + " ),\n", + " \"use_layout\": Input(\n", + " type=\"boolean\",\n", + " default=True,\n", + " description=\"Use 'prebuilt-layout' model from Azure AI Document Intelligence, more expensive and slower but maintains more structure from original doc.\",\n", + " ),\n", + " },\n", + " outputs={\n", + " \"output_chunks\": Output(type=\"uri_folder\", mode=\"rw_mount\"),\n", + " },\n", + " input_data=\"${{inputs.input_data}}\",\n", + " instance_count=4,\n", + " max_concurrency_per_instance=4,\n", + " mini_batch_size=\"1\",\n", + " mini_batch_error_threshold=-1,\n", + " item_error_treshold=-1,\n", + " retry_settings=dict(max_retries=2, timeout=1200),\n", + " progress_update_timeout=259200,\n", + " logging_level=\"DEBUG\",\n", + " task=RunFunction(\n", + " code=\"c:\\\\Users\\\\username\\\\custom_crack_and_chunk_parallel\\\\crack_and_chunk_with_doc_intel\", # local folder where the entry script resides.\n", + " entry_script=\"crack_and_chunk_parallel.py\",\n", + " program_arguments=\"--input_data ${{inputs.input_data}}\\\n", + " --input_glob '${{inputs.input_glob}}'\\\n", + " $[[--allowed_extensions ${{inputs.allowed_extensions}}]]\\\n", + " --output_chunks ${{outputs.output_chunks}}\\\n", + " --chunk_size ${{inputs.chunk_size}}\\\n", + " --chunk_overlap ${{inputs.chunk_overlap}}\\\n", + " --use_rcts ${{inputs.use_rcts}}\\\n", + " $[[--data_source_url ${{inputs.data_source_url}}]]\\\n", + " $[[--document_path_replacement_regex '${{inputs.document_path_replacement_regex}}']]\\\n", + " --doc_intel_connection_id '${{inputs.doc_intel_connection_id}}'\\\n", + " --use_layout ${{inputs.use_layout}}\\ \",\n", + " environment=llm_rag_embeddings_doc_intel_environment,\n", + " ),\n", + ")" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "gather": { + "logged": 1714024787580 + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Define pipeline using the custom `crack_and_chunk_with_doc_intel_component_parallel along with the AzureML provided Components to embed and index your data.\n", + "\n", + "The llm_rag_generate_embeddings_parallel does parallel processing job." + ], + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "ml_registry = MLClient(credential=ml_client._credential, registry_name=\"azureml\")\n", + "\n", + "# Reads input folder of files containing chunks and their metadata as batches, in parallel, and generates embeddings for each chunk. Output format is produced and loaded by `azureml.rag.embeddings.EmbeddingContainer`.\n", + "generate_embeddings_component_parallel = ml_registry.components.get(\n", + " \"llm_rag_generate_embeddings_parallel\", label=\"latest\"\n", + ")\n", + "# Reads an input folder produced by `azureml.rag.embeddings.EmbeddingsContainer.save()` and pushes all documents (chunk, metadata, embedding_vector) into an Azure Cognitive Search index. Writes an MLIndex yaml detailing the index and embeddings model information.\n", + "update_acs_index_component = ml_registry.components.get(\n", + " \"llm_rag_update_acs_index\", label=\"latest\"\n", + ")\n", + "# Takes a uri to a storage location where an MLIndex yaml is stored and registers it as an MLIndex Data asset in the AzureML Workspace.\n", + "register_mlindex_asset_component = ml_registry.components.get(\n", + " \"llm_rag_register_mlindex_asset\", label=\"latest\"\n", + ")" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "gather": { + "logged": 1714024794202 + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Setup parallel parameters for component llm_rag_generate_embeddings_parallel." + ], + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + } + }, + { + "cell_type": "code", + "source": [ + "from azure.ai.ml import Input, Output\n", + "from azure.ai.ml.dsl import pipeline\n", + "from azure.ai.ml.entities._job.pipeline._io import PipelineInput\n", + "\n", + "# Retrieve an already attached Azure Machine Learning Compute.\n", + "cpu_compute_target = \"cpu-cluster2\"\n", + "\n", + "\n", + "def use_automatic_compute(component):\n", + " component.set_resources(\n", + " instance_count=4,\n", + " max_concurrency_per_instance=4,\n", + " mini_batch_size=1,\n", + " mini_batch_error_threshold=-1,\n", + " retry_settings=dict(max_retries=2, timeout=1200),\n", + " item_error_treshold=-1,\n", + " progress_update_timeout=259200,\n", + " properties={\"compute_specification\": {\"automatic\": True}},\n", + " logging_level=\"DEBUG\",\n", + " )\n", + " return component\n", + "\n", + "\n", + "def optional_pipeline_input_provided(input: PipelineInput):\n", + " return input._data is not None\n", + "\n", + "\n", + "@pipeline(default_compute=cpu_compute_target)\n", + "def uri_to_acs_parallel(\n", + " input_data: Input,\n", + " doc_intel_connection_id: str,\n", + " embeddings_model: str,\n", + " acs_config: str,\n", + " acs_connection_id: str,\n", + " asset_name: str,\n", + " chunk_size: int = 1024,\n", + " data_source_glob: str = \"*pdf\",\n", + " data_source_url: str = None,\n", + " document_path_replacement_regex: str = None,\n", + " use_layout: bool = True,\n", + " aoai_connection_id: str = None,\n", + " embeddings_container: Input = None,\n", + "):\n", + " crack_and_chunk = crack_and_chunk_with_doc_intel_component_parallel(\n", + " input_data=input_data,\n", + " input_glob=data_source_glob,\n", + " chunk_size=chunk_size,\n", + " use_rcts=True,\n", + " data_source_url=data_source_url,\n", + " document_path_replacement_regex=document_path_replacement_regex,\n", + " doc_intel_connection_id=doc_intel_connection_id,\n", + " use_layout=True,\n", + " )\n", + "\n", + " crack_and_chunk.inputs.input_data.mode = InputOutputModes.DOWNLOAD\n", + "\n", + " generate_embeddings_parallel = generate_embeddings_component_parallel(\n", + " chunks_source=crack_and_chunk.outputs.output_chunks,\n", + " embeddings_container=embeddings_container,\n", + " embeddings_model=embeddings_model,\n", + " )\n", + "\n", + " generate_embeddings_parallel.inputs.embeddings_container.mode = (\n", + " InputOutputModes.DOWNLOAD\n", + " )\n", + "\n", + " use_automatic_compute(generate_embeddings_parallel)\n", + "\n", + " if optional_pipeline_input_provided(aoai_connection_id):\n", + " generate_embeddings_parallel.environment_variables[\n", + " \"AZUREML_WORKSPACE_CONNECTION_ID_AOAI\"\n", + " ] = aoai_connection_id\n", + " if optional_pipeline_input_provided(embeddings_container):\n", + " # If provided, previous_embeddings is expected to be a URI to an 'embeddings container' folder.\n", + " # Each folder under this folder is generated by a `generate_embeddings_component_parallel` run and can be reused for subsequent embeddings runs.\n", + " generate_embeddings_parallel.outputs.embeddings = Output(\n", + " type=\"uri_folder\", path=f\"{embeddings_container.path}/{{name}}\"\n", + " )\n", + "\n", + " update_acs_index = update_acs_index_component(\n", + " embeddings=generate_embeddings_parallel.outputs.embeddings,\n", + " acs_config=acs_config,\n", + " )\n", + " if optional_pipeline_input_provided(acs_connection_id):\n", + " update_acs_index.environment_variables[\n", + " \"AZUREML_WORKSPACE_CONNECTION_ID_ACS\"\n", + " ] = acs_connection_id\n", + "\n", + " register_mlindex = register_mlindex_asset_component(\n", + " storage_uri=update_acs_index.outputs.index,\n", + " asset_name=asset_name,\n", + " )\n", + " return {\n", + " \"mlindex_asset_uri\": update_acs_index.outputs.index,\n", + " \"mlindex_asset_id\": register_mlindex.outputs.asset_id,\n", + " }" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "gather": { + "logged": 1714024794501 + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Get the connections to Azure OpenAI (for embeddings with `text-embedding-ada-002`) and Azure Cognitive Search." + ], + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "aoai_connection = ml_client.connections.get(\"AOAI-westus\")\n", + "acs_connection = ml_client.connections.get(\"cog-serch-westus\")" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "gather": { + "logged": 1714024794907 + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Create a Custom Connection with details for an Azure AI Document Intelligence Service.\n", + "[Setup instructions for Azure AI Document Intelligence](https://learn.microsoft.com/azure/ai-services/document-intelligence/create-document-intelligence-resource?view=doc-intel-3.1.0)\n", + "\n", + "Use the Connections UI in an AzureML Workspace, under the Promptflow tab, to create a connection with these fields: ![custom_doc_intel_connection.png](./assets/custom_doc_intel_connection.png)\n", + "\n", + "It's not yet supported to create/retrieve Custom Connections using SDK, so you will need to create it using the UI and we'll use string replacement below to get the ID for this custom connection to pass to our pipeline." + ], + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "document_intelligence_connection_id = aoai_connection.id.replace(\n", + " \"AOAI-westus\", \"doc-intelligence\"\n", + ")\n", + "document_intelligence_connection_id" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "gather": { + "logged": 1714024795053 + } + } + }, + { + "cell_type": "code", + "source": [ + "import json\n", + "from azure.ai.ml import Input\n", + "\n", + "embeddings_model = (\n", + " \"azure_open_ai://deployment/text-embedding-ada-002/model/text-embedding-ada-002\"\n", + ")\n", + "\n", + "data_source = \"C:\\\\Users\\\\username\\\\Downloads\\\\pdf-files\" # This is the local folder where stores all pdf files.\n", + "asset_name = f\"my-index-parallel\"\n", + "\n", + "pipeline_job = uri_to_acs_parallel(\n", + " input_data=Input(type=\"uri_folder\", path=str(data_source)),\n", + " # data_source_glob=\"**/*\",\n", + " data_source_glob=\"*pdf\",\n", + " data_source_url=None,\n", + " document_path_replacement_regex=None,\n", + " doc_intel_connection_id=document_intelligence_connection_id,\n", + " use_layout=True,\n", + " embeddings_model=embeddings_model,\n", + " aoai_connection_id=aoai_connection.id,\n", + " embeddings_container=Input(\n", + " type=\"uri_folder\",\n", + " path=f\"azureml://datastores/workspaceblobstore/paths/embeddings/{asset_name}\",\n", + " ),\n", + " acs_config=json.dumps(\n", + " {\n", + " \"index_name\": asset_name,\n", + " }\n", + " ),\n", + " acs_connection_id=acs_connection.id,\n", + ")\n", + "pipeline_job.display_name = asset_name\n", + "\n", + "# Properties for Vector Index UI\n", + "pipeline_job.properties[\"azureml.mlIndexAssetName\"] = asset_name\n", + "pipeline_job.properties[\"azureml.mlIndexAssetKind\"] = \"acs\"\n", + "pipeline_job.properties[\"azureml.mlIndexAssetSource\"] = \"AzureML Data\"" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "gather": { + "logged": 1714024795190 + } + } + }, + { + "cell_type": "code", + "source": [ + "# submission_name = f\"embedding-parallel-no-auto\"\n", + "submission_name = f\"embedding-parallel-local\"\n", + "print(f\"Submitting pipeline job to experiment: {submission_name}\")\n", + "running_pipeline_job = ml_client.jobs.create_or_update(\n", + " pipeline_job, experiment_name=submission_name\n", + ")" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "gather": { + "logged": 1714024800470 + } + } + }, + { + "cell_type": "code", + "source": [ + "ml_client.jobs.stream(running_pipeline_job.name)" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "gather": { + "logged": 1714025810556 + } + } + }, + { + "cell_type": "code", + "source": [ + "from promptflow.rag import get_langchain_retriever_from_index\n", + "\n", + "question = \"What are the employees benefits\"\n", + "\n", + "my_index = ml_client.data.get(name=asset_name, label=\"latest\")\n", + "index_langchain_retriever = get_langchain_retriever_from_index(my_index.path)\n", + "index_langchain_retriever.get_relevant_documents(question)" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "gather": { + "logged": 1714025816935 + }, + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + } + } + ], + "metadata": { + "categories": [ + "SDK v2", + "sdk", + "python", + "generative-ai", + "rag", + "notebooks" + ], + "kernel_info": { + "name": "python3" + }, + "kernelspec": { + "name": "python3", + "language": "python", + "display_name": "Python 3 (ipykernel)" + }, + "language_info": { + "name": "python", + "version": "3.8.5", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + }, + "microsoft": { + "host": { + "AzureML": { + "notebookHasBeenCompleted": true + } + }, + "ms_spell_check": { + "ms_spell_check_language": "en" + } + }, + "nteract": { + "version": "nteract-front-end@1.0.0" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}