# Create the `crack_and_chunk_with_doc_intel_parallel` component


In [None]:
%pip install -U azure-ai-ml>=1.10
%pip install azure-identity
%pip install -U 'azureml-rag[azure,cognitive_search]==0.2.28'

In [None]:
from azure.ai.ml import Input, Output
from azure.ai.ml.entities import Environment
from azure.ai.ml.constants import AssetTypes, InputOutputModes
from azure.ai.ml.parallel import parallel_run_function, RunFunction
from azureml.core import Workspace

In [None]:
%%writefile config.json
{
    "subscription_id": "<subscription id>",
    "resource_group": "<resource_group>",
    "workspace_name": "<workspace_name>"
}

In [None]:
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient

# Authenticate against Azure: try DefaultAzureCredential first and fall back
# to an interactive browser login if it cannot produce a token.
try:
    credential = DefaultAzureCredential()
    # Request a management-scope token up front so a non-working credential
    # fails here instead of on the first MLClient call.
    credential.get_token("https://management.azure.com/.default")
except Exception as ex:
    # Report why the default credential failed instead of hiding it, then
    # fall back to InteractiveBrowserCredential.
    print(
        f"DefaultAzureCredential failed ({ex}); "
        "falling back to InteractiveBrowserCredential."
    )
    credential = InteractiveBrowserCredential()

# Build the workspace client from the workspace configuration
# (presumably the config.json written by the earlier cell — fill in its placeholders first).
ml_client = MLClient.from_config(credential=credential)

In [None]:
from pathlib import Path
from azure.ai.ml.entities import BuildContext, Environment

# Alternative (disabled): build the environment from the local Docker build
# context in ./doc_intel_env instead of fetching an existing version.
#
# llm_rag_embeddings_doc_intel_environment = Environment(
#     name="llm_rag_embeddings_doc_intel",
#     description="AzureML RAGs base crack_and_chunk environment with azure-ai-formrecognizer installed.",
#     build=BuildContext(path=Path.cwd() / "doc_intel_env"),
# )

# Fetch version "6" of the environment through the workspace client.
llm_rag_embeddings_doc_intel_environment = ml_client.environments.get(
    name="llm_rag_embeddings_doc_intel",
    version="6",
)

Define the crack_and_chunk_with_doc_intel_component_parallel which can be used in place of the crack_and_chunk_parallel Component in Vector Index creation Pipelines.

For details on setting up a parallel job in an ML pipeline, see [How to use parallel job in pipeline](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-use-parallel-job-in-pipeline?view=azureml-api-2&tabs=python).

Get the connections to Azure OpenAI (for embeddings with `text-embedding-ada-002`) and Azure Cognitive Search.

In [None]:
# Look up the existing workspace connections by name.
# Azure OpenAI connection:
aoai_connection = ml_client.connections.get(name="AOAI-westus")
# Azure Cognitive Search connection:
acs_connection = ml_client.connections.get(name="cog-serch-westus")

Create a Custom Connection with details for an Azure AI Document Intelligence Service.
[Setup instructions for Azure AI Document Intelligence](https://learn.microsoft.com/azure/ai-services/document-intelligence/create-document-intelligence-resource?view=doc-intel-3.1.0)

Use the Connections UI in an AzureML Workspace, under the Promptflow tab, to create a connection with these fields: ![custom_doc_intel_connection.png](./assets/custom_doc_intel_connection.png)

Creating or retrieving Custom Connections through the SDK is not yet supported, so create the connection in the UI and name it `doc-intelligence`. The cell below derives this custom connection's ID from the Azure OpenAI connection's ID so it can be passed to our pipeline.

In [None]:
# Derive the custom connection's ID from the Azure OpenAI connection's ID by
# swapping only the final path segment. This keeps working if the Azure OpenAI
# connection is renamed, and cannot touch other parts of the resource path.
# NOTE(review): assumes the connection ID ends in "/<connection name>" — confirm.
DOC_INTEL_CONNECTION_NAME = "doc-intelligence"
connections_prefix = aoai_connection.id.rsplit("/", 1)[0]
document_intelligence_connection_id = (
    f"{connections_prefix}/{DOC_INTEL_CONNECTION_NAME}"
)
# Display the derived ID so it can be checked by eye.
document_intelligence_connection_id

For guidance on choosing optimal parallel job parameters, see [Performance tuning for Parallel Run Step](https://microsoft.github.io/azureml-ops-accelerator/4-Migrate/3-PerformanceTunePRS.html).

In [None]:
# Parallel component that runs crack_and_chunk_parallel.py from the local
# ./crack_and_chunk_with_doc_intel folder over the files in `input_data`,
# writing the resulting chunks to `output_chunks`.
crack_and_chunk_with_doc_intel_component_parallel = parallel_run_function(
    version="0.0.1",
    name="crack_and_chunk_with_doc_intel_parallel",
    display_name="crack_and_chunk_with_doc_intel_parallel",
    description="""Creates chunks from source data leveraging Azure AI Document Intelligence for PDFs in parallel.

    Supported formats: md, txt, html/htm, pdf, ppt(x), doc(x), xls(x), py""",
    inputs={
        # Input AzureML Data
        "input_data": Input(type="uri_folder", mode="ro_mount"),
        # Files to handle from source
        "input_glob": Input(
            type="string",
            default="/**/*",
            description="Limit files opened from `input_data`, defaults to '/**/*'",
        ),
        "allowed_extensions": Input(
            type="string",
            optional=True,
            description="Comma separated list of extensions to include, if not provided the default list of supported extensions will be used. e.g. '.md,.txt,.html,.py,.pdf'",
        ),
        # Chunking options
        "chunk_size": Input(
            type="integer",
            default=768,
            description="Maximum number of tokens per chunk.",
        ),
        "chunk_overlap": Input(
            type="integer",
            default=0,
            description="Number of tokens to overlap between chunks.",
        ),
        "use_rcts": Input(
            type="boolean",
            default=True,
            description="Use langchain RecursiveCharacterTextSplitter to split chunks.",
        ),
        # Augmentation options
        "data_source_url": Input(
            type="string",
            optional=True,
            description="Base URL to join with file paths to create full source file URL for chunk metadata.",
        ),
        "document_path_replacement_regex": Input(
            type="string",
            optional=True,
            description="A JSON string with two fields, 'match_pattern' and 'replacement_pattern' to be used with re.sub on the source url. e.g. '{\"match_pattern\": \"(.*)/articles/(.*)\", \"replacement_pattern\": \"\\1/\\2\"}' would remove '/articles' from the middle of the url.",
        ),
        # Azure AI Document Intelligence options
        "doc_intel_connection_id": Input(
            type="string",
            default=document_intelligence_connection_id,
            description="AzureML Connection ID for Custom Workspace Connection containing the `endpoint` key and `api_key` secret for an Azure AI Document Intelligence Service.",
        ),
        "use_layout": Input(
            type="boolean",
            default=True,
            description="Use 'prebuilt-layout' model from Azure AI Document Intelligence, more expensive and slower but maintains more structure from original doc.",
        ),
    },
    outputs={
        "output_chunks": Output(type="uri_folder", mode="rw_mount"),
    },
    # The folder input whose contents are split into mini-batches.
    input_data="${{inputs.input_data}}",
    instance_count=4,
    max_concurrency_per_instance=4,
    mini_batch_size="1",
    # -1 on both thresholds: per the parallel job documentation this means
    # failures are ignored rather than aborting the job.
    mini_batch_error_threshold=-1,
    # Was spelled `item_error_treshold`, which is not a parameter of
    # parallel_run_function; `error_threshold` is the item-level setting.
    error_threshold=-1,
    retry_settings=dict(max_retries=2, timeout=1200),
    progress_update_timeout=259200,
    logging_level="DEBUG",
    task=RunFunction(
        code=Path.cwd() / "crack_and_chunk_with_doc_intel",
        entry_script="crack_and_chunk_parallel.py",
        # Adjacent string literals are concatenated into one command line.
        # $[[...]] wraps arguments for optional inputs.
        program_arguments=(
            "--input_data ${{inputs.input_data}} "
            "--input_glob '${{inputs.input_glob}}' "
            "$[[--allowed_extensions ${{inputs.allowed_extensions}}]] "
            "--output_chunks ${{outputs.output_chunks}} "
            "--chunk_size ${{inputs.chunk_size}} "
            "--chunk_overlap ${{inputs.chunk_overlap}} "
            "--use_rcts ${{inputs.use_rcts}} "
            "$[[--data_source_url ${{inputs.data_source_url}}]] "
            "$[[--document_path_replacement_regex '${{inputs.document_path_replacement_regex}}']] "
            "--doc_intel_connection_id '${{inputs.doc_intel_connection_id}}' "
            "--use_layout ${{inputs.use_layout}}"
        ),
        environment=llm_rag_embeddings_doc_intel_environment,
    ),
)

In [None]:
# Register (create or update) the parallel component through the workspace client.
crack_and_chunk_workspace = ml_client.create_or_update(
    crack_and_chunk_with_doc_intel_component_parallel.component
)

# Report the name and version of the returned component.
print(
    "Component {0.name} with Version {0.version} is registered".format(
        crack_and_chunk_workspace
    )
)