In [None]:
# import required libraries
from azure.ai.ml import MLClient, Input, Output
from azure.ai.ml import dsl
from azure.ai.ml import spark
from azure.ai.ml.dsl import pipeline
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential

In [None]:
try:
    credential = DefaultAzureCredential()
    # Request a management-plane token up front to check that the credential actually works.
    credential.get_token("https://management.azure.com/.default")
except Exception as ex:
    # DefaultAzureCredential could not produce a token: say why instead of
    # switching silently, then fall back to an interactive browser login.
    print(f"DefaultAzureCredential failed ({ex}); falling back to InteractiveBrowserCredential.")
    credential = InteractiveBrowserCredential()

In [None]:
# Build the workspace client from the stored configuration, using the credential chosen above
ml_client = MLClient.from_config(credential=credential)

# Name of the compute target the Spark step will run on; it is expected to be
# attached to the workspace already, so look it up and print it.
synapse_compute = "synapse-compute"
attached_compute = ml_client.compute.get(synapse_compute)
print(attached_compute)

In [None]:
from azure.ai.ml import ManagedIdentity

# Define pipeline
# The decorator is referenced as `dsl.pipeline` because the bare name `pipeline`
# is rebound to a job object further down, which would break re-running this cell.
@dsl.pipeline(description="Spark pipeline job")
def spark_pipeline_job(input_data, output_data):
    """Build a pipeline that contains a single Spark step.

    The step is configured with code="./src" and entry="program.py", targets the
    compute named by the module-level ``synapse_compute``, and receives
    ``input_data`` as its ``input_folder`` input and ``output_data`` as its
    ``output_folder`` output.

    Returns a dict that exposes the step's ``output_folder`` output under the
    key "output_folder".
    """
    # define the spark task
    spark_node = spark(
        code="./src",
        entry="program.py",
        py_files=['my_python.py'],
        jars=['my_java.jar'],
        files=['my_file.txt'],
        archives=['my_archive.tar'],
        driver_cores=1,
        driver_memory="2g",
        executor_cores=2,
        # Was misspelled `exeutor_memory`, so the value never reached the executor memory setting.
        executor_memory="2g",
        executor_instances=4,
        conf={
            # Spark config key/value pairs (optional)
            "spark.jars.packages": "<group>:<artifact>:<version>,com.microsoft.ml.spark:mmlspark_2.11:0.15",
            "spark.jars.repositories": "<repo links>,\"https://mmlspark.azureedge.net/maven\"",
            "spark.jars.excludes": "<group>:<artifact>,slf4j:slf4j"
        },
        environment='myenv:v1',
        # Alternatively, provide 'conda_file' for environment
        # environment={"conda_file": "./conda.config"},
        inputs=dict(input_folder=input_data),
        outputs=dict(output_folder=output_data),
        # The ${{...}} placeholders refer to the entries declared in `inputs` / `outputs` above.
        args="--myinput ${{inputs.input_folder}} --myoutput ${{outputs.output_folder}}",
        identity=ManagedIdentity(client_id="xyz"),
        compute=synapse_compute,
        # For HOBO spark, provide 'resources'
        # resources={"instance_type": "Standard_E8S_V3", "runtime_version": "3.1.0"}
    )
    return {"output_folder": spark_node.outputs.output_folder}


# Bind concrete data to the pipeline definition.
# NOTE: assigning to `pipeline` rebinds the name imported from azure.ai.ml.dsl;
# the submission cell reads the job to submit from this name.
table_input = Input(path="azureml:mytable2:3.0", type="mltable", mode="direct")
folder_output = Output(
    type="uri_folder",
    mode="direct",
    path="wasbs://mycontainer@mystorageaccount.blob.core.windows.net/path/to/folder",
)
pipeline = spark_pipeline_job(input_data=table_input, output_data=folder_output)

In [None]:
from azure.ai.ml import load_component

# Define pipeline using yaml
# The decorator is referenced as `dsl.pipeline`: by the time this cell runs, the
# bare name `pipeline` may already hold a pipeline job object rather than the decorator.
@dsl.pipeline(description="Spark pipeline job")
def spark_pipeline_job_from_yaml(input_data):
    """Build a pipeline whose single step is loaded from 'spark-component.yaml'.

    ``input_data`` is passed to the loaded component as its ``input_folder``
    input. Returns a dict that exposes the step's ``output_folder`` output
    under the key "output_folder".
    """
    # define the spark task
    spark_component = load_component('spark-component.yaml')
    spark_node = spark_component(input_folder=input_data)
    return {"output_folder": spark_node.outputs.output_folder}


# Bind the input table and build the YAML-based pipeline job
mltable_input = Input(path="azureml:mytable2:3.0", type="mltable", mode="direct")
pipeline = spark_pipeline_job_from_yaml(input_data=mltable_input)

In [None]:
# Send the pipeline held in `pipeline` to the workspace under the "spark-pipeline" experiment
pipeline_job = ml_client.jobs.create_or_update(
    pipeline,
    experiment_name="spark-pipeline",
)
# Leave the returned job as the cell's last expression so the notebook displays it
pipeline_job