In [None]:
# import required libraries
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient, Input, Output
from azure.ai.ml.dsl import pipeline
from azure.ai.ml.automl import spark

In [None]:
try:
    credential = DefaultAzureCredential()
    # Probe the credential now: if it cannot obtain a management-plane token,
    # the except branch below switches to an interactive sign-in.
    credential.get_token("https://management.azure.com/.default")
except Exception:
    # Fall back to InteractiveBrowserCredential when DefaultAzureCredential
    # does not work in this environment.
    credential = InteractiveBrowserCredential()

In [None]:
# Name of the compute target that is already attached to the workspace.
synapse_compute = "synapse-compute"

# Workspace client, built via MLClient.from_config with the credential above.
ml_client = MLClient.from_config(credential=credential)

# Look the compute up by name and print it to confirm it is attached.
print(ml_client.compute.get(name=synapse_compute))

In [None]:
# Define pipeline
# The pipeline has a single Spark step (code folder "./src", entry "program.py").
# `input_data` is bound to the step's `input_folder` input, and the step's
# `output_folder` output is returned as the pipeline output `output_file`.
# NOTE(review): `spark` is imported from azure.ai.ml.automl at the top of this
# notebook; current azure-ai-ml releases expose the builder as
# `azure.ai.ml.spark` -- confirm the import against the installed SDK version.
@pipeline(description="AutoML spark Pipeline")
def automl_spark_job(input_data):
    # define the spark task with the spark builder function
    spark_node = spark(
        code="./src",
        # NOTE(review): the SDK reference shows `entry` as a mapping such as
        # {"file": "program.py"} -- confirm that a bare string is accepted.
        entry="program.py",
        py_files=[],
        jars=[],
        files=[],
        archives=[],
        # Spark properties use the exact keys from the Spark configuration
        # reference; a misspelled key does not set the intended property.
        conf={
            "spark.driver.cores": "1",
            "spark.driver.memory": "2g",
            "spark.executor.cores": "2",
            "spark.executor.memory": "2g",
            "spark.executor.instances": "4",
        },
        inputs=dict(input_folder=input_data, file_name="input_data.csv"),
        outputs=dict(output_folder=Output(type="uri_folder")),
    )

    return {"output_file": spark_node.outputs.output_folder}


pipeline = automl_spark_job(input_data=Input(path="./data-folder/", type="mltable"))
pipeline.settings.default_compute = synapse_compute

In [None]:
# submit the pipeline job
pipeline_job = ml_client.jobs.create_or_update(pipeline, experiment_name="spark-pipeline")
pipeline_job