# Install packages

In [None]:
pip install -U "azureml-core<0.1.10" --index-url https://azuremlsdktestpypi.azureedge.net/sdk-release/master/588E708E0DF342C4A80BD954289657CF --extra-index-url https://pypi.python.org/simple

In [None]:
pip install -U "azureml-pipeline-core<0.1.10" --index-url https://azuremlsdktestpypi.azureedge.net/sdk-release/master/588E708E0DF342C4A80BD954289657CF --extra-index-url https://pypi.python.org/simple

In [None]:
pip install -U "azureml-pipeline-steps<0.1.10" --index-url https://azuremlsdktestpypi.azureedge.net/sdk-release/master/588E708E0DF342C4A80BD954289657CF --extra-index-url https://pypi.python.org/simple

# Prepare your AML workspace

In [None]:
from azureml.core import Workspace

# Obtain the workspace via Workspace.from_config() and bind it to `ws`,
# which the following cells pass to the SDK calls.
ws = Workspace.from_config()
# Bare last expression: rendered as the cell output.
ws

# Link Synapse workspace to AML 

In [None]:
from azureml.core import LinkedService, SynapseWorkspaceLinkedServiceConfiguration
# Describe the Synapse workspace to link, identified by subscription,
# resource group and workspace name.
synapse_link_config = SynapseWorkspaceLinkedServiceConfiguration(
    subscription_id="4faaaf21-663f-4391-96fd-47197c630979",
    resource_group="static_resources_synapse_test",
    name="synapsepetesting"
)

# Register that configuration with the AML workspace `ws` under the name
# 'synapselinkservice'; the returned object is reused when attaching the Spark pool.
linked_service = LinkedService.register(
    workspace=ws,
    name='synapselinkservice',
    linked_service_config=synapse_link_config)

In [None]:
# Display the object returned by LinkedService.register.
linked_service

In [None]:
# Display the `linked_services` attribute of the workspace.
ws.linked_services

# View all the linked services

A managed identity (MSI), identified by `system_assigned_identity_principal_id`, is created for each linked service. For example:

name=synapselink,<br>
type=Synapse,<br>
linked_service_resource_id=/subscriptions/4faaaf21-663f-4391-96fd-47197c630979/resourceGroups/static_resources_synapse_test/providers/Microsoft.Synapse/workspaces/synapsetest2,<br>
system_assigned_identity_principal_id=eb355d52-3806-4c5a-aec9-91447e8cfc2e

#### Make sure you grant the Spark admin role of the Synapse workspace to this MSI in Synapse Studio before you submit a job.

In [None]:
# Display the result of LinkedService.list for the workspace `ws`.
LinkedService.list(ws)

# Attach Synapse spark pool as AML compute target

In [None]:
from azureml.core.compute import SynapseCompute, ComputeTarget
# Names used for the attachment: the Spark pool inside the linked Synapse
# workspace, and the name the compute target gets inside the AML workspace.
spark_pool_name = "sparkpool1"
attached_synapse_name = "synapsecompute"

# Build the attach configuration from the linked service registered earlier.
attach_config = SynapseCompute.attach_configuration(
    linked_service,
    type="SynapseSpark",
    pool_name=spark_pool_name,
)

# Attach the pool to the workspace as a compute target, then wait for the
# attach operation to complete before continuing.
synapse_compute = ComputeTarget.attach(
    workspace=ws,
    name=attached_synapse_name,
    attach_configuration=attach_config,
)
synapse_compute.wait_for_completion()

# Start an experiment run

In [None]:
from azureml.core import Dataset
from azureml.data.dataset_factory import DataType

# Get-or-create the dataset registered as `dataset_name`.
# `dataset` is bound on both paths so later cells can rely on it regardless of
# whether the dataset already existed.
dataset_name = "blob_ds"
try:
    dataset = Dataset.get_by_name(workspace=ws, name=dataset_name)
    print('Found existing dataset, use it.')
except Exception:
    # `except Exception` (rather than a bare `except:`) keeps the best-effort
    # fallback but no longer swallows KeyboardInterrupt / SystemExit.
    # Create a TabularDataset from a delimited file behind a public web URL and
    # convert column "Survived" to boolean.
    web_path = 'https://dprepdata.blob.core.windows.net/demo/Titanic.csv'
    titanic_ds = Dataset.Tabular.from_delimited_files(
        path=web_path,
        set_column_types={'Survived': DataType.to_bool()})
    # register() returns the registered dataset object.
    dataset = titanic_ds.register(ws, name=dataset_name)

In [None]:
from azureml.data import HDFSOutputDatasetConfig
# Output configuration named "synapse_step_output": the destination is the path
# "test2" on the workspace's 'workspaceblobstore' datastore, and
# register_on_complete requests registration under the name "registered_dataset".
output = HDFSOutputDatasetConfig(
    "synapse_step_output",
    destination=(ws.datastores['workspaceblobstore'],"test2")).register_on_complete(name="registered_dataset")

In [None]:
from azureml.core import RunConfiguration, Experiment

from azureml.core import ScriptRunConfig

# PySpark run configuration: declare the output created above and target the
# attached Synapse compute by name.
run_config = RunConfiguration(framework="pyspark")
run_config.output_data = {output.name: output}
run_config.target = attached_synapse_name

# Spark resource settings for the driver and the executors, applied in order.
spark_settings = {
    "spark.driver.memory": "1g",
    "spark.driver.cores": 2,
    "spark.executor.memory": "1g",
    "spark.executor.cores": 1,
    "spark.executor.instances": 1,
}
for setting_name, setting_value in spark_settings.items():
    run_config.spark.configuration[setting_name] = setting_value

# Bundle the script, its arguments and the run configuration for submission.
script_run_config = ScriptRunConfig(
    source_directory='./script',
    script='pyspark_job_exp.py',
    arguments=['args1', 'args2'],
    run_config=run_config,
)

In [None]:
from azureml.core import Experiment 
# Create an Experiment handle named "synapse-spark" in workspace `ws`.
exp = Experiment(workspace=ws, name="synapse-spark") 
# Submit the script run configuration built above; `run` is the returned run object.
run = exp.submit(config=script_run_config) 
# Bare last expression: rendered as the cell output.
run

# Start a pipeline run

In [None]:
from azureml.core import Workspace, Experiment, Dataset, Environment,Datastore
# Rebind `ws` to an explicitly named workspace; the remaining cells of this
# section use this `ws` for compute lookup, dataset lookup and the pipeline.
# NOTE(review): the earlier sections linked Synapse, attached the compute target
# and registered the dataset through the workspace returned by
# Workspace.from_config(). Confirm both refer to the same workspace; otherwise
# the later `ws.compute_targets[...]` and `Dataset.get_by_name(...)` lookups
# depend on resources this notebook did not create in that workspace.
ws = Workspace.get(name='ws_canary_test', subscription_id='1aefdc5e-3a7c-4d71-a9f9-f5d3b03be19a', resource_group='rg_e2e_test_canary')

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Look up the attached Synapse compute target by name in the current workspace.
synapse_compute = ws.compute_targets[attached_synapse_name]
synapse_compute

# Name of the CPU cluster used by the Python script step.
cpu_cluster_name = "cpucluster"

# Reuse the cluster when it already exists; otherwise provision a one-node one.
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=1,
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Wait for the cluster operation to complete, showing its output.
cpu_cluster.wait_for_completion(show_output=True)

In [None]:
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.steps import PythonScriptStep, SynapseSparkStep
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

# Conda/pip dependencies for the training step, resolved against the SDK test
# index; SDK version pinning is turned off.
conda = CondaDependencies.create(
    pip_packages=[
        'azureml-sdk<0.1.1',
        'azureml-dataprep[fuse,pandas]>=1.1.19',
        'azureml-telemetry',
    ],
    pip_indexurl='https://azuremlsdktestpypi.azureedge.net/sdk-release/master/588E708E0DF342C4A80BD954289657CF',
    pin_sdk_version=False,
)

# Default run configuration whose Python environment uses those dependencies.
train_run_config = RunConfiguration()
train_run_config.environment.python.conda_dependencies = conda

In [None]:
from azureml.data import HDFSOutputDatasetConfig

# Input of step 1: the dataset registered as 'blob_ds', exposed as 'synapseinput'.
ds = Dataset.get_by_name(ws, name='blob_ds')
input1 = ds.as_named_input('synapseinput')

# Output of step 1: path "test1" on the workspace blob store, with registration
# requested under the name "registered_dataset" via register_on_complete.
output1 = HDFSOutputDatasetConfig(
    "synapse_step_output",
    destination=(ws.datastores['workspaceblobstore'], "test1"),
).register_on_complete(name="registered_dataset")

# Input of step 2: the output of step 1, consumed as a download.
input2 = output1.as_input("input2").as_download()

# Step 1 runs the PySpark script on the attached Synapse compute.
step_1 = SynapseSparkStep(
    name='synapse-spark',
    file='pyspark_job_pipeline.py',
    source_directory="./script",
    inputs=[input1],
    outputs=[output1],
    compute_target=synapse_compute,
    driver_memory="7g",
    driver_cores=4,
    executor_memory="7g",
    executor_cores=2,
    num_executors=1,
)

# Step 2 runs train.py on the CPU cluster; it receives step 1's output both as
# a script argument and as a declared input.
step_2 = PythonScriptStep(
    script_name="train.py",
    arguments=[input2],
    inputs=[input2],
    compute_target=cpu_cluster_name,
    runconfig=train_run_config,
    source_directory="./script",
    allow_reuse=False,
)

# Assemble both steps and submit under the experiment name 'two_steps'.
pipeline = Pipeline(workspace=ws, steps=[step_1, step_2])
pipeline_run = pipeline.submit('two_steps', regenerate_outputs=True)