# Define the preprocessing Pipeline

## Import packages

In [6]:
from azureml.core import Workspace, Datastore, Dataset, Experiment

## Connect to the Workspace and get the compute target

In [7]:
# Load the workspace handle from the ".azure" config location
ws = Workspace.from_config(path=".azure")

# Look up the existing "cpu-cluster" compute target registered in the workspace
compute_target = ws.compute_targets["cpu-cluster"]

## Define the Preprocessing script

In [12]:
%%writefile src/preprocessing.py

from azureml.core import Workspace, Datastore, Dataset, Run
import pandas as pd
import os

# Get the current run and derive the workspace from its experiment,
# so the script does not need a local workspace config file.
run = Run.get_context()
ws = run.experiment.workspace

# Load the registered "diabetes" dataset into a pandas DataFrame
ds = Dataset.get_by_name(ws, "diabetes")
diabetes_df = ds.to_pandas_dataframe()

# Preprocessing: keep only rows where "pres", "mass" and "plas" are all non-zero.
# NOTE(review): presumably a zero in these columns marks a missing measurement - confirm.
diabetes_df = diabetes_df[(diabetes_df["pres"] != 0) & (diabetes_df["mass"] != 0) & (diabetes_df["plas"] != 0)]

# Save the cleaned data to csv.
# makedirs(exist_ok=True) rather than mkdir: mkdir raises FileExistsError when
# the "data" directory is already present (e.g. when the script is re-run in
# the same working directory).
os.makedirs("data", exist_ok=True)
local_path = 'data/diabetes_cleaned.csv'
diabetes_df.to_csv(local_path, index=False)

# Upload the contents of the local "data" directory to the "cleaned" folder
# of the workspace's default datastore, replacing any existing files.
datastore = ws.get_default_datastore()
datastore.upload(src_dir='data', target_path='cleaned', overwrite=True)

# Create a tabular dataset referencing the uploaded file in the datastore
dataset = Dataset.Tabular.from_delimited_files(path = [(datastore, ('cleaned/diabetes_cleaned.csv'))])

# Register the dataset as "diabetes_cleaned"; create_new_version=True adds a
# new version instead of failing when the name is already registered.
diabetes_cleaned_ds = dataset.register(workspace=ws, name='diabetes_cleaned',description='Diabetes training data', create_new_version=True)

Overwriting src/preprocessing.py


## Create the preprocessing Pipeline

In [13]:
from azureml.pipeline.core import Pipeline
from azureml.pipeline.steps import PythonScriptStep

# Single step: run src/preprocessing.py on the selected compute target
data_prep_step = PythonScriptStep(
    source_directory="src",
    script_name="preprocessing.py",
    compute_target=compute_target,
)

# Assemble the pipeline from that one step
prep_pipeline = Pipeline(steps=[data_prep_step], workspace=ws)

## Run the Pipeline

In [14]:
# Submit the pipeline under the 'preprocessing' experiment
pipeline_run = Experiment(workspace=ws, name='preprocessing').submit(prep_pipeline)

# Block until the run completes, streaming its logs; the returned status is the cell output
pipeline_run.wait_for_completion(show_output=True)

Created step preprocessing.py [f7aaeb61][99388f1c-a5f8-48f1-a327-c9a52e7f8ec6], (This step will run and generate new outputs)
Submitted PipelineRun 1404716e-6189-4649-bac6-82a849d037be
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/preprocessing/runs/1404716e-6189-4649-bac6-82a849d037be?wsid=/subscriptions/3a0172d3-ec0d-46bb-a88a-ff41a302711a/resourcegroups/Evonik/workspaces/AMLWorkspace
PipelineRunId: 1404716e-6189-4649-bac6-82a849d037be
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/preprocessing/runs/1404716e-6189-4649-bac6-82a849d037be?wsid=/subscriptions/3a0172d3-ec0d-46bb-a88a-ff41a302711a/resourcegroups/Evonik/workspaces/AMLWorkspace
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: fa485a10-b3d9-498b-81e0-551d4d9ff8b0
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/preprocessing/runs/fa485a10-b3d9-498b-81e0-551d4d9ff8b0?wsid=/subscriptions/3a0172d3-ec0d-46bb-a88a-ff41a302711a/resourc

'Finished'