# Run parallel batch inference at scale

In [None]:
from azureml.core import Workspace

# Connect to the workspace described by the local Azure ML config file.
ws = Workspace.from_config()

# Look up the registered source dataset and the cluster that will run the
# batch job; both lookups are by name on the workspace.
dataset, compute_target = (
    ws.datasets["diabetes-tabular"],
    ws.compute_targets["cpu-cluster"],
)

In [None]:
import pandas as pd
from azureml.core import Dataset

# Create a big dataset (expected 452,608 rows) named pending-diabetes by
# replicating the source records, without the "target" column, 2**10 times.
df = dataset.drop_columns("target").to_pandas_dataframe()

print(f"Original DataFrame's size {df.memory_usage(deep=True).sum()}")

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. A single
# pd.concat of 2**10 copies yields the same rows, in the same order and with
# the same repeated index, as ten successive doublings -- and copies the data
# once instead of on every iteration.
df = pd.concat([df] * 2**10)
print(f"Expanded DataFrame's size {df.memory_usage(deep=True).sum()}")

# Prepend a 1-based "id" column so each replicated row can be told apart.
df.insert(0, "id", range(1, len(df) + 1))

# Upload the frame to the workspace's default datastore and register it as a
# tabular dataset under the name "pending-diabetes".
dstore = ws.get_default_datastore()
pending_records_ds = Dataset.Tabular.register_pandas_dataframe(
    dataframe=df,
    target=(dstore, "/samples/pending-diabetes"),
    name="pending-diabetes",
    description="Pending diabetes records to be processed",
)

In [None]:
from azureml.pipeline.core import PipelineParameter
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig

# Expose the dataset as a pipeline parameter named "dataset"; the registered
# pending-records dataset is the default value.
ds_pipeline_param = PipelineParameter(
    name="dataset",
    default_value=pending_records_ds,
)

# Wrap the parameter so it can be handed to a step as the input called
# "input_dataset".
step01_input_dataset = DatasetConsumptionConfig(
    name="input_dataset",
    dataset=ds_pipeline_param,
)

In [None]:
from azureml.pipeline.steps import ParallelRunConfig

# Settings for the parallel batch-scoring step, grouped by concern.
parallel_run_config = ParallelRunConfig(
    # What to run: the scoring script and the environment it runs in.
    source_directory="050_scripts",
    entry_script="tabular_batch.py",
    environment=ws.environments["AzureML-lightgbm-3.2-ubuntu18.04-py37-cpu"],
    # Where to run and how wide: 2 nodes x 10 worker processes per node.
    compute_target=compute_target,
    node_count=2,
    process_count_per_node=10,
    # How the input is sliced and how long one run() invocation may take
    # (seconds, per the ParallelRunConfig documentation).
    mini_batch_size="400Kb",
    run_invocation_timeout=600,
    # Failure handling: per the ParallelRunConfig documentation, -1 means
    # failures never abort the job.
    error_threshold=-1,
    # Result collection: rows returned by the script are appended to one file.
    output_action="append_row",
    append_row_file_name="diabetes_outputs.txt",
)

In [None]:
from azureml.data import OutputFileDatasetConfig

# Send the inference results to a fixed folder on the workspace's default
# datastore.
datastore = ws.get_default_datastore()
step_output = OutputFileDatasetConfig(
    name="results_store",
    destination=(datastore, "/inferences/diabetes/"),
)

In [None]:
from azureml.pipeline.steps import ParallelRunStep

# Assemble the pipeline step from the run configuration, the parameterised
# input dataset and the output location.
parallel_step = ParallelRunStep(
    name="parallel-inference",
    parallel_run_config=parallel_run_config,
    inputs=[step01_input_dataset],
    output=step_output,
    # Reuse of a previous run's results is switched off for this step.
    allow_reuse=False,
)

In [None]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

# A single-step pipeline that contains only the parallel inference step.
pipeline = Pipeline(workspace=ws, steps=[parallel_step])

# Submit the pipeline under the "parallel-inference-run" experiment and keep
# the returned run object.
pipeline_run = Experiment(
    workspace=ws,
    name="parallel-inference-run",
).submit(pipeline)