# Distributed PyTorch Lightning with Horovod

In [None]:
from azureml.core import Workspace

# Connect to the Azure ML workspace via `Workspace.from_config()`.
ws = Workspace.from_config()
# Bare trailing expression so the notebook renders the workspace as cell output.
ws

In [None]:
import git
from pathlib import Path

# locate the root of the enclosing git repository
prefix = Path(git.Repo(".", search_parent_directories=True).working_tree_dir)

# directory and file name of the training script
source_dir = (
    prefix / "code" / "models" / "pytorch-lightning" / "mnist-autoencoder"
)
script_name = "train.py"

# conda specification used to build the environment
environment_file = prefix / "environments" / "pt-lightning-horovod.yml"

# Azure ML names: environment, experiment, and compute cluster
environment_name = "pt-lightning-horovod"
experiment_name = "pt-lightning-horovod-example"
cluster_name = "gpu-k80-2"

In [None]:
# Show the training script inline. `Path.read_text` opens and closes the file
# itself, so no file handle is left dangling in the long-lived notebook
# kernel; the explicit encoding avoids depending on the platform default.
print(source_dir.joinpath(script_name).read_text(encoding="utf-8"))

## Create environment

In [None]:
from azureml.core import Environment

# build the environment from the conda specification file
env = Environment.from_conda_specification(
    name=environment_name, file_path=environment_file
)

# run inside Docker on a GPU (CUDA) base image
env.docker.base_image = (
    "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04"
)
env.docker.enabled = True

## Configure and run training job
Create a ScriptRunConfig to specify the training script & arguments, environment, and cluster to run on.

To use Horovod for distributed training with PyTorch Lightning, pass the distributed mode, in this case `"horovod"`, via the `--distributed_backend` argument. To enable GPU training, set `--gpus=1`. Note that this only configures Lightning to train on GPUs rather than CPUs, which is the default if `--gpus` is not set.

To actually configure the number of GPUs per node and the number of nodes for your training job, specify that information with the `MpiConfiguration` object. Azure ML will use this information to configure the number of worker processes with the driver application (*mpirun*) that it uses to start the job.

For more information on using Horovod with Lightning, see the [documentation](https://pytorch-lightning.readthedocs.io/en/latest/multi_gpu.html#horovod).

In [None]:
import os
from azureml.core import ScriptRunConfig, Experiment
from azureml.core.runconfig import MpiConfiguration

# look up the compute cluster registered in the workspace
cluster = ws.compute_targets[cluster_name]

# command-line arguments forwarded to the training script
script_args = [
    "--max_epochs",
    25,
    "--gpus",
    1,
    "--distributed_backend",
    "horovod",
]

# MPI launch settings: two worker processes on a single node
mpi_config = MpiConfiguration(process_count_per_node=2, node_count=1)

src = ScriptRunConfig(
    source_directory=source_dir,
    script=script_name,
    arguments=script_args,
    compute_target=cluster,
    environment=env,
    distributed_job_config=mpi_config,
)

# submit under the named experiment; the trailing expression displays the run
experiment = Experiment(ws, experiment_name)
run = experiment.submit(src)
run

In [None]:
from azureml.widgets import RunDetails

# Show the Azure ML run-details widget for the submitted run in the notebook.
RunDetails(run).show()

In [None]:
# Wait for the run to complete; show_output=True asks for its output to be
# displayed while waiting.
run.wait_for_completion(show_output=True)