# Distributed XGBoost (CPU)

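This notebook trains an XGBoost model on CPU with Dask, first interactively in the notebook and then scaled out as an Azure ML job.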

In [None]:
!pip install --upgrade dask==2.30.0 distributed==2.30.0 fastparquet adlfs xgboost

## Get Workspace

In [None]:
from azureml.core import Workspace

# Obtain the Azure ML workspace handle via Workspace.from_config(); `ws` is used
# later when the experiment is submitted. The bare `ws` displays it as cell output.
ws = Workspace.from_config()
ws

## Get Data

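The data is stored as partitioned parquet files in the `malware` blob container of the `azuremlexamples` storage account, and is read lazily into Dask dataframes.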

start a distributed Client

In [None]:
from distributed import Client

# Create the Dask client; `c` is handed to xgb.dask.DaskDMatrix and xgb.dask.train
# in the training cells.
# NOTE(review): Client() with no arguments presumably starts a local cluster — confirm.
c = Client()
c

initialize the Pythonic filesystem

In [None]:
from adlfs import AzureBlobFileSystem

# Blob container that holds the dataset.
container_name = "malware"
# Only the account name is supplied (no key or token is visible here) — presumably
# the account allows anonymous reads; TODO confirm. The same options are reused by
# dd.read_parquet when the data is loaded.
storage_options = {"account_name": "azuremlexamples"}

fs = AzureBlobFileSystem(**storage_options)
fs

list the processed (partitioned) files

In [None]:
# List the entries under "<container_name>/processed"; the loading cell picks out
# the paths that contain "train" or "test".
files = fs.ls(f"{container_name}/processed")
files

read data into a (dask) dataframe

In [None]:
import pandas as pd
import dask.dataframe as dd

# Open the train and test parquet datasets listed in `files` as dask dataframes.
# Both names start as None so that (a) re-running the cell never silently reuses a
# stale dataframe from a previous run, and (b) a missing dataset is reported here
# with a clear message instead of surfacing as a NameError in a later cell.
df_train = None
df_test = None

for f in files:
    # Paths are matched by substring; if several paths match, the last one wins.
    if "train" in f:
        df_train = dd.read_parquet(f"az://{f}", storage_options=storage_options)
    elif "test" in f:
        df_test = dd.read_parquet(f"az://{f}", storage_options=storage_options)

if df_train is None or df_test is None:
    raise FileNotFoundError(
        f"expected both a 'train' and a 'test' dataset among the listed files, got: {files}"
    )

df_train

## Exploratory Data Analysis (EDA)

Explore the data...

In [None]:
# Preview the first rows of the training dataframe.
df_train.head()

In [None]:
# Preview the first rows of the test dataframe.
df_test.head()

In [None]:
%%time
# Summary statistics for the training dataframe; .compute() executes the lazy
# dask computation and returns the result to the notebook.
df_train.describe().compute()

In [None]:
%%time
# Materialize the "HasDetections" label column with .compute(), then plot its histogram.
df_train["HasDetections"].compute().hist()

## Data Preparation

Prepare data for ML

throw out `object`-dtype columns (strings and other non-numeric values)...

In [None]:
# Keep the name of every column whose dtype is not "object", in column order.
cols = [name for name, dtype in df_train.dtypes.items() if dtype != "object"]
cols

In [None]:
# Feature matrix: the retained columns minus the "HasDetections" label, taken as
# `.values` and persisted so later cells reuse the already-submitted computation.
X = (
    df_train[cols]
    .drop("HasDetections", axis=1)
    .values
    .persist()
)
X

In [None]:
# Label column, persisted in the same way as the feature matrix X.
y = df_train["HasDetections"].persist()
y

## Train XGBoost

In [None]:
import xgboost as xgb

# Wrap the features X and labels y in XGBoost's DaskDMatrix, bound to the Dask client `c`.
dtrain = xgb.dask.DaskDMatrix(c, X, y)
dtrain

In [None]:
# Booster settings passed to xgb.dask.train.
params = dict(
    objective="binary:logistic",  # binary target, logistic objective
    learning_rate=0.1,
    gamma=0,
    max_depth=8,
)

In [None]:
%%time
# Train through the Dask client `c` with `params` on `dtrain`.
# NOTE(review): no num_boost_round is passed, so the library default applies — confirm intended.
# NOTE(review): XGBoost's Dask docs describe train() as returning a dict (booster plus
# evaluation history) rather than a bare Booster — confirm before using `model` to predict.
model = xgb.dask.train(c, params, dtrain)

In [None]:
# Display what the training call returned.
model

## Scale Out

We'll use Azure ML to run `src/train.py` as an MPI job across 20 nodes of the `cpu-cluster` compute target.

In [None]:
from azureml.core import ScriptRunConfig, Experiment, Environment
from azureml.core.runconfig import MpiConfiguration

# Job pieces: a 20-node MPI configuration and the conda environment from environment.yml.
mpi_config = MpiConfiguration(node_count=20)
env = Environment.from_conda_specification("xgboost-cpu-tutorial", "environment.yml")

# Run src/train.py on the "cpu-cluster" compute target with the pieces above.
src = ScriptRunConfig(
    source_directory="src",
    script="train.py",
    compute_target="cpu-cluster",
    environment=env,
    distributed_job_config=mpi_config,
)

# Submit under the "xgboost-cpu-tutorial" experiment in workspace `ws`; keep the run handle.
experiment = Experiment(ws, "xgboost-cpu-tutorial")
run = experiment.submit(src)
run

In [None]:
from azureml.widgets import RunDetails

# Show the Azure ML run-details widget for the submitted run.
RunDetails(run).show()

In [None]:
# Wait for the submitted run to complete before continuing.
run.wait_for_completion()