# Dask at CHTC Demo

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Creating a Cluster and Client

In [None]:
from dask_chtc import CHTCCluster

# Create the cluster object, asking for workers that run the demo image.
cluster = CHTCCluster(worker_image="maventree/dask-worker:demo")

# Turn on adaptive scaling, bounded below by 10 and above by 100.
cluster.adapt(minimum=10, maximum=100)

# Last expression of the cell: show the cluster in the notebook output.
cluster

In [None]:
from dask.distributed import Client

# Connect a distributed Client to the cluster created above, then display it.
client = Client(cluster)
client

## Low-Level Parallelism

In [None]:
import dask
import dask.array as da

# A lazy 15x15 Dask array of ones; chunks=5 splits each axis into blocks of 5,
# giving a 3x3 grid of chunks.
x = da.ones((15, 15), chunks=5)

In [None]:
# Build (but do not yet evaluate) the sum of x and its transpose, then display
# the resulting lazy array.
y = x + x.T
y

In [None]:
# Draw the task graph that Dask would execute to produce y.
y.visualize()

In [None]:
# Execute the task graph for y and bind the concrete result to z.
z = y.compute()

In [None]:
# Rebind x to a larger lazy array: 10000x10000 random values held as
# 1000x1000 chunks. No seed is set, so the values differ from run to run.
x = da.random.random((10000, 10000), chunks=1000)
x

In [None]:
# Lazily square every element, add 1, and reduce everything to a single sum.
# y is still an unevaluated Dask object when it is displayed here.
y = ((x ** 2) + 1).sum()
y

In [None]:
# Trigger the computation of the reduction and bind the result to z.
z = y.compute()

Dask provides an implementation of SVD:

In [None]:
# A tall lazy matrix of ones (10000 rows x 1000 columns) in 1000x1000 chunks,
# followed by its SVD factors, which are also lazy at this point.
x = da.ones((10000, 1000), chunks=(1000, 1000))
u, s, v = da.linalg.svd(x)

In [None]:
# Display the lazy array of singular values (nothing has been computed yet).
s

In [None]:
# Draw the task graph behind the singular values.
s.visualize()

In [None]:
# Evaluate all three factors in a single dask.compute call; the names u, s, v
# are rebound from the lazy arrays to the computed results.
u, s, v = dask.compute(u, s, v)

## Machine Learning and High-Level Parallelism

Dask-ML provides Dask implementations of various algorithms commonly used in ML, like clustering:

In [None]:
import dask_ml.datasets
import dask_ml.cluster
import matplotlib.pyplot as plt

In [None]:
# Generate ten million synthetic samples around five centers, in chunks of one
# million samples each; the fixed random_state makes the data repeatable.
X, y = dask_ml.datasets.make_blobs(
    n_samples=10_000_000,
    chunks=1_000_000,
    centers=5,
    center_box=(-10, 10),
    random_state=11,
)

# Persist the features so later cells reuse them, then display the array.
X = X.persist()
X

In [None]:
# Only every DENSITY-th sample is drawn in the scatter plots.
DENSITY = 1000

fig, ax = plt.subplots()
# The trailing semicolon suppresses the repr of the scatter's return value.
ax.scatter(
    X[::DENSITY, 0],
    X[::DENSITY, 1],
    marker='.',
);

In [None]:
# Configure (but do not yet fit) a Dask-ML KMeans estimator with five clusters,
# then display it.
km = dask_ml.cluster.KMeans(
    n_clusters=5,
    init_max_iter=2,
    oversampling_factor=10,
)
km

In [None]:
# Fit the KMeans estimator on the persisted blob features.
km.fit(X)

In [None]:
# Subsampled scatter of the blob data, colored by the fitted cluster labels.
fig, ax = plt.subplots()
# The trailing semicolon suppresses the repr of the scatter's return value.
ax.scatter(
    X[::DENSITY, 0],
    X[::DENSITY, 1],
    marker='.',
    c=km.labels_[::DENSITY],
    cmap='viridis',
    alpha=0.25,
);

In [None]:
# Cancel X, which was persisted earlier with X.persist().
# NOTE(review): presumably this frees worker memory before the next example
# rebinds X — confirm.
client.cancel(X)

Dask-ML also provides advanced hyperparameter optimization techniques (see https://examples.dask.org/machine-learning/hyperparam-opt.html):

In [None]:
from sklearn.datasets import make_circles
from sklearn.utils import check_random_state
import numpy as np
import pandas as pd

# Synthetic two-class "circles" data; the two generated coordinates are the
# informative features.
X, y = make_circles(n_samples=100_000, random_state=0, noise=0.13)

# Scatter a 4,000-row random subsample, colored by class label. The sample
# itself is unseeded, so the plotted points differ between runs.
circles = pd.DataFrame({0: X[:, 0], 1: X[:, 1], "class": y})
circles.sample(4_000).plot.scatter(x=0, y=1, alpha=0.2, c="class", cmap="bwr")

# Append four columns of seeded uniform random values in [-1, 1), widening X
# from two features to six.
rng = check_random_state(42)
random_feats = rng.uniform(-1, 1, size=(X.shape[0], 4))
X = np.hstack((X, random_feats))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 5,000 samples for testing; the fixed random_state keeps the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=5_000, random_state=42)

# Fit the scaler on the training split only, then apply the same transform to
# both splits so that no test-set statistics leak into preprocessing.
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
import numpy as np
from sklearn.neural_network import MLPClassifier

# Base estimator whose hyperparameters are searched below.
model = MLPClassifier()

# Search space: each entry maps an MLPClassifier parameter name to its
# candidate values.
params = dict(
    hidden_layer_sizes=[
        (24,),
        (12, 12),
        (6, 6, 6, 6),
        (4, 4, 4, 4, 4, 4),
        (12, 6, 3, 3),
    ],
    activation=["relu", "logistic", "tanh"],
    # 1000 log-spaced values between 1e-6 and 1e-3.
    alpha=np.logspace(-6, -3, num=1000),
    batch_size=[16, 32, 64, 128, 256, 512],
)

In [None]:
# Inputs used to size the Hyperband search: how many hyperparameter
# configurations to sample and how many examples the longest-trained model
# should see in total.
n_params = 15
n_examples = len(X_train) * 30

max_iter = n_params  # number of times partial_fit will be called
chunks = n_examples // n_params  # number of examples each call sees
# NOTE(review): with these values chunks equals 2 * len(X_train), i.e. larger
# than the training array itself — confirm that this is intended.

# Wrap the in-memory training arrays as Dask arrays using that chunk size, then
# display the feature array.
X_train2, y_train2 = (
    da.from_array(arr, chunks=chunks) for arr in (X_train, y_train)
)
X_train2

In [None]:
from dask_ml.model_selection import HyperbandSearchCV

# Hyperband search for `model` over the `params` space, limited to `max_iter`
# partial_fit calls per model, with patience enabled. Displayed as cell output.
search = HyperbandSearchCV(model, params, max_iter=max_iter, patience=True)
search

In [None]:
# Run the search. make_circles only ever produces the labels 0 and 1, so those
# are the classes declared to the estimator; listing extra, never-observed
# labels would make the network model classes that do not exist in the data.
search.fit(X_train2, y_train2, classes=[0, 1])

In [None]:
# Show the best estimator found by the search (available once fit has run).
search.best_estimator_