# A Guided Tour of Ray Core

In [1]:
from icecream import ic
import logging
import ray

ray.init(
    ignore_reinit_error=True,
    logging_level=logging.ERROR,
)

File descriptor limit 256 is too low for production servers and may result in connection errors. At least 8192 is recommended. --- Fix with 'ulimit -n 8192'


{'node_ip_address': '192.168.1.65',
 'raylet_ip_address': '192.168.1.65',
 'redis_address': '192.168.1.65:6379',
 'object_store_address': '/tmp/ray/session_2021-02-23_16-10-04_997200_93999/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-02-23_16-10-04_997200_93999/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-02-23_16-10-04_997200_93999',
 'metrics_export_port': 56217,
 'node_id': 'b6fa1741882fd380aec7d6b2b65ea1adffecf6c6'}

## JobLib

Set up for this example...

In [2]:
from ray.util.joblib import register_ray
from sklearn.datasets import load_digits
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
import numpy as np
import joblib

First, let's register Ray as the parallelized [*joblib*](https://scikit-learn.org/stable/modules/generated/sklearn.utils.parallel_backend.html) backend for `scikit-learn`, using  Ray actors instead of local processes.
This makes it easy to scale existing applications running on a single node to running on a cluster.

See: <https://docs.ray.io/en/master/joblib.html>

In [3]:
register_ray()

Next, load a copy of the UCI machine learning data repository's hand-written *digits* dataset.
See: <https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html>

In [4]:
digits = load_digits()

We'll define the hyper-parameter space for training a *support vector machines* model:

In [5]:
param_space = {
    "C": np.logspace(-6, 6, 30),
    "gamma": np.logspace(-8, 8, 30),
    "tol": np.logspace(-4, -1, 30),
    "class_weight": [None, "balanced"],
}

model = SVC(kernel="rbf")

Then use a randomized search to optimize these hyper-parameters. See: <https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html>

We'll use 5 cross-validation splits and 10 iterations, which will result in a total of 50 "fits". This is enough to illustrate the `joblib` being parallelized, although in practice you'd probably use more iterations.

In [6]:
clf = RandomizedSearchCV(model, param_space, cv=5, n_iter=10, verbose=10)
clf

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'C': array([1.00000e-06, 2.59294e-06, 6.72336e-06, 1.74333e-05, 4.52035e-05,
       1.17210e-04, 3.03920e-04, 7.88046e-04, 2.04336e-03, 5.29832e-03,
       1.37382e-02, 3.56225e-02, 9.23671e-02, 2.39503e-01, 6.21017e-01,
       1.61026e+00, 4.17532e+00, 1.08264e+01, 2.80722e+01,..., 0.03039, 0.03857, 0.04894, 0.0621 ,
       0.0788 , 0.1    ]), 'class_weight': [None, 'balanced']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=10)

Run the cross-validation fits (i.e., the random search for hyper-parameter optimization) using Ray to parallelize the backend processes:

In [7]:
with joblib.parallel_backend("ray"):
    search = clf.fit(digits.data, digits.target)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] tol=0.1, gamma=3.562247890262444e-08, class_weight=balanced, C=188.73918221350996 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  tol=0.1, gamma=3.562247890262444e-08, class_weight=balanced, C=188.73918221350996, score=0.8873626373626373, total=   0.3s
[CV] tol=0.1, gamma=3.562247890262444e-08, class_weight=balanced, C=188.73918221350996 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV]  tol=0.1, gamma=3.562247890262444e-08, class_weight=balanced, C=188.73918221350996, score=0.8425414364640884, total=   0.3s
[CV] tol=0.1, gamma=3.562247890262444e-08, class_weight=balanced, C=188.73918221350996 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.0s remaining:    0.0s


[CV]  tol=0.1, gamma=3.562247890262444e-08, class_weight=balanced, C=188.73918221350996, score=0.8746518105849582, total=   0.3s
[CV] tol=0.1, gamma=3.562247890262444e-08, class_weight=balanced, C=188.73918221350996 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.5s remaining:    0.0s


[CV]  tol=0.1, gamma=3.562247890262444e-08, class_weight=balanced, C=188.73918221350996, score=0.9299719887955182, total=   0.3s
[CV] tol=0.1, gamma=3.562247890262444e-08, class_weight=balanced, C=188.73918221350996 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.0s remaining:    0.0s


[CV]  tol=0.1, gamma=3.562247890262444e-08, class_weight=balanced, C=188.73918221350996, score=0.8422535211267606, total=   0.3s
[CV] tol=0.00041753189365604, gamma=1e-08, class_weight=None, C=148735.21072935118 
[CV]  tol=0.00041753189365604, gamma=1e-08, class_weight=None, C=148735.21072935118, score=0.9560439560439561, total=   0.1s
[CV] tol=0.00041753189365604, gamma=1e-08, class_weight=None, C=148735.21072935118 
[CV]  tol=0.00041753189365604, gamma=1e-08, class_weight=None, C=148735.21072935118, score=0.9116022099447514, total=   0.1s
[CV] tol=0.00041753189365604, gamma=1e-08, class_weight=None, C=148735.21072935118 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    2.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    2.6s remaining:    0.0s


[CV]  tol=0.00041753189365604, gamma=1e-08, class_weight=None, C=148735.21072935118, score=0.9665738161559888, total=   0.1s
[CV] tol=0.00041753189365604, gamma=1e-08, class_weight=None, C=148735.21072935118 
[CV]  tol=0.00041753189365604, gamma=1e-08, class_weight=None, C=148735.21072935118, score=0.9803921568627451, total=   0.1s
[CV] tol=0.00041753189365604, gamma=1e-08, class_weight=None, C=148735.21072935118 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    2.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    2.9s remaining:    0.0s


[CV]  tol=0.00041753189365604, gamma=1e-08, class_weight=None, C=148735.21072935118, score=0.9352112676056338, total=   0.1s
[CV] tol=0.07880462815669913, gamma=0.0002592943797404667, class_weight=balanced, C=0.09236708571873865 
[CV]  tol=0.07880462815669913, gamma=0.0002592943797404667, class_weight=balanced, C=0.09236708571873865, score=0.9175824175824175, total=   0.2s
[CV] tol=0.07880462815669913, gamma=0.0002592943797404667, class_weight=balanced, C=0.09236708571873865 
[CV]  tol=0.07880462815669913, gamma=0.0002592943797404667, class_weight=balanced, C=0.09236708571873865, score=0.8839779005524862, total=   0.2s
[CV] tol=0.07880462815669913, gamma=0.0002592943797404667, class_weight=balanced, C=0.09236708571873865 
[CV]  tol=0.07880462815669913, gamma=0.0002592943797404667, class_weight=balanced, C=0.09236708571873865, score=0.9220055710306406, total=   0.2s
[CV] tol=0.07880462815669913, gamma=0.0002592943797404667, class_weight=balanced, C=0.09236708571873865 
[CV]  tol=0.07880

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   20.1s finished


So far, what is the best set of hyper-parameters found?

In [8]:
search.best_params_

{'tol': 0.00041753189365604,
 'gamma': 1e-08,
 'class_weight': None,
 'C': 148735.21072935118}