<img src="https://avatars2.githubusercontent.com/u/365630?v=3&s=400"
     align="right"
     width="20%">

Custom computations with scikit-learn
-------------------------------------

*Courtesy of [Olivier Grisel](http://ogrisel.com)*

[original notebook](https://github.com/ogrisel/docker-distributed/blob/master/examples/sklearn_parameter_search.ipynb) and [slides](https://t.co/sclUvg3U7w)

In [None]:
from distributed import Executor, progress
# Create the distributed Executor, pointing it at the address 127.0.0.1:8786
# (local machine, port 8786). Every cluster call below goes through `e`.
e = Executor('127.0.0.1:8786')

In [None]:
# Call restart() on the executor before running the examples.
# NOTE(review): presumably this resets worker state on the cluster — confirm
# against the installed distributed version.
e.restart()

## Hyperparameter search for sklearn

In [None]:
from sklearn.datasets import load_digits
import numpy as np

# Load the digits dataset in the notebook process; it is used for the image
# preview in the next cell.
digits = load_digits()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Row 36 of the flat data matrix, reshaped to an 8x8 image and shown in
# grayscale without interpolation. The trailing `;` hides the repr of the
# returned object.
plt.imshow(
    digits.data[36].reshape(8, 8),
    cmap='gray',
    interpolation='nearest',
);

### Sequential random parameter search

In [None]:
from sklearn.cross_validation import train_test_split

def load_cv_split(split_idx):
    """Build one train/validation split of the digits dataset.

    The dataset is loaded inside the function, and ``split_idx`` is passed to
    ``train_test_split`` as its ``random_state`` with 20% of the samples held
    out as the test part.

    Returns
    -------
    tuple
        ``(split_idx, arrays)`` where ``arrays`` is the value returned by
        ``train_test_split(data, target, ...)``; ``evaluate_one`` unpacks it
        as ``(X_train, X_val, y_train, y_val)``.
    """
    digits_bunch = load_digits()
    arrays = train_test_split(
        digits_bunch.data,
        digits_bunch.target,
        test_size=0.20,
        random_state=split_idx,
    )
    return split_idx, arrays

In [None]:
from time import time
from sklearn.externals.joblib import hash


def evaluate_one(model_class, parameters, cv_split):
    """Fit one model on one CV split and report its scores.

    Parameters
    ----------
    model_class : callable
        Called as ``model_class(**parameters)``; the result must provide
        ``fit(X, y)`` (returning the fitted model) and ``score(X, y)``.
    parameters : dict
        Keyword arguments forwarded to ``model_class``.
    cv_split : tuple
        ``(split_idx, (X_train, X_val, y_train, y_val))``, the shape returned
        by ``load_cv_split``.

    Returns
    -------
    dict
        Keys ``split_idx``, ``training_time`` (seconds spent constructing and
        fitting the model), ``train_score``, ``val_score``, ``parameters``
        (the input dict itself) and ``parameters_hash``.
    """
    split_idx, arrays = cv_split
    X_train, X_val, y_train, y_val = arrays

    # Only construction + fit() are timed; scoring happens afterwards.
    started = time()
    fitted = model_class(**parameters).fit(X_train, y_train)
    elapsed = time() - started

    return {
        'split_idx': split_idx,
        'training_time': elapsed,
        'train_score': fitted.score(X_train, y_train),
        'val_score': fitted.score(X_val, y_val),
        'parameters': parameters,
        # `hash` is whatever name is in scope in this cell (the notebook
        # imports joblib's hash just above), used later as the group key.
        'parameters_hash': hash(parameters),
    }

In [None]:
%%time
from sklearn.svm import SVC
from sklearn.grid_search import ParameterSampler
import numpy as np
import pandas as pd

# Search space: C and gamma each take 1001 log-spaced values between 1e-10
# and 1e10, tol takes 4 log-spaced values between 1e-4 and 1e-1.
param_grid = {
    'C': np.logspace(-10, 10, 1001),
    'gamma': np.logspace(-10, 10, 1001),
    'tol': np.logspace(-4, -1, 4),
}

# Draw the 3 candidates once and keep them in a list. The sampler has no
# fixed random_state, so iterating it again for each split (as the nested
# comprehension below does) would otherwise produce a different random draw
# per split and the splits would not be scored on the same candidates.
param_space = list(ParameterSampler(param_grid, 3))

# Two splits, built eagerly in this process.
cv_splits = [load_cv_split(i) for i in range(2)]

# One result dict per (split, candidate) pair, evaluated sequentially.
results = [evaluate_one(SVC, params, split)
           for split in cv_splits
           for params in param_space]

In [None]:
# Tabulate the sequential results. The hash column is only a grouping key, so
# it is left out of the display. `axis` is passed by keyword because the
# positional form is not accepted by newer pandas releases.
pd.DataFrame.from_dict(results).drop('parameters_hash', axis=1)

### Scaling out with dask + distributed

In [None]:
# Sum the values of the mapping returned by e.ncores() to get a total core
# count (presumably one entry per worker — confirm), then display it.
n_cores = sum(e.ncores().values())
n_cores

In [None]:
from dask import delayed, visualize

# Sample n_cores * 10 candidates once and reuse the same list for every
# split. The sampler has no fixed random_state, so re-iterating it per split
# would yield different candidates each time, and no parameter setting would
# be evaluated on more than one split.
param_space = list(ParameterSampler(param_grid, n_cores * 10))

# Three splits, each wrapped as a delayed call so it is built as part of the
# task graph rather than in the notebook process.
cv_splits = [delayed(load_cv_split)(i) for i in range(3)]

# One delayed evaluation per (split, candidate) pair.
delayed_evaluations = [delayed(evaluate_one)(SVC, params, split)
                       for split in cv_splits
                       for params in param_space]

In [None]:
# Peek at the first five delayed objects; they are only submitted for
# computation in the next cell.
delayed_evaluations[:5]

In [None]:
# Hand the whole list of delayed evaluations to the executor. The returned
# objects are used as futures below (polled with .done(), fetched with
# e.gather).
all_results = e.compute(delayed_evaluations)

In [None]:
# Show progress for the submitted evaluations.
progress(all_results)

In [None]:
# Fetch only the futures that are already done, so this cell can be re-run
# while the computation is still in progress to look at partial results.
results = pd.DataFrame.from_dict(
    e.gather([future for future in all_results if future.done()])
)
results.describe()

### Aggregation of scores across CV folds

In [None]:
# Average scores and training time over all rows sharing a parameter hash,
# i.e. over the CV splits evaluated for one parameter setting.
mean_evaluations = results.groupby('parameters_hash').agg({
    'train_score': np.mean,
    'val_score': np.mean,
    'training_time': np.mean,
}).reset_index()

# Lookup table: parameter hash -> parameter values (one column per parameter).
# `results` has one row per (parameters, split) evaluation, so keep a single
# row per hash; otherwise the merge below repeats every averaged row once per
# split and the top-10 table fills up with duplicates.
all_parameters = pd.DataFrame.from_dict(list(results['parameters']))
all_parameters['parameters_hash'] = results['parameters_hash']
all_parameters = all_parameters.drop_duplicates(subset='parameters_hash')

# Attach the parameter values to the averaged scores, then drop the hash,
# which was only needed as the join key.
evaluations = (
    mean_evaluations
    .merge(all_parameters, on='parameters_hash')
    .drop(['parameters_hash'], axis=1)
)

# Ten parameter settings with the best mean validation score.
top10 = evaluations.sort_values(
    by='val_score', ascending=False).head(10)
top10

In [None]:
import matplotlib.pyplot as plt

def plot_param_map(df, target, title):
    """Scatter ``target`` over the (C, gamma) plane on log10 axes.

    Draws on the current pyplot figure and adds a colorbar.

    Parameters
    ----------
    df : DataFrame-like
        Must provide ``'C'`` and ``'gamma'`` columns.
    target : array-like
        Values used to color the points (viridis colormap).
    title : str
        Plot title.
    """
    log_c = np.log10(df['C'])
    log_gamma = np.log10(df['gamma'])

    plt.title(title)
    plt.xlabel('log10(C)')
    plt.ylabel('log10(gamma)')
    # Fixed window of [-10, 10] on both axes.
    plt.xlim(-10, 10)
    plt.ylim(-10, 10)
    plt.scatter(log_c, log_gamma, c=target, cmap='viridis',
                s=80, alpha=1, marker='s', edgecolors='none')
    plt.colorbar()

# Validation-score map of every evaluated setting, with the ten best settings
# overlaid as red dots.
plt.figure(figsize=(6, 5))
plot_param_map(evaluations, evaluations['val_score'], 'validation score')
plt.scatter(
    np.log10(top10['C']),
    np.log10(top10['gamma']),
    s=50,
    c='r',
);

## Scaling down

### Single local python process, multiple threads

In [None]:
import dask
# Time the first 10 delayed evaluations run through dask.compute in this
# process, without the executor (threaded, per the section heading). The
# result is discarded.
%time _ = dask.compute(*delayed_evaluations[:10])

### Single local python thread, no parallelism

In [None]:
from dask.async import get_sync
# Same 10 evaluations, but with get_sync passed as the scheduler so they run
# one after another in a single thread. The result is discarded.
%time _ = dask.compute(*delayed_evaluations[:10], get=get_sync)

In [None]:
# Profile the single-threaded run with %prun to see where the time goes.
%prun _ = dask.compute(*delayed_evaluations[:10], get=get_sync)