In [1]:
# Stop any ipcluster instance left over from an earlier session. A
# "Could not read pid file" message only means no cluster was running.
!ipcluster stop

2016-06-17 15:44:49.813 [IPClusterStop] CRITICAL | Could not read pid file, cluster is probably not running.


In [3]:
# Start a local cluster with 3 engines in the background (--daemon) so the
# notebook can keep running while the engines serve tasks.
!ipcluster start -n=3 --daemon

In [2]:
from ipyparallel import Client
from sklearn.datasets import load_files
import numpy as np
from pprint import pprint
from sklearn.grid_search import ParameterGrid
from sklearn.svm import SVC
from sklearn.externals import joblib
from sklearn.cross_validation import ShuffleSplit
import os

In [6]:
# Connect to the running ipcluster (no arguments: default connection settings).
client = Client()

# Number of engines visible to the client; expected to match the -n value
# given to `ipcluster start`.
len(client)

3

## Memmapping CV Splits for Multiprocess Dataset Sharing

We can leverage the previous tools to build a utility function that **extracts cross-validation splits ahead of time** and persists them on the hard drive in a format suitable for memory mapping by the IPython engine processes.

In [7]:
def persist_cv_splits(X, y, n_cv_iter=5, name='data',
    suffix="_cv_%03d.pkl", test_size=0.25, random_state=None):
    """Write randomized train/test splits of (X, y) to disk, one file per split.

    Parameters
    ----------
    X, y : objects indexable with integer index arrays; ``X.shape[0]`` is
        used as the number of samples.
    n_cv_iter : number of splits to generate.
    name, suffix : each file is named ``name + suffix % split_index``.
    test_size, random_state : forwarded unchanged to ``ShuffleSplit``.

    Returns
    -------
    List of absolute file paths, in split order.
    """
    splitter = ShuffleSplit(X.shape[0], n_iter=n_cv_iter,
                            test_size=test_size, random_state=random_state)

    def _dump_split(index, train, test):
        # Each file stores the 4-tuple (X_train, y_train, X_test, y_test).
        # abspath resolves the name against the current working directory so
        # the path stays valid when handed to another process.
        path = os.path.abspath(name + suffix % index)
        joblib.dump((X[train], y[train], X[test], y[test]), path)
        return path

    return [_dump_split(index, train, test)
            for index, (train, test) in enumerate(splitter)]

In [8]:
def compute_evaluation(cv_split_filename, model, params):
    """Fit ``model`` with ``params`` on one persisted CV split and score it.

    Intended to run on a worker engine: ``cv_split_filename`` is the path of a
    file holding a ``(X_train, y_train, X_validation, y_validation)`` tuple.
    Returns ``model.score`` evaluated on the validation part.
    """
    # Imported inside the function because the body executes in the worker's
    # namespace, where the notebook's top-level imports do not exist.
    from sklearn.externals import joblib

    # mmap_mode='c' requests copy-on-write memory mapping of the stored arrays.
    fold = joblib.load(cv_split_filename, mmap_mode='c')
    X_train, y_train, X_validation, y_validation = fold

    model.set_params(**params)
    model.fit(X_train, y_train)
    return model.score(X_validation, y_validation)

In [9]:
def grid_search(lb_view, model, cv_split_filenames, param_grid):
    """Submit one evaluation task per (parameter setting, CV split) pair.

    Returns ``(all_parameters, all_tasks)``: ``all_parameters`` is the expanded
    list of parameter dicts, and ``all_tasks[i][j]`` is whatever
    ``lb_view.apply`` returned for ``all_parameters[i]`` evaluated on
    ``cv_split_filenames[j]``.
    """
    all_parameters = list(ParameterGrid(param_grid))

    # Outer level follows the parameter settings, inner level the CV splits,
    # so tasks are submitted parameter by parameter.
    all_tasks = [
        [lb_view.apply(compute_evaluation, split_path, model, candidate)
         for split_path in cv_split_filenames]
        for candidate in all_parameters
    ]

    return all_parameters, all_tasks

In [10]:
def progress(tasks):
    """Return the fraction of tasks whose ``ready()`` is true.

    ``tasks`` is a list of task groups (a list of lists); every task in every
    group counts equally.
    """
    ready_flags = []
    for task_group in tasks:
        for task in task_group:
            ready_flags.append(task.ready())
    return np.mean(ready_flags)

In [11]:
def find_bests(all_parameters, all_tasks, n_top=5):
    """Rank parameter settings by the mean score of their completed tasks.

    Parameters
    ----------
    all_parameters : list of parameter dicts.
    all_tasks : list of task groups, aligned with ``all_parameters``; each
        task exposes ``ready()`` and ``get()``.
    n_top : maximum number of entries to return.

    Returns
    -------
    List of ``(mean_score, params)`` tuples, best score first. Settings with
    no completed task are left out; settings with equal mean scores keep
    their original relative order.
    """
    mean_scores = []

    for param, task_group in zip(all_parameters, all_tasks):
        # Only finished tasks contribute, so this can be called mid-run.
        scores = [t.get() for t in task_group if t.ready()]
        if not scores:
            continue
        mean_scores.append((np.mean(scores), param))

    # Sort on the score alone. Sorting the whole (score, params) tuples falls
    # back to comparing the params dicts when two scores tie, which raises
    # TypeError on Python 3.
    ranked = sorted(mean_scores, key=lambda entry: entry[0], reverse=True)
    return ranked[:n_top]

In [12]:
def get_sentiment_data(file_path):
    """Load the documents under ``file_path`` and drop the empty ones.

    The directory is read with ``load_files`` (``random_state=41``). Returns
    ``(X, y)``: two parallel lists holding the non-empty documents and their
    target labels.
    """
    bunch = load_files(file_path, random_state=41)

    # Filter out empty reviews, keeping each document paired with its label.
    kept = [(document, label)
            for label, document in zip(bunch.target, bunch.data)
            if document]

    X = [document for document, _ in kept]
    y = [label for _, label in kept]
    return X, y

In [13]:
# Location of the sentiment corpus. Set the SENTIMENT_DATA_DIR environment
# variable to point at another copy; when it is unset, the original
# hard-coded location is used, so existing behaviour is unchanged.
file_path = os.environ.get(
    'SENTIMENT_DATA_DIR',
    '/Users/Alexander/Documents/Data/sentiment/data/imdb1/')
X, Y = get_sentiment_data(file_path)

In [None]:
# get_sentiment_data returns plain Python lists, but persist_cv_splits reads
# X.shape and indexes with integer arrays, so convert to NumPy arrays first.
# dtype=object keeps each document as-is instead of padding every entry to
# the length of the longest one.
# (The former `digits = load_digits()` line was dropped: load_digits is never
# imported, so it raised NameError, and its result was not used.)
# NOTE(review): the documents are still raw text at this point; SVC needs
# numeric features, so a vectorization step appears to be missing - confirm.
digits_split_filenames = persist_cv_splits(
    np.asarray(X, dtype=object), np.asarray(Y),
    name='digits', random_state=42)
digits_split_filenames

In [None]:
# List the split files written to the working directory. The explicit `!`
# shell escape does not depend on automagic being enabled or on the name `ls`
# being free in the notebook namespace.
!ls -lh digits*

In [None]:
# Candidate SVC hyper-parameters: 4 values of C and 5 values of gamma, each
# spaced evenly on a log scale.
svc_params = dict(
    C=np.logspace(-1, 2, 4),
    gamma=np.logspace(-4, 0, 5),
)
pprint(svc_params)

In [None]:
# Expand the grid into the explicit list of parameter combinations
# (4 values of C x 5 values of gamma = 20 settings).
list(ParameterGrid(svc_params))

In [None]:
# View through which the tasks are handed out to the engines.
lb_view = client.load_balanced_view()
model = SVC()

# Same search space as displayed above, restated so this cell is self-contained.
svc_params = dict(
    C=np.logspace(-1, 2, 4),
    gamma=np.logspace(-4, 0, 5),
)

# One task per (parameter setting, CV split) pair; completion is polled in
# the cells that follow.
all_parameters, all_tasks = grid_search(
    lb_view, model, digits_split_filenames, svc_params)

In [None]:
# Percentage of submitted tasks that have finished so far; re-run this cell to
# refresh the figure.
print("Tasks completed: {0}%".format(100 * progress(all_tasks)))

In [None]:
# Progress again, followed by the best parameter settings found so far.
# find_bests only uses tasks that are already complete, so the ranking can
# change between runs until progress reaches 100%.
print("Tasks completed: {0}%".format(100 * progress(all_tasks)))
pprint(find_bests(all_parameters, all_tasks))