
[WIP] Use dask.delayed within fit #730

Merged (34 commits, Aug 29, 2018)
Commits
8cb1fae
[WIP] Use dask.delayed within fit
mrocklin Jul 15, 2018
2e1b373
add dask[delayed] to ci configurations
mrocklin Jul 16, 2018
51cf0cd
Fixup fail side
TomAugspurger Jul 25, 2018
d1d10a0
Fixed return shape
TomAugspurger Aug 6, 2018
7e7e68f
Reuse dask-ml
TomAugspurger Aug 7, 2018
d06edd6
configurable delayed
TomAugspurger Aug 7, 2018
6d370ec
typo
TomAugspurger Aug 7, 2018
8808c77
Merge remote-tracking branch 'upstream/development' into mrocklin-dask
TomAugspurger Aug 7, 2018
5d21024
toggle approach
TomAugspurger Aug 8, 2018
439ae5c
wip
TomAugspurger Aug 8, 2018
d9aca85
chunking
TomAugspurger Aug 8, 2018
b919bed
assign
TomAugspurger Aug 8, 2018
54011ff
push debugging code
TomAugspurger Aug 9, 2018
2050b2d
some cleanup, tests
TomAugspurger Aug 15, 2018
2577d08
some cleanup, tests
TomAugspurger Aug 15, 2018
36b2d23
Docs
TomAugspurger Aug 15, 2018
3ba6082
dependencies
TomAugspurger Aug 15, 2018
c378519
Bump dask-ml version
TomAugspurger Aug 15, 2018
2eb71dd
debugging CI
TomAugspurger Aug 16, 2018
4ca3b95
debugging CI
TomAugspurger Aug 16, 2018
ef325b4
debugging
TomAugspurger Aug 16, 2018
a3102ac
Handle training errors
TomAugspurger Aug 21, 2018
3144977
Try dask-ml master
TomAugspurger Aug 21, 2018
a80888f
test
TomAugspurger Aug 21, 2018
6a85646
Trigger CI
TomAugspurger Aug 21, 2018
22226f2
Trigger CI
TomAugspurger Aug 21, 2018
84b4474
remove pythonhashseed
TomAugspurger Aug 21, 2018
ea3a1bd
print debug
TomAugspurger Aug 22, 2018
d9b4d9a
Try single-threaded
TomAugspurger Aug 29, 2018
4342853
smaller
TomAugspurger Aug 29, 2018
d279253
Handle failure on master
TomAugspurger Aug 29, 2018
ac6b770
skip that test
TomAugspurger Aug 29, 2018
b3342fb
Install from git
TomAugspurger Aug 29, 2018
3d2fcd1
Doc / cleanup
TomAugspurger Aug 29, 2018

Changes from 1 commit

22 changes: 12 additions & 10 deletions tpot/base.py
@@ -46,6 +46,7 @@
from tqdm import tqdm
from copy import copy, deepcopy

+import dask
Contributor: Please add installation of dask into ci/.travis_install.sh and .appveyor.yml for the unit tests.

Contributor Author: Done. Though, just to reiterate, I'm not trying to get tests to work here at all. This is only up here for conversation.

Contributor: OK, thanks. But it seems that it passed almost all the unit tests. Great!

from sklearn.base import BaseEstimator
from sklearn.utils import check_X_y
from sklearn.externals.joblib import Parallel, delayed, Memory
@@ -526,9 +527,9 @@ def fit(self, features, target, sample_weight=None, groups=None):
target: array-like {n_samples}
List of class labels for prediction
sample_weight: array-like {n_samples}, optional
Per-sample weights. Higher weights indicate more importance. If specified,
sample_weight will be passed to any pipeline element whose fit() function accepts
a sample_weight argument. By default, using sample_weight does not affect tpot's
scoring functions, which determine preferences between pipelines.
groups: array-like, with shape {n_samples, }, optional
Group labels for the samples used when performing cross-validation.
@@ -1154,15 +1155,15 @@ def _evaluate_individuals(self, individuals, features, target, sample_weight=Non
scoring_function=self.scoring_function,
sample_weight=sample_weight,
groups=groups,
-timeout=self.max_eval_time_seconds
+timeout=self.max_eval_time_seconds,
)

result_score_list = []
# Don't use parallelization if n_jobs==1
if self.n_jobs == 1:
for sklearn_pipeline in sklearn_pipeline_list:
self._stop_by_max_time_mins()
-val = partial_wrapped_cross_val_score(sklearn_pipeline=sklearn_pipeline)
+val = partial_wrapped_cross_val_score(sklearn_pipeline=sklearn_pipeline, delayed=lambda x: x)
result_score_list = self._update_val(val, result_score_list)
else:
# chunk size for pbar update
@@ -1171,12 +1172,13 @@ def _evaluate_individuals(self, individuals, features, target, sample_weight=Non
for chunk_idx in range(0, len(sklearn_pipeline_list), chunk_size):
self._stop_by_max_time_mins()
parallel = Parallel(n_jobs=self.n_jobs, verbose=0, pre_dispatch='2*n_jobs')
-tmp_result_scores = parallel(delayed(partial_wrapped_cross_val_score)(sklearn_pipeline=sklearn_pipeline)
-                             for sklearn_pipeline in sklearn_pipeline_list[chunk_idx:chunk_idx + chunk_size])
-# update pbar
-for val in tmp_result_scores:
-    result_score_list = self._update_val(val, result_score_list)
+tmp_result_scores = [partial_wrapped_cross_val_score(sklearn_pipeline=sklearn_pipeline,
+                                                     delayed=dask.delayed)
+                     for sklearn_pipeline in sklearn_pipeline_list[chunk_idx:chunk_idx + chunk_size]]
+result_score_list.extend(tmp_result_scores)

+result_score_list = dask.compute(*result_score_list)
+self._update_pbar(len(result_score_list))
self._update_evaluated_individuals_(result_score_list, eval_individuals_str, operator_counts, stats_dicts)

"""Look up the operator count and cross validation score to use in the optimization"""
50 changes: 28 additions & 22 deletions tpot/gp_deap.py
@@ -23,6 +23,7 @@

"""

+import dask
import numpy as np
from deap import tools, gp
from inspect import isclass
@@ -395,7 +396,8 @@ def mutNodeReplacement(individual, pset):

@threading_timeoutable(default="Timeout")
def _wrapped_cross_val_score(sklearn_pipeline, features, target,
-cv, scoring_function, sample_weight=None, groups=None):
+cv, scoring_function, sample_weight=None,
+groups=None, delayed=lambda x: x):
"""Fit estimator and compute scores for a given dataset split.
Parameters
----------
@@ -425,24 +427,28 @@ def _wrapped_cross_val_score(sklearn_pipeline, features, target,

cv = check_cv(cv, target, classifier=is_classifier(sklearn_pipeline))
cv_iter = list(cv.split(features, target, groups))
-scorer = check_scoring(sklearn_pipeline, scoring=scoring_function)

-try:
-    with warnings.catch_warnings():
-        warnings.simplefilter('ignore')
-        scores = [_fit_and_score(estimator=clone(sklearn_pipeline),
-                                 X=features,
-                                 y=target,
-                                 scorer=scorer,
-                                 train=train,
-                                 test=test,
-                                 verbose=0,
-                                 parameters=None,
-                                 fit_params=sample_weight_dict)
-                  for train, test in cv_iter]
-        CV_score = np.array(scores)[:, 0]
-        return np.nanmean(CV_score)
-except TimeoutException:
-    return "Timeout"
-except Exception as e:
-    return -float('inf')
+scorer = delayed(check_scoring)(sklearn_pipeline, scoring=scoring_function)

+def safe_fit_and_score(*args, **kwargs):
+    try:
+        return _fit_and_score(*args, **kwargs)
+    except Exception:
+        return -float('inf')
+
+with warnings.catch_warnings():
+    warnings.simplefilter('ignore')
+    # TODO: dive into and delay fit/transform calls on sklearn_pipeline.steps appropriately.
+    # This will help with shared intermediate results, profiling, etc.
+    # It looks like dask_ml.model_selection._search.do_fit_and_score might have good logic here.
Contributor Author: @TomAugspurger is this task easy for you by any chance?

Contributor: Sure, I can take a look today, I think.

Contributor Author: TPOT is a fun problem to play with :)

Contributor Author: Alternatively, @jcrist, if you're around and have time: you've probably done this before :)
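
A hypothetical sketch of what that TODO is after, assuming nothing about TPOT's internals: if each step's fit/transform becomes its own delayed task, two candidate pipelines that share a preprocessing prefix point at the same task, so the shared step is fit only once per dask.compute call:

    import dask
    from dask import delayed

    @delayed
    def fit_transform(step_name, X):
        # Stand-in for fitting one pipeline step and transforming X.
        return X

    X = list(range(10))
    scaled = fit_transform("scaler", X)              # shared prefix: one task
    model_a = fit_transform("random_forest", scaled)
    model_b = fit_transform("logistic", scaled)

    # One compute evaluates both candidate pipelines; the "scaler" task
    # appears once in the merged graph and therefore runs only once.
    dask.compute(model_a, model_b)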

+    scores = [delayed(safe_fit_and_score)(estimator=delayed(clone)(sklearn_pipeline),
+                                          X=features,
+                                          y=target,
+                                          scorer=scorer,
+                                          train=train,
+                                          test=test,
+                                          verbose=0,
+                                          parameters=None,
+                                          fit_params=sample_weight_dict)
+              for train, test in cv_iter]
+    CV_score = delayed(np.array)(scores)[:, 0]
+    return delayed(np.nanmean)(CV_score)
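
A note on the last two lines: Delayed objects implement __getitem__, so the [:, 0] column selection on delayed(np.array)(scores) is itself lazy, and dask.delayed also traverses the list of per-fold Delayed results passed to np.array. A small self-contained sketch with made-up fold scores:

    import dask
    import numpy as np

    def fold_result(score):
        # Stand-in for _fit_and_score's per-split (score, fit_time) output.
        return (score, 0.5)

    scores = [dask.delayed(fold_result)(s) for s in (0.8, 0.9, 0.7)]

    cv_score = dask.delayed(np.array)(scores)[:, 0]  # lazy column selection
    mean_score = dask.delayed(np.nanmean)(cv_score)

    print(mean_score.compute())  # ~0.8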