From f505039c7939aee4b497985b97bbcc9119470646 Mon Sep 17 00:00:00 2001 From: weixuanfu2016 Date: Fri, 13 Apr 2018 13:43:54 -0400 Subject: [PATCH 01/60] add more GP type for strongly typed GP --- tpot/base.py | 2 +- tpot/gp_types.py | 9 ++++++++- tpot/operator_utils.py | 18 ++++++++++++------ 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/tpot/base.py b/tpot/base.py index d325985b..9549b325 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -70,7 +70,7 @@ from .config.classifier_sparse import classifier_config_sparse from .metrics import SCORERS -from .gp_types import Output_Array +from .gp_types import Output_Array, Transformed_Array, Selected_Array from .gp_deap import eaMuPlusLambda, mutNodeReplacement, _wrapped_cross_val_score, cxOnePoint # hot patch for Windows: solve the problem of crashing python after Ctrl + C in Windows OS diff --git a/tpot/gp_types.py b/tpot/gp_types.py index 1d4e8459..fa507d8b 100644 --- a/tpot/gp_types.py +++ b/tpot/gp_types.py @@ -24,6 +24,13 @@ """ class Output_Array(object): - """Output data type of pipelines.""" + """Final output data type of pipelines.""" + pass + +class Transformed_Array(object): + """Transformed data returned by Transformer.""" + pass +class Selected_Array(object): + """Transformed data returned by Selector.""" pass diff --git a/tpot/operator_utils.py b/tpot/operator_utils.py index 9407964f..453359d6 100644 --- a/tpot/operator_utils.py +++ b/tpot/operator_utils.py @@ -25,6 +25,7 @@ import numpy as np from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin +from sklearn.feature_selection.base import SelectorMixin import inspect @@ -157,23 +158,28 @@ def TPOTOperatorClassFactory(opsourse, opdict, BaseClass=Operator, ArgBaseClass= dep_op_list = {} # list of nested estimator/callable function dep_op_type = {} # type of nested estimator/callable function import_str, op_str, op_obj = source_decode(opsourse) - + class_profile['root'] = False if not op_obj: return None, None else: # define if the operator can be the root of a pipeline - if issubclass(op_obj, ClassifierMixin) or issubclass(op_obj, RegressorMixin): + if issubclass(op_obj, ClassifierMixin): class_profile['root'] = True - optype = "Classifier or Regressor" - else: - optype = "Preprocessor or Selector" + optype = "Classifier" + elif issubclass(op_obj, RegressorMixin): + class_profile['root'] = True + optype = "Regressor" + elif issubclass(op_obj, TransformerMixin): + optype = "Transformer" + elif issubclass(op_obj, SelectorMixin): + optype = "Selector" @classmethod def op_type(cls): """Return the operator type. 
Possible values: - "Classifier", "Regressor", "Selector", "Preprocessor" + "Classifier", "Regressor", "Selector", "Transformer" """ return optype From fa96afe9b84cb6e3d1df54db04ea93077db641b3 Mon Sep 17 00:00:00 2001 From: weixuanfu2016 Date: Fri, 13 Apr 2018 14:24:30 -0400 Subject: [PATCH 02/60] clean up codes --- tpot/operator_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpot/operator_utils.py b/tpot/operator_utils.py index 453359d6..87f90bb3 100644 --- a/tpot/operator_utils.py +++ b/tpot/operator_utils.py @@ -158,7 +158,7 @@ def TPOTOperatorClassFactory(opsourse, opdict, BaseClass=Operator, ArgBaseClass= dep_op_list = {} # list of nested estimator/callable function dep_op_type = {} # type of nested estimator/callable function import_str, op_str, op_obj = source_decode(opsourse) - class_profile['root'] = False + if not op_obj: return None, None else: From 3a44c4229cd8c5eb071bf2c3d5a35ce73b9171b9 Mon Sep 17 00:00:00 2001 From: weixuanfu2016 Date: Fri, 13 Apr 2018 14:36:28 -0400 Subject: [PATCH 03/60] add tempate param --- tpot/base.py | 4 +++- tpot/operator_utils.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tpot/base.py b/tpot/base.py index 9549b325..387b3388 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -102,7 +102,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, mutation_rate=0.9, crossover_rate=0.1, scoring=None, cv=5, subsample=1.0, n_jobs=1, max_time_mins=None, max_eval_time_mins=5, - random_state=None, config_dict=None, + random_state=None, config_dict=None, template=None, warm_start=False, memory=None, periodic_checkpoint_folder=None, early_stop=None, verbosity=0, disable_update_check=False): @@ -200,6 +200,8 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, String 'TPOT sparse': TPOT uses a configuration dictionary with a one-hot-encoder and the operators normally included in TPOT that also support sparse matrices. + template: (default: None) + A template for pipeline structure warm_start: bool, optional (default: False) Flag indicating whether the TPOT instance will reuse the population from previous calls to fit(). diff --git a/tpot/operator_utils.py b/tpot/operator_utils.py index 87f90bb3..75155f2f 100644 --- a/tpot/operator_utils.py +++ b/tpot/operator_utils.py @@ -228,7 +228,7 @@ def parameter_types(cls): operator """ - return ([np.ndarray] + arg_types, np.ndarray) + return ([np.ndarray] + arg_types, np.ndarray) # (input types, return types) class_profile['parameter_types'] = parameter_types From fbb78b58f92a444880b28b9c4602760cfb77d90b Mon Sep 17 00:00:00 2001 From: Weixuan Fu Date: Mon, 16 Apr 2018 10:59:09 -0400 Subject: [PATCH 04/60] add fix length param --- tpot/base.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/tpot/base.py b/tpot/base.py index 387b3388..09fe45e0 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -200,7 +200,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, String 'TPOT sparse': TPOT uses a configuration dictionary with a one-hot-encoder and the operators normally included in TPOT that also support sparse matrices. 
- template: (default: None) + template: Python list (default: None) A template for pipeline structure warm_start: bool, optional (default: False) Flag indicating whether the TPOT instance will reuse the population from @@ -286,6 +286,18 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, self.config_dict_params = config_dict self._setup_config(self.config_dict_params) + self.template = template.split('-') + self.fixed_length = len(self.template) + # for now, the template is only a linear pipeline + # will add supports for more complex structure + + if self.fixed_length: ++ self._min = self.fixed_length ++ self._max = self.fixed_length + 1 ++ else: ++ self._min = 1 ++ self._max = 3 + self.operators = [] self.arguments = [] for key in sorted(self.config_dict.keys()): @@ -496,12 +508,16 @@ def _setup_toolbox(self): creator.create('Individual', gp.PrimitiveTree, fitness=creator.FitnessMulti, statistics=dict) self._toolbox = base.Toolbox() - self._toolbox.register('expr', self._gen_grow_safe, pset=self._pset, min_=1, max_=3) + self._toolbox.register('expr', self._gen_grow_safe, pset=self._pset, min_=self._min, max_=self._max) self._toolbox.register('individual', tools.initIterate, creator.Individual, self._toolbox.expr) self._toolbox.register('population', tools.initRepeat, list, self._toolbox.individual) self._toolbox.register('compile', self._compile_to_sklearn) self._toolbox.register('select', tools.selNSGA2) self._toolbox.register('mate', self._mate_operator) + if self.fixed_length: ++ self._toolbox.register('expr_mut', self._gen_grow_safe, min_=self._min, max_=self._max) ++ else: ++ self._toolbox.register('expr_mut', self._gen_grow_safe, min_=self._min, max_=self._max + 1) self._toolbox.register('expr_mut', self._gen_grow_safe, min_=1, max_=4) self._toolbox.register('mutate', self._random_mutation_operator) From c9a4aaea35606098cf57cdc2b99a9d429815da7a Mon Sep 17 00:00:00 2001 From: Weixuan Fu Date: Mon, 16 Apr 2018 11:55:45 -0400 Subject: [PATCH 05/60] inital trial for template function --- tpot/base.py | 109 ++++++++++++++++++++++++++++------------------- tpot/gp_types.py | 8 ---- 2 files changed, 66 insertions(+), 51 deletions(-) diff --git a/tpot/base.py b/tpot/base.py index 09fe45e0..cf3d9f50 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -70,7 +70,7 @@ from .config.classifier_sparse import classifier_config_sparse from .metrics import SCORERS -from .gp_types import Output_Array, Transformed_Array, Selected_Array +from .gp_types import Output_Array from .gp_deap import eaMuPlusLambda, mutNodeReplacement, _wrapped_cross_val_score, cxOnePoint # hot patch for Windows: solve the problem of crashing python after Ctrl + C in Windows OS @@ -286,8 +286,8 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, self.config_dict_params = config_dict self._setup_config(self.config_dict_params) - self.template = template.split('-') - self.fixed_length = len(self.template) + self.template = template + self.fixed_length = len(template.split('-').template) # for now, the template is only a linear pipeline # will add supports for more complex structure @@ -467,31 +467,61 @@ def _setup_pset(self): if self.verbosity > 2: print('{} operators have been imported by TPOT.'.format(len(self.operators))) + def _add_operators(self): - for operator in self.operators: - if operator.root: - # We need to add rooted primitives twice so that they can - # return both an Output_Array (and thus be the root of the tree), - # and return a np.ndarray so they can exist 
elsewhere in the tree. - p_types = (operator.parameter_types()[0], Output_Array) - self._pset.addPrimitive(operator, *p_types) - - self._pset.addPrimitive(operator, *operator.parameter_types()) - - # Import required modules into local namespace so that pipelines - # may be evaluated directly - for key in sorted(operator.import_hash.keys()): - module_list = ', '.join(sorted(operator.import_hash[key])) - - if key.startswith('tpot.'): - exec('from {} import {}'.format(key[4:], module_list)) + main_type = ["Classifier", "Regressor", "Selector", "Transformer"] + if self.template: + steps = self.template.split('-') + ret_types = [] + for idx, step in enumerate(steps): + if idx < self.fixed_length - 1: + # create an empty for returning class for strongly-type GP + step_ret_type_name = 'Ret_{}'.format(idx) + step_ret_type = type(step_ret_type_name, (object,), {}) + ret_types.append(step_ret_type) + else: + step_ret_type = Output_Array + # input class in each step + if idx: + step_in_type = np.ndarray else: - exec('from {} import {}'.format(key, module_list)) + step_in_type = ret_types[idx-1] + if main_type.count(step): # if the step is a main type + for operator in self.operators: + if operator.optype() == step: + p_types = ([step_in_type] + operator.parameter_types()[0][1:], step_ret_type) + self._pset.addPrimitive(operator, *p_types) + else: # is the step is a specific operator + for operator in self.operators: + if operator.__name__ == step: + p_types = ([step_in_type] + operator.parameter_types()[0][1:], step_ret_type) + self._pset.addPrimitive(operator, *p_types) + else: # no template and randomly generated pipeline + for operator in self.operators: + if operator.root: + # We need to add rooted primitives twice so that they can + # return both an Output_Array (and thus be the root of the tree), + # and return a np.ndarray so they can exist elsewhere in the tree. + p_types = (operator.parameter_types()[0], Output_Array) + self._pset.addPrimitive(operator, *p_types) + + self._pset.addPrimitive(operator, *operator.parameter_types()) + + # Import required modules into local namespace so that pipelines + # may be evaluated directly + for key in sorted(operator.import_hash.keys()): + module_list = ', '.join(sorted(operator.import_hash[key])) + + if key.startswith('tpot.'): + exec('from {} import {}'.format(key[4:], module_list)) + else: + exec('from {} import {}'.format(key, module_list)) - for var in operator.import_hash[key]: - self.operators_context[var] = eval(var) + for var in operator.import_hash[key]: + self.operators_context[var] = eval(var) + + self._pset.addPrimitive(CombineDFs(), [np.ndarray, np.ndarray], np.ndarray) - self._pset.addPrimitive(CombineDFs(), [np.ndarray, np.ndarray], np.ndarray) def _add_terminals(self): for _type in self.arguments: @@ -1368,15 +1398,17 @@ def _random_mutation_operator(self, individual, allow_shrink=True): Returns the individual with one of the mutations applied to it """ - mutation_techniques = [ - partial(gp.mutInsert, pset=self._pset), - partial(mutNodeReplacement, pset=self._pset) - ] - - # We can't shrink pipelines with only one primitive, so we only add it if we find more primitives. 
- number_of_primitives = sum([isinstance(node, deap.gp.Primitive) for node in individual]) - if number_of_primitives > 1 and allow_shrink: - mutation_techniques.append(partial(gp.mutShrink)) + if self.fixed_length: + mutation_techniques = [partial(mutNodeReplacement, pset=self._pset)] + else: + mutation_techniques = [ + partial(gp.mutInsert, pset=self._pset), + partial(mutNodeReplacement, pset=self._pset) + ] + # We can't shrink pipelines with only one primitive, so we only add it if we find more primitives. + number_of_primitives = sum([isinstance(node, deap.gp.Primitive) for node in individual]) + if number_of_primitives > 1 and allow_shrink: + mutation_techniques.append(partial(gp.mutShrink)) mutator = np.random.choice(mutation_techniques) @@ -1386,15 +1418,6 @@ def _random_mutation_operator(self, individual, allow_shrink=True): ind = self._toolbox.clone(individual) offspring, = mutator(ind) if str(offspring) not in self.evaluated_individuals_: - # Update statistics - # crossover_count is kept the same as for the predecessor - # mutation count is increased by 1 - # predecessor is set to the string representation of the individual before mutation - # generation is set to 'INVALID' such that we can recognize that it should be updated accordingly - offspring.statistics['crossover_count'] = individual.statistics['crossover_count'] - offspring.statistics['mutation_count'] = individual.statistics['mutation_count'] + 1 - offspring.statistics['predecessor'] = (str(individual),) - offspring.statistics['generation'] = 'INVALID' break else: unsuccesful_mutations += 1 @@ -1402,7 +1425,7 @@ def _random_mutation_operator(self, individual, allow_shrink=True): # Sometimes you have pipelines for which every shrunk version has already been explored too. # To still mutate the individual, one of the two other mutators should be applied instead. 
if ((unsuccesful_mutations == 50) and - (type(mutator) is partial and mutator.func is gp.mutShrink)): + (type(mutator) is partial and mutator.func is gp.mutShrink)): offspring, = self._random_mutation_operator(individual, allow_shrink=False) return offspring, diff --git a/tpot/gp_types.py b/tpot/gp_types.py index fa507d8b..d479b653 100644 --- a/tpot/gp_types.py +++ b/tpot/gp_types.py @@ -26,11 +26,3 @@ class Output_Array(object): """Final output data type of pipelines.""" pass - -class Transformed_Array(object): - """Transformed data returned by Transformer.""" - pass - -class Selected_Array(object): - """Transformed data returned by Selector.""" - pass From bdd7f7d4d2213748c34f94cf5c9f0d18c142e3eb Mon Sep 17 00:00:00 2001 From: Weixuan Fu Date: Mon, 16 Apr 2018 13:08:07 -0400 Subject: [PATCH 06/60] template works --- tpot/base.py | 83 ++++++++++++++++++++++++------------------ tpot/operator_utils.py | 4 +- 2 files changed, 50 insertions(+), 37 deletions(-) diff --git a/tpot/base.py b/tpot/base.py index cf3d9f50..ef69b749 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -287,21 +287,21 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, self._setup_config(self.config_dict_params) self.template = template - self.fixed_length = len(template.split('-').template) - # for now, the template is only a linear pipeline - # will add supports for more complex structure - - if self.fixed_length: -+ self._min = self.fixed_length -+ self._max = self.fixed_length + 1 -+ else: -+ self._min = 1 -+ self._max = 3 + if template: + self.fixed_length = len(template.split('-')) + # for now, the template is only a linear pipeline + # will add supports for more complex structure + self._min = self.fixed_length + self._max = self.fixed_length + 1 + else: + self.fixed_length = 0 + self._min = 1 + self._max = 3 self.operators = [] - self.arguments = [] + for key in sorted(self.config_dict.keys()): - op_class, arg_types = TPOTOperatorClassFactory( + op_class, _ = TPOTOperatorClassFactory( key, self.config_dict[key], BaseClass=Operator, @@ -309,7 +309,6 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, ) if op_class: self.operators.append(op_class) - self.arguments += arg_types # Schedule TPOT to run for many generations if the user specifies a # run-time limit TPOT will automatically interrupt itself when the timer @@ -462,7 +461,6 @@ def _setup_pset(self): self._pset = gp.PrimitiveSetTyped('MAIN', [np.ndarray], Output_Array) self._pset.renameArguments(ARG0='input_matrix') self._add_operators() - self._add_terminals() if self.verbosity > 2: print('{} operators have been imported by TPOT.'.format(len(self.operators))) @@ -470,9 +468,9 @@ def _setup_pset(self): def _add_operators(self): main_type = ["Classifier", "Regressor", "Selector", "Transformer"] + ret_types = [] if self.template: steps = self.template.split('-') - ret_types = [] for idx, step in enumerate(steps): if idx < self.fixed_length - 1: # create an empty for returning class for strongly-type GP @@ -483,21 +481,28 @@ def _add_operators(self): step_ret_type = Output_Array # input class in each step if idx: - step_in_type = np.ndarray - else: step_in_type = ret_types[idx-1] + else: + step_in_type = np.ndarray if main_type.count(step): # if the step is a main type for operator in self.operators: - if operator.optype() == step: - p_types = ([step_in_type] + operator.parameter_types()[0][1:], step_ret_type) + arg_types = operator.parameter_types()[0][1:] + if operator.type() == step: + p_types = 
([step_in_type] + arg_types, step_ret_type) self._pset.addPrimitive(operator, *p_types) + self._import_hash(operator) + self._add_terminals(arg_types) else: # is the step is a specific operator for operator in self.operators: + arg_types = operator.parameter_types()[0][1:] if operator.__name__ == step: - p_types = ([step_in_type] + operator.parameter_types()[0][1:], step_ret_type) + p_types = ([step_in_type] + arg_types, step_ret_type) self._pset.addPrimitive(operator, *p_types) + self._import_hash(operator) + self._add_terminals(arg_types) else: # no template and randomly generated pipeline for operator in self.operators: + arg_types = operator.parameter_types()[0][1:] if operator.root: # We need to add rooted primitives twice so that they can # return both an Output_Array (and thus be the root of the tree), @@ -509,22 +514,30 @@ def _add_operators(self): # Import required modules into local namespace so that pipelines # may be evaluated directly - for key in sorted(operator.import_hash.keys()): - module_list = ', '.join(sorted(operator.import_hash[key])) + self._import_hash(operator) + self._add_terminals(arg_types) - if key.startswith('tpot.'): - exec('from {} import {}'.format(key[4:], module_list)) - else: - exec('from {} import {}'.format(key, module_list)) - - for var in operator.import_hash[key]: - self.operators_context[var] = eval(var) self._pset.addPrimitive(CombineDFs(), [np.ndarray, np.ndarray], np.ndarray) + self.ret_types = [np.ndarray, Output_Array] + ret_types - def _add_terminals(self): - for _type in self.arguments: + def _import_hash(self, operator): + # Import required modules into local namespace so that pipelines + # may be evaluated directly + for key in sorted(operator.import_hash.keys()): + module_list = ', '.join(sorted(operator.import_hash[key])) + + if key.startswith('tpot.'): + exec('from {} import {}'.format(key[4:], module_list)) + else: + exec('from {} import {}'.format(key, module_list)) + + for var in operator.import_hash[key]: + self.operators_context[var] = eval(var) + + def _add_terminals(self, arg_types): + for _type in arg_types: type_values = list(_type.values) for val in type_values: @@ -545,9 +558,9 @@ def _setup_toolbox(self): self._toolbox.register('select', tools.selNSGA2) self._toolbox.register('mate', self._mate_operator) if self.fixed_length: -+ self._toolbox.register('expr_mut', self._gen_grow_safe, min_=self._min, max_=self._max) -+ else: -+ self._toolbox.register('expr_mut', self._gen_grow_safe, min_=self._min, max_=self._max + 1) + self._toolbox.register('expr_mut', self._gen_grow_safe, min_=self._min, max_=self._max) + else: + self._toolbox.register('expr_mut', self._gen_grow_safe, min_=self._min, max_=self._max + 1) self._toolbox.register('expr_mut', self._gen_grow_safe, min_=1, max_=4) self._toolbox.register('mutate', self._random_mutation_operator) @@ -1453,7 +1466,7 @@ def _gen_grow_safe(self, pset, min_, max_, type_=None): def condition(height, depth, type_): """Stop when the depth is equal to height or when a node should be a terminal.""" - return type_ not in [np.ndarray, Output_Array] or depth == height + return type_ not in self.ret_types or depth == height return self._generate(pset, min_, max_, condition, type_) diff --git a/tpot/operator_utils.py b/tpot/operator_utils.py index 75155f2f..de147870 100644 --- a/tpot/operator_utils.py +++ b/tpot/operator_utils.py @@ -169,9 +169,9 @@ def TPOTOperatorClassFactory(opsourse, opdict, BaseClass=Operator, ArgBaseClass= elif issubclass(op_obj, RegressorMixin): class_profile['root'] = 
True optype = "Regressor" - elif issubclass(op_obj, TransformerMixin): + if issubclass(op_obj, TransformerMixin): optype = "Transformer" - elif issubclass(op_obj, SelectorMixin): + if issubclass(op_obj, SelectorMixin): optype = "Selector" @classmethod From 87bf8c82f9065f5858019c8109cf2dc2b7fea6bc Mon Sep 17 00:00:00 2001 From: Trang Le Date: Wed, 18 Apr 2018 10:42:00 -0400 Subject: [PATCH 07/60] Dataset selector First draft: Dataset selector as additional TPOT's operator --- tpot/builtins/DatasetSelector.py | 75 ++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 tpot/builtins/DatasetSelector.py diff --git a/tpot/builtins/DatasetSelector.py b/tpot/builtins/DatasetSelector.py new file mode 100644 index 00000000..2c6f0105 --- /dev/null +++ b/tpot/builtins/DatasetSelector.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Mar 23 2018 + +@author: grixor +""" +import numpy as np +import pandas as pd +import os, os.path +from sklearn.base import BaseEstimator, TransformerMixin + + +class DatasetSelector(BaseEstimator): + """Select predefined data subsets.""" + + def __init__(self, subset_dir=None): + """Create a DatasetSelector object. + + Parameters + ---------- + subset_dir: directory, required + Path to folder that stores the feature list files. Currently, + each file needs to be a .csv with one header row. The feature + names in these files must match those in the (training and + testing) dataset. + + Returns + ------- + None + + """ + self.subset_dir = subset_dir + + def get_subset(self, input_data, input_target): + """Fit an optimized machine learning pipeline using TPOT. + + Uses genetic programming to optimize a machine learning pipeline that + maximizes score on the provided features and target. Performs internal + k-fold cross-validaton to avoid overfitting on the provided data. The + best pipeline is then trained on the entire set of provided samples. 
+ + Parameters + ---------- + input_data: array-like {n_samples, n_features} + Feature matrix + + input_target: array-like {n_samples} + List of class labels for prediction + + Returns + ------- + self.data_subset: object + Returns a list of subsets of input_data + + """ + + self.input_data = input_data + self.input_target = input_target + self.feature_names = list(self.input_data.columns.values) + + self.subset_files = os.listdir(self.subset_dir) + self.num_subset = len(self.subset_files) + self.feature_set = {} + self.data_subset = {} + self.population_size = population_size + + for i in range(self.num_subset): + self.subset_i = self.subset_dir + "/" + self.subset_files[i] + self.features_i_df = pd.read_csv(self.subset_i, sep='\t', header=0) + self.feature_i = set(features_i_df.values.flatten()) + self.feature_set[i] = list(feature_i.intersection(set(self.feature_names))) + self.data_subset[i] = self.input_data[self.feature_set[i]] + + return self.data_subset From 4b1e6a4c64c50d553726bfc2a60436276adb9186 Mon Sep 17 00:00:00 2001 From: weixuanfu2016 Date: Thu, 19 Apr 2018 12:02:29 -0400 Subject: [PATCH 08/60] change filename to lowercase for dataset_selector --- tpot/builtins/{DatasetSelector.py => dataset_selector.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tpot/builtins/{DatasetSelector.py => dataset_selector.py} (100%) diff --git a/tpot/builtins/DatasetSelector.py b/tpot/builtins/dataset_selector.py similarity index 100% rename from tpot/builtins/DatasetSelector.py rename to tpot/builtins/dataset_selector.py From e03654a5976287a07567d275c053b400dafe9751 Mon Sep 17 00:00:00 2001 From: weixuanfu2016 Date: Thu, 19 Apr 2018 12:28:38 -0400 Subject: [PATCH 09/60] refine dataset selector --- tpot/builtins/__init__.py | 1 + tpot/builtins/dataset_selector.py | 68 ++++++++++++++++--------------- 2 files changed, 37 insertions(+), 32 deletions(-) diff --git a/tpot/builtins/__init__.py b/tpot/builtins/__init__.py index e0ca35d7..48a76b8e 100644 --- a/tpot/builtins/__init__.py +++ b/tpot/builtins/__init__.py @@ -27,3 +27,4 @@ from .combine_dfs import CombineDFs from .stacking_estimator import StackingEstimator from .one_hot_encoder import OneHotEncoder +from .dataset_selector import DatasetSelector diff --git a/tpot/builtins/dataset_selector.py b/tpot/builtins/dataset_selector.py index 2c6f0105..8873ee36 100644 --- a/tpot/builtins/dataset_selector.py +++ b/tpot/builtins/dataset_selector.py @@ -9,12 +9,13 @@ import pandas as pd import os, os.path from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.feature_selection.base import SelectorMixin -class DatasetSelector(BaseEstimator): +class DatasetSelector(BaseEstimator, TransformerMixin, SelectorMixin): """Select predefined data subsets.""" - def __init__(self, subset_dir=None): + def __init__(self, subset_dir=None, sel_subset_idx=0): """Create a DatasetSelector object. Parameters @@ -24,6 +25,8 @@ def __init__(self, subset_dir=None): each file needs to be a .csv with one header row. The feature names in these files must match those in the (training and testing) dataset. + sel_subset_idx: int, required + Index of subset Returns ------- @@ -31,45 +34,46 @@ def __init__(self, subset_dir=None): """ self.subset_dir = subset_dir + self.sel_subset_idx = sel_subset_idx - def get_subset(self, input_data, input_target): - """Fit an optimized machine learning pipeline using TPOT. - - Uses genetic programming to optimize a machine learning pipeline that - maximizes score on the provided features and target. 
Performs internal - k-fold cross-validaton to avoid overfitting on the provided data. The - best pipeline is then trained on the entire set of provided samples. + def fit(self, X, y=None): + """Fit DatasetSelector for feature selection Parameters ---------- - input_data: array-like {n_samples, n_features} - Feature matrix - - input_target: array-like {n_samples} - List of class labels for prediction + X: array-like of shape (n_samples, n_features) + The training input samples. + y: array-like, shape (n_samples,) + The target values (integers that correspond to classes in classification, real numbers in regression). Returns ------- - self.data_subset: object - Returns a list of subsets of input_data - + self: object + Returns a copy of the estimator """ - self.input_data = input_data - self.input_target = input_target - self.feature_names = list(self.input_data.columns.values) + self.feature_names = list(X.columns.values) + subset_files = os.listdir(self.subset_dir) + self.subset_i = self.subset_dir + "/" + subset_files[self.sel_subset_idx] + self.features_i_df = pd.read_csv(self.subset_i, sep='\t', header=0) + feature_i = set(features_i_df.values.flatten()) + self.feat_list = list(feature_i.intersection(set(self.feature_names))) + + return self + + def transform(self, X): + """Make subset after fit - self.subset_files = os.listdir(self.subset_dir) - self.num_subset = len(self.subset_files) - self.feature_set = {} - self.data_subset = {} - self.population_size = population_size + Parameters + ---------- + X: numpy ndarray, {n_samples, n_features} + New data, where n_samples is the number of samples and n_features is the number of features. - for i in range(self.num_subset): - self.subset_i = self.subset_dir + "/" + self.subset_files[i] - self.features_i_df = pd.read_csv(self.subset_i, sep='\t', header=0) - self.feature_i = set(features_i_df.values.flatten()) - self.feature_set[i] = list(feature_i.intersection(set(self.feature_names))) - self.data_subset[i] = self.input_data[self.feature_set[i]] + Returns + ------- + X_transformed: array-like, shape (n_samples, n_features + 1) or (n_samples, n_features + 1 + n_classes) for classifier with predict_proba attribute + The transformed feature set. + """ + X_transformed = X[self.feat_list].values - return self.data_subset + return X_transformed From 92f0b4f4943b953b4b57d8f1d851e6b8e63441e7 Mon Sep 17 00:00:00 2001 From: weixuanfu2016 Date: Thu, 19 Apr 2018 13:10:26 -0400 Subject: [PATCH 10/60] bug fix --- tests/dataset_selector_tests.py | 54 +++++++++++++++++++++++++ tests/test_subset_dir/test_subset_1.snp | 4 ++ tests/test_subset_dir/test_subset_2.snp | 5 +++ tpot/builtins/dataset_selector.py | 9 ++--- 4 files changed, 67 insertions(+), 5 deletions(-) create mode 100644 tests/dataset_selector_tests.py create mode 100644 tests/test_subset_dir/test_subset_1.snp create mode 100644 tests/test_subset_dir/test_subset_2.snp diff --git a/tests/dataset_selector_tests.py b/tests/dataset_selector_tests.py new file mode 100644 index 00000000..cec34a22 --- /dev/null +++ b/tests/dataset_selector_tests.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- + +"""This file is part of the TPOT library. + +TPOT was primarily developed at the University of Pennsylvania by: + - Randal S. 
Olson (rso@randalolson.com) + - Weixuan Fu (weixuanf@upenn.edu) + - Daniel Angell (dpa34@drexel.edu) + - and many more generous open source contributors + +TPOT is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as +published by the Free Software Foundation, either version 3 of +the License, or (at your option) any later version. + +TPOT is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with TPOT. If not, see . + +""" + +import numpy as np +import pandas as pd +from tpot.builtins import DatasetSelector + +test_data = pd.read_csv("tests/tests.csv") +test_X = test_data.drop("class", axis=1) + + +def test_DatasetSelector_1(): + """Assert that the StackingEstimator returns transformed X based on test feature list 1.""" + ds = DatasetSelector(subset_dir="tests/test_subset_dir", sel_subset_idx=0) + ds.fit(test_X, y=None) + transformed_X = ds.transform(test_X) + + assert transformed_X.shape[0] == test_X.shape[0] + assert transformed_X.shape[1] != test_X.shape[1] + assert transformed_X.shape[1] == 3 + assert np.array_equal(transformed_X, test_X[ds.feat_list].values) + +def test_DatasetSelector_2(): + """Assert that the StackingEstimator returns transformed X based on test feature list 2.""" + ds = DatasetSelector(subset_dir="tests/test_subset_dir", sel_subset_idx=1) + ds.fit(test_X, y=None) + transformed_X = ds.transform(test_X) + + assert transformed_X.shape[0] == test_X.shape[0] + assert transformed_X.shape[1] != test_X.shape[1] + assert transformed_X.shape[1] == 4 + assert np.array_equal(transformed_X, test_X[ds.feat_list].values) diff --git a/tests/test_subset_dir/test_subset_1.snp b/tests/test_subset_dir/test_subset_1.snp new file mode 100644 index 00000000..b62a26de --- /dev/null +++ b/tests/test_subset_dir/test_subset_1.snp @@ -0,0 +1,4 @@ +test_list_1 +2 +4 +9 diff --git a/tests/test_subset_dir/test_subset_2.snp b/tests/test_subset_dir/test_subset_2.snp new file mode 100644 index 00000000..59c7be3b --- /dev/null +++ b/tests/test_subset_dir/test_subset_2.snp @@ -0,0 +1,5 @@ +test_list_2 +5 +7 +8 +15 diff --git a/tpot/builtins/dataset_selector.py b/tpot/builtins/dataset_selector.py index 8873ee36..b031d423 100644 --- a/tpot/builtins/dataset_selector.py +++ b/tpot/builtins/dataset_selector.py @@ -9,10 +9,9 @@ import pandas as pd import os, os.path from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.feature_selection.base import SelectorMixin -class DatasetSelector(BaseEstimator, TransformerMixin, SelectorMixin): +class DatasetSelector(BaseEstimator, TransformerMixin): """Select predefined data subsets.""" def __init__(self, subset_dir=None, sel_subset_idx=0): @@ -55,9 +54,9 @@ def fit(self, X, y=None): self.feature_names = list(X.columns.values) subset_files = os.listdir(self.subset_dir) self.subset_i = self.subset_dir + "/" + subset_files[self.sel_subset_idx] - self.features_i_df = pd.read_csv(self.subset_i, sep='\t', header=0) - feature_i = set(features_i_df.values.flatten()) - self.feat_list = list(feature_i.intersection(set(self.feature_names))) + features_i_df = pd.read_csv(self.subset_i, sep='\t', header=0) + feature_i = [str(val) for val in features_i_df.values.flatten()] + self.feat_list = 
list(set(feature_i).intersection(set(self.feature_names))) return self From 50463880f3912683644f8ff641b597e2c82654d4 Mon Sep 17 00:00:00 2001 From: weixuanfu2016 Date: Thu, 26 Apr 2018 15:41:04 -0400 Subject: [PATCH 11/60] add test codes and fix dataset check in the fit --- test_tpot_datasel.py | 32 ++++++++++++++++++++++++++++++++ tpot/base.py | 15 ++++++++++----- 2 files changed, 42 insertions(+), 5 deletions(-) create mode 100644 test_tpot_datasel.py diff --git a/test_tpot_datasel.py b/test_tpot_datasel.py new file mode 100644 index 00000000..d8332782 --- /dev/null +++ b/test_tpot_datasel.py @@ -0,0 +1,32 @@ +# coding: utf-8 +from tpot import TPOTClassifier +from sklearn.datasets import load_digits +from sklearn.model_selection import train_test_split +from tpot.config import classifier_config_dict +import pandas as pd +import numpy as np + +personal_config = classifier_config_dict +personal_config['tpot.builtins.DatasetSelector'] = { + 'subset_dir': ['./tests/test_subset_dir/'], + 'sel_subset_idx': range(0, 1) +} +# print(personal_config) + +tpot_data = pd.read_csv( + './tests/tests.csv') +Xdata = tpot_data.loc[:, tpot_data.columns != 'class'] +Ydata = tpot_data[['class']] + +X_train, X_test, y_train, y_test = train_test_split(Xdata, Ydata, + train_size=0.75, test_size=0.25) +# X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, +# train_size=0.75, test_size=0.25) + + +tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, + config_dict=personal_config, + template='DatasetSelector-Transformer-Classifier') +tpot.fit(X_train, y_train) +print(tpot.score(X_test, y_test)) + diff --git a/tpot/base.py b/tpot/base.py index ef69b749..13db1f76 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -469,6 +469,7 @@ def _setup_pset(self): def _add_operators(self): main_type = ["Classifier", "Regressor", "Selector", "Transformer"] ret_types = [] + op_list = [] if self.template: steps = self.template.split('-') for idx, step in enumerate(steps): @@ -490,16 +491,20 @@ def _add_operators(self): if operator.type() == step: p_types = ([step_in_type] + arg_types, step_ret_type) self._pset.addPrimitive(operator, *p_types) - self._import_hash(operator) - self._add_terminals(arg_types) + if not op_list.count(operator.__name__): + self._import_hash(operator) + self._add_terminals(arg_types) + op_list.append(operator.__name__) else: # is the step is a specific operator for operator in self.operators: arg_types = operator.parameter_types()[0][1:] if operator.__name__ == step: p_types = ([step_in_type] + arg_types, step_ret_type) self._pset.addPrimitive(operator, *p_types) - self._import_hash(operator) - self._add_terminals(arg_types) + if not op_list.count(operator.__name__): + self._import_hash(operator) + self._add_terminals(arg_types) + op_list.append(operator.__name__) else: # no template and randomly generated pipeline for operator in self.operators: arg_types = operator.parameter_types()[0][1:] @@ -600,7 +605,7 @@ def fit(self, features, target, sample_weight=None, groups=None): """ - features, target = self._check_dataset(features, target) + self._check_dataset(features, target) # Randomly collect a subsample of training samples for pipeline optimization process. 
if self.subsample < 1.0: From c4a32e2fba0ccbecabd554d16a46935525a59794 Mon Sep 17 00:00:00 2001 From: weixuanfu2016 Date: Thu, 26 Apr 2018 16:19:23 -0400 Subject: [PATCH 12/60] fix a bug --- test_tpot_datasel.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test_tpot_datasel.py b/test_tpot_datasel.py index d8332782..b26efb98 100644 --- a/test_tpot_datasel.py +++ b/test_tpot_datasel.py @@ -16,7 +16,7 @@ tpot_data = pd.read_csv( './tests/tests.csv') Xdata = tpot_data.loc[:, tpot_data.columns != 'class'] -Ydata = tpot_data[['class']] +Ydata = tpot_data['class'] X_train, X_test, y_train, y_test = train_test_split(Xdata, Ydata, train_size=0.75, test_size=0.25) @@ -29,4 +29,3 @@ template='DatasetSelector-Transformer-Classifier') tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) - From e311ac254a9e8661db7f4b9006e8d84d86f4d1c2 Mon Sep 17 00:00:00 2001 From: weixuanfu2016 Date: Mon, 30 Apr 2018 11:38:50 -0400 Subject: [PATCH 13/60] use fname instead of index --- test_tpot_datasel.py | 7 ++++--- tests/dataset_selector_tests.py | 6 +++--- tests/test_subset_dir/test_subset_2.snp | 6 +++--- tpot/builtins/dataset_selector.py | 25 +++++++++++++++---------- 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/test_tpot_datasel.py b/test_tpot_datasel.py index b26efb98..289a7602 100644 --- a/test_tpot_datasel.py +++ b/test_tpot_datasel.py @@ -9,7 +9,7 @@ personal_config = classifier_config_dict personal_config['tpot.builtins.DatasetSelector'] = { 'subset_dir': ['./tests/test_subset_dir/'], - 'sel_subset_idx': range(0, 1) + 'sel_subset_fname': ['test_subset_1.snp', 'test_subset_2.snp'] } # print(personal_config) @@ -24,8 +24,9 @@ # train_size=0.75, test_size=0.25) -tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, +tpot = TPOTClassifier(generations=5, population_size=20, verbosity=3, config_dict=personal_config, - template='DatasetSelector-Transformer-Classifier') + template='DatasetSelector-Transformer-Classifier', + random_state=42) tpot.fit(X_train, y_train) print(tpot.score(X_test, y_test)) diff --git a/tests/dataset_selector_tests.py b/tests/dataset_selector_tests.py index cec34a22..969dbbbe 100644 --- a/tests/dataset_selector_tests.py +++ b/tests/dataset_selector_tests.py @@ -33,7 +33,7 @@ def test_DatasetSelector_1(): """Assert that the StackingEstimator returns transformed X based on test feature list 1.""" - ds = DatasetSelector(subset_dir="tests/test_subset_dir", sel_subset_idx=0) + ds = DatasetSelector(subset_dir="tests/test_subset_dir", sel_subset_fname="test_subset_1.snp") ds.fit(test_X, y=None) transformed_X = ds.transform(test_X) @@ -44,10 +44,10 @@ def test_DatasetSelector_1(): def test_DatasetSelector_2(): """Assert that the StackingEstimator returns transformed X based on test feature list 2.""" - ds = DatasetSelector(subset_dir="tests/test_subset_dir", sel_subset_idx=1) + ds = DatasetSelector(subset_dir="tests/test_subset_dir", sel_subset_fname="test_subset_2.snp") ds.fit(test_X, y=None) transformed_X = ds.transform(test_X) - + assert transformed_X.shape[0] == test_X.shape[0] assert transformed_X.shape[1] != test_X.shape[1] assert transformed_X.shape[1] == 4 diff --git a/tests/test_subset_dir/test_subset_2.snp b/tests/test_subset_dir/test_subset_2.snp index 59c7be3b..5310c4ba 100644 --- a/tests/test_subset_dir/test_subset_2.snp +++ b/tests/test_subset_dir/test_subset_2.snp @@ -1,5 +1,5 @@ test_list_2 +2 +3 +4 5 -7 -8 -15 diff --git a/tpot/builtins/dataset_selector.py b/tpot/builtins/dataset_selector.py index b031d423..a649a701 
100644 --- a/tpot/builtins/dataset_selector.py +++ b/tpot/builtins/dataset_selector.py @@ -14,7 +14,7 @@ class DatasetSelector(BaseEstimator, TransformerMixin): """Select predefined data subsets.""" - def __init__(self, subset_dir=None, sel_subset_idx=0): + def __init__(self, subset_dir, sel_subset_fname): """Create a DatasetSelector object. Parameters @@ -24,8 +24,8 @@ def __init__(self, subset_dir=None, sel_subset_idx=0): each file needs to be a .csv with one header row. The feature names in these files must match those in the (training and testing) dataset. - sel_subset_idx: int, required - Index of subset + sel_subset_fname: string, required + File name of subset Returns ------- @@ -33,7 +33,7 @@ def __init__(self, subset_dir=None, sel_subset_idx=0): """ self.subset_dir = subset_dir - self.sel_subset_idx = sel_subset_idx + self.sel_subset_fname = sel_subset_fname def fit(self, X, y=None): """Fit DatasetSelector for feature selection @@ -50,14 +50,16 @@ def fit(self, X, y=None): self: object Returns a copy of the estimator """ - - self.feature_names = list(X.columns.values) subset_files = os.listdir(self.subset_dir) - self.subset_i = self.subset_dir + "/" + subset_files[self.sel_subset_idx] + self.subset_i = self.subset_dir + "/" + self.sel_subset_fname features_i_df = pd.read_csv(self.subset_i, sep='\t', header=0) - feature_i = [str(val) for val in features_i_df.values.flatten()] + if isinstance(X, pd.DataFrame): # use columns' names + self.feature_names = list(X.columns.values) + feature_i = [str(val) for val in features_i_df.values.flatten()] + elif isinstance(X, np.ndarray): # use index + self.feature_names = list(range(X.shape[1])) + feature_i = [int(val) for val in features_i_df.values.flatten()] self.feat_list = list(set(feature_i).intersection(set(self.feature_names))) - return self def transform(self, X): @@ -73,6 +75,9 @@ def transform(self, X): X_transformed: array-like, shape (n_samples, n_features + 1) or (n_samples, n_features + 1 + n_classes) for classifier with predict_proba attribute The transformed feature set. 
""" - X_transformed = X[self.feat_list].values + if isinstance(X, pd.DataFrame): + X_transformed = X[self.feat_list].values + elif isinstance(X, np.ndarray): + X_transformed = X[:, self.feat_list] return X_transformed From 5fde220a4e5e48332e3a781abd045984f1e3d954 Mon Sep 17 00:00:00 2001 From: weixuanfu2016 Date: Mon, 30 Apr 2018 12:23:40 -0400 Subject: [PATCH 14/60] better unit tests --- test_tpot_datasel.py | 4 ++-- tests/dataset_selector_tests.py | 6 +++--- tests/test_subset_dir/test_subset_1.snp | 2 ++ tests/test_subset_dir/test_subset_2.snp | 2 ++ 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/test_tpot_datasel.py b/test_tpot_datasel.py index 289a7602..e741e986 100644 --- a/test_tpot_datasel.py +++ b/test_tpot_datasel.py @@ -26,7 +26,7 @@ tpot = TPOTClassifier(generations=5, population_size=20, verbosity=3, config_dict=personal_config, - template='DatasetSelector-Transformer-Classifier', + template='DatasetSelector-Classifier', random_state=42) tpot.fit(X_train, y_train) -print(tpot.score(X_test, y_test)) +print('Holdout Score',tpot.score(X_test, y_test)) diff --git a/tests/dataset_selector_tests.py b/tests/dataset_selector_tests.py index 969dbbbe..3cb1cb8f 100644 --- a/tests/dataset_selector_tests.py +++ b/tests/dataset_selector_tests.py @@ -39,7 +39,7 @@ def test_DatasetSelector_1(): assert transformed_X.shape[0] == test_X.shape[0] assert transformed_X.shape[1] != test_X.shape[1] - assert transformed_X.shape[1] == 3 + assert transformed_X.shape[1] == 5 assert np.array_equal(transformed_X, test_X[ds.feat_list].values) def test_DatasetSelector_2(): @@ -47,8 +47,8 @@ def test_DatasetSelector_2(): ds = DatasetSelector(subset_dir="tests/test_subset_dir", sel_subset_fname="test_subset_2.snp") ds.fit(test_X, y=None) transformed_X = ds.transform(test_X) - + assert transformed_X.shape[0] == test_X.shape[0] assert transformed_X.shape[1] != test_X.shape[1] - assert transformed_X.shape[1] == 4 + assert transformed_X.shape[1] == 6 assert np.array_equal(transformed_X, test_X[ds.feat_list].values) diff --git a/tests/test_subset_dir/test_subset_1.snp b/tests/test_subset_dir/test_subset_1.snp index b62a26de..ff28bcf0 100644 --- a/tests/test_subset_dir/test_subset_1.snp +++ b/tests/test_subset_dir/test_subset_1.snp @@ -2,3 +2,5 @@ test_list_1 2 4 9 +11 +14 diff --git a/tests/test_subset_dir/test_subset_2.snp b/tests/test_subset_dir/test_subset_2.snp index 5310c4ba..0b28d022 100644 --- a/tests/test_subset_dir/test_subset_2.snp +++ b/tests/test_subset_dir/test_subset_2.snp @@ -3,3 +3,5 @@ test_list_2 3 4 5 +9 +11 From 04abe1016398ec0cfcf4af4fa8dcf09f4ad0e137 Mon Sep 17 00:00:00 2001 From: weixuanfu2016 Date: Mon, 30 Apr 2018 12:53:10 -0400 Subject: [PATCH 15/60] better per test --- test_tpot_datasel.py | 18 +++++++++++++++++- tpot/decorators.py | 4 ++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/test_tpot_datasel.py b/test_tpot_datasel.py index e741e986..d95350cd 100644 --- a/test_tpot_datasel.py +++ b/test_tpot_datasel.py @@ -1,8 +1,11 @@ # coding: utf-8 from tpot import TPOTClassifier from sklearn.datasets import load_digits -from sklearn.model_selection import train_test_split +from sklearn.model_selection import train_test_split, cross_val_score +from sklearn.pipeline import make_pipeline from tpot.config import classifier_config_dict +from tpot.builtins import DatasetSelector +from sklearn.neighbors import KNeighborsClassifier import pandas as pd import numpy as np @@ -24,6 +27,19 @@ # train_size=0.75, test_size=0.25) +clf = make_pipeline( + 
DatasetSelector(sel_subset_fname="test_subset_2.snp", subset_dir="./tests/test_subset_dir/"), + KNeighborsClassifier(n_neighbors=74, p=1, weights="uniform") +) +for _ in range(1): + try: + cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy', verbose=0) + print('CV Score',cv_scores) + except Exception as e: + print(_,'# WARNING: ') + print(e) + + tpot = TPOTClassifier(generations=5, population_size=20, verbosity=3, config_dict=personal_config, template='DatasetSelector-Classifier', diff --git a/tpot/decorators.py b/tpot/decorators.py index b9ea97bf..6a917169 100644 --- a/tpot/decorators.py +++ b/tpot/decorators.py @@ -34,8 +34,8 @@ # generate a small data set for a new pipeline, in order to check if the pipeline # has unsuppported combinations in params -pretest_X, pretest_y = make_classification(n_samples=50, n_features=10, random_state=42) -pretest_X_reg, pretest_y_reg = make_regression(n_samples=50, n_features=10, random_state=42) +pretest_X, pretest_y = make_classification(n_samples=300, n_features=10, random_state=42) +pretest_X_reg, pretest_y_reg = make_regression(n_samples=300, n_features=10, random_state=42) def _pre_test(func): From f9ad82ebf6361dbe45c166896215a9303bbfedfd Mon Sep 17 00:00:00 2001 From: weixuanfu2016 Date: Mon, 30 Apr 2018 12:54:21 -0400 Subject: [PATCH 16/60] clean codes --- test_tpot_datasel.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/test_tpot_datasel.py b/test_tpot_datasel.py index d95350cd..323d091d 100644 --- a/test_tpot_datasel.py +++ b/test_tpot_datasel.py @@ -23,21 +23,6 @@ X_train, X_test, y_train, y_test = train_test_split(Xdata, Ydata, train_size=0.75, test_size=0.25) -# X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, -# train_size=0.75, test_size=0.25) - - -clf = make_pipeline( - DatasetSelector(sel_subset_fname="test_subset_2.snp", subset_dir="./tests/test_subset_dir/"), - KNeighborsClassifier(n_neighbors=74, p=1, weights="uniform") -) -for _ in range(1): - try: - cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy', verbose=0) - print('CV Score',cv_scores) - except Exception as e: - print(_,'# WARNING: ') - print(e) tpot = TPOTClassifier(generations=5, population_size=20, verbosity=3, From 109812e05078c281b457d22d2b474f491916a820 Mon Sep 17 00:00:00 2001 From: weixuanfu2016 Date: Mon, 30 Apr 2018 17:03:13 -0400 Subject: [PATCH 17/60] fix type unit tests --- tests/tpot_tests.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/tpot_tests.py b/tests/tpot_tests.py index f890d50d..601ebb08 100644 --- a/tests/tpot_tests.py +++ b/tests/tpot_tests.py @@ -1653,6 +1653,7 @@ def test_PolynomialFeatures_exception(): initialize_stats_dict(pipeline) fitness_scores = tpot_obj._evaluate_individuals(pipelines, pretest_X, pretest_y) + known_scores = [(2, 0.94000000000000006), (5000.0, -float('inf'))] assert np.allclose(known_scores, fitness_scores) @@ -1939,7 +1940,7 @@ def test_varOr_3(): def test_operator_type(): """Assert that TPOT operators return their type, e.g. 
'Classifier', 'Preprocessor'.""" - assert TPOTSelectPercentile.type() == "Preprocessor or Selector" + assert TPOTSelectPercentile.type() == "Selector" def test_gen(): From ed1cb15cbf9b00c710138274bf72cf7667497d6f Mon Sep 17 00:00:00 2001 From: Riley Wong Date: Thu, 26 Apr 2018 19:44:47 -0400 Subject: [PATCH 18/60] Add first set of hyperparams --- tpot/config/classifier_nn.py | 220 +++++++++++++++++++++++++++++++++++ 1 file changed, 220 insertions(+) create mode 100644 tpot/config/classifier_nn.py diff --git a/tpot/config/classifier_nn.py b/tpot/config/classifier_nn.py new file mode 100644 index 00000000..38bddec5 --- /dev/null +++ b/tpot/config/classifier_nn.py @@ -0,0 +1,220 @@ +# -*- coding: utf-8 -*- + +"""This file is part of the TPOT library. + +TPOT was primarily developed at the University of Pennsylvania by: + - Randal S. Olson (rso@randalolson.com) + - Weixuan Fu (weixuanf@upenn.edu) + - Daniel Angell (dpa34@drexel.edu) + - and many more generous open source contributors + +TPOT is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as +published by the Free Software Foundation, either version 3 of +the License, or (at your option) any later version. + +TPOT is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with TPOT. If not, see . + +""" + +import numpy as np + +# Check the TPOT documentation for information on the structure of config dicts + +classifier_config_dict = { + # MLPClassifier for neural networks + # TODO: revisit/tweak: alpha, momentum, learning rate_init + 'sklearn.neural_network.MLPClassifier': { + 'activation': ['identity', 'logistic', 'tanh', 'relu'], + 'solver': ['lbfgs', 'sgd', 'adam'], + 'learning_rate': ['constant', 'invscaling', 'adaptive'], + 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], + 'learning_rate_init': [1e-3, 1e-2, 1e-1, 0.5, 0.75, 0.9], + 'momentum': [0.1, 0.5, 0.75, 0.9] + }, + + # Classifiers + 'sklearn.naive_bayes.GaussianNB': { + }, + + 'sklearn.naive_bayes.BernoulliNB': { + 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], + 'fit_prior': [True, False] + }, + + 'sklearn.naive_bayes.MultinomialNB': { + 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.], + 'fit_prior': [True, False] + }, + + 'sklearn.tree.DecisionTreeClassifier': { + 'criterion': ["gini", "entropy"], + 'max_depth': range(1, 11), + 'min_samples_split': range(2, 21), + 'min_samples_leaf': range(1, 21) + }, + + 'sklearn.ensemble.ExtraTreesClassifier': { + 'n_estimators': [100], + 'criterion': ["gini", "entropy"], + 'max_features': np.arange(0.05, 1.01, 0.05), + 'min_samples_split': range(2, 21), + 'min_samples_leaf': range(1, 21), + 'bootstrap': [True, False] + }, + + 'sklearn.ensemble.RandomForestClassifier': { + 'n_estimators': [100], + 'criterion': ["gini", "entropy"], + 'max_features': np.arange(0.05, 1.01, 0.05), + 'min_samples_split': range(2, 21), + 'min_samples_leaf': range(1, 21), + 'bootstrap': [True, False] + }, + + 'sklearn.ensemble.GradientBoostingClassifier': { + 'n_estimators': [100], + 'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.], + 'max_depth': range(1, 11), + 'min_samples_split': range(2, 21), + 'min_samples_leaf': range(1, 21), + 'subsample': np.arange(0.05, 1.01, 0.05), + 'max_features': np.arange(0.05, 1.01, 0.05) + }, + + 
'sklearn.neighbors.KNeighborsClassifier': { + 'n_neighbors': range(1, 101), + 'weights': ["uniform", "distance"], + 'p': [1, 2] + }, + + 'sklearn.svm.LinearSVC': { + 'penalty': ["l1", "l2"], + 'loss': ["hinge", "squared_hinge"], + 'dual': [True, False], + 'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1], + 'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.] + }, + + 'sklearn.linear_model.LogisticRegression': { + 'penalty': ["l1", "l2"], + 'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.], + 'dual': [True, False] + }, + + 'xgboost.XGBClassifier': { + 'n_estimators': [100], + 'max_depth': range(1, 11), + 'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.], + 'subsample': np.arange(0.05, 1.01, 0.05), + 'min_child_weight': range(1, 21), + 'nthread': [1] + }, + + # Preprocesssors + 'sklearn.preprocessing.Binarizer': { + 'threshold': np.arange(0.0, 1.01, 0.05) + }, + + 'sklearn.decomposition.FastICA': { + 'tol': np.arange(0.0, 1.01, 0.05) + }, + + 'sklearn.cluster.FeatureAgglomeration': { + 'linkage': ['ward', 'complete', 'average'], + 'affinity': ['euclidean', 'l1', 'l2', 'manhattan', 'cosine'] + }, + + 'sklearn.preprocessing.MaxAbsScaler': { + }, + + 'sklearn.preprocessing.MinMaxScaler': { + }, + + 'sklearn.preprocessing.Normalizer': { + 'norm': ['l1', 'l2', 'max'] + }, + + 'sklearn.kernel_approximation.Nystroem': { + 'kernel': ['rbf', 'cosine', 'chi2', 'laplacian', 'polynomial', 'poly', 'linear', 'additive_chi2', 'sigmoid'], + 'gamma': np.arange(0.0, 1.01, 0.05), + 'n_components': range(1, 11) + }, + + 'sklearn.decomposition.PCA': { + 'svd_solver': ['randomized'], + 'iterated_power': range(1, 11) + }, + + 'sklearn.preprocessing.PolynomialFeatures': { + 'degree': [2], + 'include_bias': [False], + 'interaction_only': [False] + }, + + 'sklearn.kernel_approximation.RBFSampler': { + 'gamma': np.arange(0.0, 1.01, 0.05) + }, + + 'sklearn.preprocessing.RobustScaler': { + }, + + 'sklearn.preprocessing.StandardScaler': { + }, + + 'tpot.builtins.ZeroCount': { + }, + + 'tpot.builtins.OneHotEncoder': { + 'minimum_fraction': [0.05, 0.1, 0.15, 0.2, 0.25], + 'sparse': [False] + }, + + # Selectors + 'sklearn.feature_selection.SelectFwe': { + 'alpha': np.arange(0, 0.05, 0.001), + 'score_func': { + 'sklearn.feature_selection.f_classif': None + } + }, + + 'sklearn.feature_selection.SelectPercentile': { + 'percentile': range(1, 100), + 'score_func': { + 'sklearn.feature_selection.f_classif': None + } + }, + + 'sklearn.feature_selection.VarianceThreshold': { + 'threshold': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2] + }, + + 'sklearn.feature_selection.RFE': { + 'step': np.arange(0.05, 1.01, 0.05), + 'estimator': { + 'sklearn.ensemble.ExtraTreesClassifier': { + 'n_estimators': [100], + 'criterion': ['gini', 'entropy'], + 'max_features': np.arange(0.05, 1.01, 0.05) + } + } + }, + + 'sklearn.feature_selection.SelectFromModel': { + 'threshold': np.arange(0, 1.01, 0.05), + 'estimator': { + 'sklearn.ensemble.ExtraTreesClassifier': { + 'n_estimators': [100], + 'criterion': ['gini', 'entropy'], + 'max_features': np.arange(0.05, 1.01, 0.05) + } + } + } + +} From 843cbc8582b316f6edc19f721a077e7f05e9a85e Mon Sep 17 00:00:00 2001 From: weixuanfu2016 Date: Mon, 30 Apr 2018 17:19:50 -0400 Subject: [PATCH 19/60] fix most unit tests 1 left related to stats --- tests/stats_test.py | 2 +- tests/tpot_tests.py | 6 +++--- tpot/base.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/stats_test.py b/tests/stats_test.py index 278a84f7..8c8b0e1c 100644 --- a/tests/stats_test.py +++ 
b/tests/stats_test.py @@ -119,7 +119,7 @@ def test_mut_operator_stats_update(): for _ in range(10): offspring, = tpot_obj._random_mutation_operator(ind) - + print(offspring.statistics['mutation_count'], ind.statistics['mutation_count']) assert offspring.statistics['crossover_count'] == ind.statistics['crossover_count'] assert offspring.statistics['mutation_count'] == ind.statistics['mutation_count'] + 1 assert offspring.statistics['predecessor'] == (str(ind),) diff --git a/tests/tpot_tests.py b/tests/tpot_tests.py index 601ebb08..f0f01d79 100644 --- a/tests/tpot_tests.py +++ b/tests/tpot_tests.py @@ -1618,7 +1618,7 @@ def test_tpot_operator_factory_class(): assert len(tpot_argument_list) == 9 assert tpot_operator_list[0].root is True assert tpot_operator_list[1].root is False - assert tpot_operator_list[2].type() == "Classifier or Regressor" + assert tpot_operator_list[2].type() == "Classifier" assert tpot_argument_list[1].values == [True, False] @@ -1653,9 +1653,9 @@ def test_PolynomialFeatures_exception(): initialize_stats_dict(pipeline) fitness_scores = tpot_obj._evaluate_individuals(pipelines, pretest_X, pretest_y) - known_scores = [(2, 0.94000000000000006), (5000.0, -float('inf'))] - assert np.allclose(known_scores, fitness_scores) + assert fitness_scores[0][0] == 2 + assert fitness_scores[1][0] == 5000.0 def test_pick_two_individuals_eligible_for_crossover(): diff --git a/tpot/base.py b/tpot/base.py index 13db1f76..27a8c1b6 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -605,7 +605,7 @@ def fit(self, features, target, sample_weight=None, groups=None): """ - self._check_dataset(features, target) + features, target = self._check_dataset(features, target) # Randomly collect a subsample of training samples for pipeline optimization process. if self.subsample < 1.0: @@ -1089,7 +1089,7 @@ def _check_dataset(self, features, target): self._imputed = True features = self._impute_values(features) try: - X, y = check_X_y(features, target, accept_sparse=True, dtype=np.float64) + X, y = check_X_y(features, target, accept_sparse=True, dtype=None) return X, y except (AssertionError, ValueError): raise ValueError( From 39709b67095083c10b4782cff2978d7efd6ccf19 Mon Sep 17 00:00:00 2001 From: Riley Wong Date: Mon, 30 Apr 2018 17:46:06 -0400 Subject: [PATCH 20/60] Rename nn config dict --- tpot/config/classifier_nn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpot/config/classifier_nn.py b/tpot/config/classifier_nn.py index 38bddec5..6d3e5944 100644 --- a/tpot/config/classifier_nn.py +++ b/tpot/config/classifier_nn.py @@ -27,7 +27,7 @@ # Check the TPOT documentation for information on the structure of config dicts -classifier_config_dict = { +classifier_config_nn = { # MLPClassifier for neural networks # TODO: revisit/tweak: alpha, momentum, learning rate_init 'sklearn.neural_network.MLPClassifier': { From db54137efb8d92d328f7419bd435ff18c02448f2 Mon Sep 17 00:00:00 2001 From: weixuanfu2016 Date: Tue, 1 May 2018 11:33:10 -0400 Subject: [PATCH 21/60] fix bug in expr_mut --- tpot/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tpot/base.py b/tpot/base.py index 27a8c1b6..2fba535e 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -566,7 +566,6 @@ def _setup_toolbox(self): self._toolbox.register('expr_mut', self._gen_grow_safe, min_=self._min, max_=self._max) else: self._toolbox.register('expr_mut', self._gen_grow_safe, min_=self._min, max_=self._max + 1) - self._toolbox.register('expr_mut', self._gen_grow_safe, min_=1, max_=4) self._toolbox.register('mutate', 
self._random_mutation_operator) def fit(self, features, target, sample_weight=None, groups=None): From b193abfcd8c5ca9a033f799945e6f83cee8210f5 Mon Sep 17 00:00:00 2001 From: weixuanfu2016 Date: Tue, 1 May 2018 11:36:42 -0400 Subject: [PATCH 22/60] fix unit tests --- tests/stats_test.py | 2 +- tpot/base.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/stats_test.py b/tests/stats_test.py index 8c8b0e1c..4148ee3f 100644 --- a/tests/stats_test.py +++ b/tests/stats_test.py @@ -119,7 +119,7 @@ def test_mut_operator_stats_update(): for _ in range(10): offspring, = tpot_obj._random_mutation_operator(ind) - print(offspring.statistics['mutation_count'], ind.statistics['mutation_count']) + assert offspring.statistics['crossover_count'] == ind.statistics['crossover_count'] assert offspring.statistics['mutation_count'] == ind.statistics['mutation_count'] + 1 assert offspring.statistics['predecessor'] == (str(ind),) diff --git a/tpot/base.py b/tpot/base.py index 2fba535e..a88ecfaa 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -1435,6 +1435,15 @@ def _random_mutation_operator(self, individual, allow_shrink=True): ind = self._toolbox.clone(individual) offspring, = mutator(ind) if str(offspring) not in self.evaluated_individuals_: + # Update statistics + # crossover_count is kept the same as for the predecessor + # mutation count is increased by 1 + # predecessor is set to the string representation of the individual before mutation + # generation is set to 'INVALID' such that we can recognize that it should be updated accordingly + offspring.statistics['crossover_count'] = individual.statistics['crossover_count'] + offspring.statistics['mutation_count'] = individual.statistics['mutation_count'] + 1 + offspring.statistics['predecessor'] = (str(individual),) + offspring.statistics['generation'] = 'INVALID' break else: unsuccesful_mutations += 1 From 61dfd56f22de808046a23bea23e1079df459daef Mon Sep 17 00:00:00 2001 From: weixuanfu2016 Date: Tue, 1 May 2018 11:37:28 -0400 Subject: [PATCH 23/60] remove example --- test_tpot_datasel.py | 33 --------------------------------- 1 file changed, 33 deletions(-) delete mode 100644 test_tpot_datasel.py diff --git a/test_tpot_datasel.py b/test_tpot_datasel.py deleted file mode 100644 index 323d091d..00000000 --- a/test_tpot_datasel.py +++ /dev/null @@ -1,33 +0,0 @@ -# coding: utf-8 -from tpot import TPOTClassifier -from sklearn.datasets import load_digits -from sklearn.model_selection import train_test_split, cross_val_score -from sklearn.pipeline import make_pipeline -from tpot.config import classifier_config_dict -from tpot.builtins import DatasetSelector -from sklearn.neighbors import KNeighborsClassifier -import pandas as pd -import numpy as np - -personal_config = classifier_config_dict -personal_config['tpot.builtins.DatasetSelector'] = { - 'subset_dir': ['./tests/test_subset_dir/'], - 'sel_subset_fname': ['test_subset_1.snp', 'test_subset_2.snp'] -} -# print(personal_config) - -tpot_data = pd.read_csv( - './tests/tests.csv') -Xdata = tpot_data.loc[:, tpot_data.columns != 'class'] -Ydata = tpot_data['class'] - -X_train, X_test, y_train, y_test = train_test_split(Xdata, Ydata, - train_size=0.75, test_size=0.25) - - -tpot = TPOTClassifier(generations=5, population_size=20, verbosity=3, - config_dict=personal_config, - template='DatasetSelector-Classifier', - random_state=42) -tpot.fit(X_train, y_train) -print('Holdout Score',tpot.score(X_test, y_test)) From 06dc7e324c26c939a7138341fb132a245b8b7853 Mon Sep 17 00:00:00 2001 From: 
weixuanfu2016 Date: Tue, 1 May 2018 12:03:35 -0400 Subject: [PATCH 24/60] fix all unit tests --- tests/tpot_tests.py | 2 +- tpot/decorators.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/tpot_tests.py b/tests/tpot_tests.py index f0f01d79..27f327b7 100644 --- a/tests/tpot_tests.py +++ b/tests/tpot_tests.py @@ -1653,7 +1653,7 @@ def test_PolynomialFeatures_exception(): initialize_stats_dict(pipeline) fitness_scores = tpot_obj._evaluate_individuals(pipelines, pretest_X, pretest_y) - known_scores = [(2, 0.94000000000000006), (5000.0, -float('inf'))] + assert fitness_scores[0][0] == 2 assert fitness_scores[1][0] == 5000.0 diff --git a/tpot/decorators.py b/tpot/decorators.py index 6a917169..e8b6a17e 100644 --- a/tpot/decorators.py +++ b/tpot/decorators.py @@ -34,8 +34,8 @@ # generate a small data set for a new pipeline, in order to check if the pipeline # has unsuppported combinations in params -pretest_X, pretest_y = make_classification(n_samples=300, n_features=10, random_state=42) -pretest_X_reg, pretest_y_reg = make_regression(n_samples=300, n_features=10, random_state=42) +pretest_X, pretest_y = make_classification(n_samples=100, n_features=10, random_state=42) +pretest_X_reg, pretest_y_reg = make_regression(n_samples=100, n_features=10, random_state=42) def _pre_test(func): From fce00510e84d0ebadf7af5d91c20d5a525c933cd Mon Sep 17 00:00:00 2001 From: weixuanfu2016 Date: Mon, 7 May 2018 11:18:11 -0400 Subject: [PATCH 25/60] add tree structure --- tpot/base.py | 86 +++++++++++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 42 deletions(-) diff --git a/tpot/base.py b/tpot/base.py index a88ecfaa..53ad0dac 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -200,7 +200,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, String 'TPOT sparse': TPOT uses a configuration dictionary with a one-hot-encoder and the operators normally included in TPOT that also support sparse matrices. 
- template: Python list (default: None) + template: string (default: None) A template for pipeline structure warm_start: bool, optional (default: False) Flag indicating whether the TPOT instance will reuse the population from @@ -277,6 +277,8 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, # any one given individual (or pair of individuals) self._max_mut_loops = 50 + self._setup_template(template) + # Set offspring_size equal to population_size by default if offspring_size: self.offspring_size = offspring_size @@ -286,18 +288,6 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, self.config_dict_params = config_dict self._setup_config(self.config_dict_params) - self.template = template - if template: - self.fixed_length = len(template.split('-')) - # for now, the template is only a linear pipeline - # will add supports for more complex structure - self._min = self.fixed_length - self._max = self.fixed_length + 1 - else: - self.fixed_length = 0 - self._min = 1 - self._max = 3 - self.operators = [] for key in sorted(self.config_dict.keys()): @@ -358,6 +348,28 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, self._setup_pset() self._setup_toolbox() + def _setup_template(self, template): + if tempfile: + self.template = template + else: + if self.classification: + self.template = 'RandomTree-Classifier' + else: + self.template = 'RandomTree-Regressor' + self.template_comp = self.template.split('-') + self._min = -1 + self._max = 0 + for comp in self.template_comp: + self._min += 1 + if comp == 'RandomTree': + self._min += 2 + else: + self._max += 1 + if self._max - self._min == 1: + self.tree_structure = False + else: + self.tree_structure = True + def _setup_scoring_function(self, scoring): if scoring: @@ -471,9 +483,8 @@ def _add_operators(self): ret_types = [] op_list = [] if self.template: - steps = self.template.split('-') - for idx, step in enumerate(steps): - if idx < self.fixed_length - 1: + for idx, step in enumerate(self.template_comp): + if idx < len(self.template_comp) - 1: # create an empty for returning class for strongly-type GP step_ret_type_name = 'Ret_{}'.format(idx) step_ret_type = type(step_ret_type_name, (object,), {}) @@ -485,7 +496,17 @@ def _add_operators(self): step_in_type = ret_types[idx-1] else: step_in_type = np.ndarray - if main_type.count(step): # if the step is a main type + if step == "RandomTree": + for operator in self.operators: + arg_types = operator.parameter_types()[0][1:] + p_types = ([step_in_type] + arg_types, step_ret_type) + self._pset.addPrimitive(operator, *p_types) + if not op_list.count(operator.__name__): + self._import_hash(operator) + self._add_terminals(arg_types) + op_list.append(operator.__name__) + self._pset.addPrimitive(CombineDFs(), [step_in_type, step_in_type], step_in_type) + elif main_type.count(step): # if the step is a main type for operator in self.operators: arg_types = operator.parameter_types()[0][1:] if operator.type() == step: @@ -505,25 +526,6 @@ def _add_operators(self): self._import_hash(operator) self._add_terminals(arg_types) op_list.append(operator.__name__) - else: # no template and randomly generated pipeline - for operator in self.operators: - arg_types = operator.parameter_types()[0][1:] - if operator.root: - # We need to add rooted primitives twice so that they can - # return both an Output_Array (and thus be the root of the tree), - # and return a np.ndarray so they can exist elsewhere in the tree. 
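The per-step `Ret_*` marker classes introduced above are what let DEAP's strongly typed GP enforce the template order: a primitive can only be plugged into a slot whose input type matches its return type. A minimal stand-alone sketch of that idea (not TPOT's own code; names are illustrative):

```python
# Each intermediate template position gets its own empty marker class built
# with type(); strongly typed GP can then only chain operators in template order.
steps = ['Selector', 'Transformer', 'Classifier']

ret_types = []
for idx, step in enumerate(steps[:-1]):
    ret_types.append(type('Ret_{}'.format(idx), (object,), {}))

print([cls.__name__ for cls in ret_types])  # ['Ret_0', 'Ret_1']
```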
- p_types = (operator.parameter_types()[0], Output_Array) - self._pset.addPrimitive(operator, *p_types) - - self._pset.addPrimitive(operator, *operator.parameter_types()) - - # Import required modules into local namespace so that pipelines - # may be evaluated directly - self._import_hash(operator) - self._add_terminals(arg_types) - - - self._pset.addPrimitive(CombineDFs(), [np.ndarray, np.ndarray], np.ndarray) self.ret_types = [np.ndarray, Output_Array] + ret_types @@ -562,10 +564,10 @@ def _setup_toolbox(self): self._toolbox.register('compile', self._compile_to_sklearn) self._toolbox.register('select', tools.selNSGA2) self._toolbox.register('mate', self._mate_operator) - if self.fixed_length: - self._toolbox.register('expr_mut', self._gen_grow_safe, min_=self._min, max_=self._max) - else: + if self.tree_structure: self._toolbox.register('expr_mut', self._gen_grow_safe, min_=self._min, max_=self._max + 1) + else: + self._toolbox.register('expr_mut', self._gen_grow_safe, min_=self._min, max_=self._max) self._toolbox.register('mutate', self._random_mutation_operator) def fit(self, features, target, sample_weight=None, groups=None): @@ -1415,9 +1417,7 @@ def _random_mutation_operator(self, individual, allow_shrink=True): Returns the individual with one of the mutations applied to it """ - if self.fixed_length: - mutation_techniques = [partial(mutNodeReplacement, pset=self._pset)] - else: + if self.tree_structure: mutation_techniques = [ partial(gp.mutInsert, pset=self._pset), partial(mutNodeReplacement, pset=self._pset) @@ -1426,6 +1426,8 @@ def _random_mutation_operator(self, individual, allow_shrink=True): number_of_primitives = sum([isinstance(node, deap.gp.Primitive) for node in individual]) if number_of_primitives > 1 and allow_shrink: mutation_techniques.append(partial(gp.mutShrink)) + else: + mutation_techniques = [partial(mutNodeReplacement, pset=self._pset)] mutator = np.random.choice(mutation_techniques) From 6a7937a2d7264f7775710aa31ac176fd3e1d353d Mon Sep 17 00:00:00 2001 From: weixuanfu2016 Date: Mon, 7 May 2018 11:42:09 -0400 Subject: [PATCH 26/60] clean codes --- tpot/base.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tpot/base.py b/tpot/base.py index 53ad0dac..9abed703 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -102,7 +102,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, mutation_rate=0.9, crossover_rate=0.1, scoring=None, cv=5, subsample=1.0, n_jobs=1, max_time_mins=None, max_eval_time_mins=5, - random_state=None, config_dict=None, template=None, + random_state=None, config_dict=None, template='RandomTree', warm_start=False, memory=None, periodic_checkpoint_folder=None, early_stop=None, verbosity=0, disable_update_check=False): @@ -200,7 +200,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, String 'TPOT sparse': TPOT uses a configuration dictionary with a one-hot-encoder and the operators normally included in TPOT that also support sparse matrices. 
- template: string (default: None) + template: string (default: 'RandomTree') A template for pipeline structure warm_start: bool, optional (default: False) Flag indicating whether the TPOT instance will reuse the population from @@ -349,20 +349,14 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, self._setup_toolbox() def _setup_template(self, template): - if tempfile: - self.template = template - else: - if self.classification: - self.template = 'RandomTree-Classifier' - else: - self.template = 'RandomTree-Regressor' + self.template = template self.template_comp = self.template.split('-') - self._min = -1 - self._max = 0 + self._min = 0 + self._max = 1 for comp in self.template_comp: self._min += 1 if comp == 'RandomTree': - self._min += 2 + self._max += 2 else: self._max += 1 if self._max - self._min == 1: @@ -500,7 +494,13 @@ def _add_operators(self): for operator in self.operators: arg_types = operator.parameter_types()[0][1:] p_types = ([step_in_type] + arg_types, step_ret_type) - self._pset.addPrimitive(operator, *p_types) + if operator.root: + # We need to add rooted primitives twice so that they can + # return both an Output_Array (and thus be the root of the tree), + # and return a np.ndarray so they can exist elsewhere in the tree. + self._pset.addPrimitive(operator, *p_types) + tree_p_types = ([step_in_type] + arg_types, step_in_type) + self._pset.addPrimitive(operator, *tree_p_types) if not op_list.count(operator.__name__): self._import_hash(operator) self._add_terminals(arg_types) From 95870f8bc73d123a0ff6d78be8e3ac37e5817bea Mon Sep 17 00:00:00 2001 From: weixuanfu2016 Date: Mon, 7 May 2018 16:36:52 -0400 Subject: [PATCH 27/60] add combineDFs in pipeline --- tpot/base.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/tpot/base.py b/tpot/base.py index 9abed703..129af482 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -476,7 +476,25 @@ def _add_operators(self): main_type = ["Classifier", "Regressor", "Selector", "Transformer"] ret_types = [] op_list = [] - if self.template: + if self.template == "RandomTree": # default + step_in_type = np.ndarray + step_ret_type = Output_Array + for operator in self.operators: + arg_types = operator.parameter_types()[0][1:] + p_types = ([step_in_type] + arg_types, step_ret_type) + if operator.root: + # We need to add rooted primitives twice so that they can + # return both an Output_Array (and thus be the root of the tree), + # and return a np.ndarray so they can exist elsewhere in the tree. 
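For context, a hedged end-to-end example of the `template` parameter as exposed in these patches; the dataset and the generations/population_size values are placeholders for a quick run, and 'RandomTree' stays the default free-form search:

```python
# Illustrative use of the template parameter: fix the pipeline shape to
# Selector -> Transformer -> Classifier instead of a free-form GP tree.
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier

digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, random_state=42)

tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,
                      template='Selector-Transformer-Classifier')
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
```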
+ self._pset.addPrimitive(operator, *p_types) + tree_p_types = ([step_in_type] + arg_types, step_in_type) + self._pset.addPrimitive(operator, *tree_p_types) + if not op_list.count(operator.__name__): + self._import_hash(operator) + self._add_terminals(arg_types) + op_list.append(operator.__name__) + self._pset.addPrimitive(CombineDFs(), [step_in_type, step_in_type], step_in_type) + else: for idx, step in enumerate(self.template_comp): if idx < len(self.template_comp) - 1: # create an empty for returning class for strongly-type GP @@ -490,22 +508,8 @@ def _add_operators(self): step_in_type = ret_types[idx-1] else: step_in_type = np.ndarray - if step == "RandomTree": - for operator in self.operators: - arg_types = operator.parameter_types()[0][1:] - p_types = ([step_in_type] + arg_types, step_ret_type) - if operator.root: - # We need to add rooted primitives twice so that they can - # return both an Output_Array (and thus be the root of the tree), - # and return a np.ndarray so they can exist elsewhere in the tree. - self._pset.addPrimitive(operator, *p_types) - tree_p_types = ([step_in_type] + arg_types, step_in_type) - self._pset.addPrimitive(operator, *tree_p_types) - if not op_list.count(operator.__name__): - self._import_hash(operator) - self._add_terminals(arg_types) - op_list.append(operator.__name__) - self._pset.addPrimitive(CombineDFs(), [step_in_type, step_in_type], step_in_type) + if step == 'CombineDFs': + self._pset.addPrimitive(CombineDFs(), [step_in_type, step_in_type], step_ret_type) elif main_type.count(step): # if the step is a main type for operator in self.operators: arg_types = operator.parameter_types()[0][1:] From a0dfe3e7fa51513bbd8f08eb245c15f41b90091b Mon Sep 17 00:00:00 2001 From: weixuanfu2016 Date: Tue, 8 May 2018 09:56:08 -0400 Subject: [PATCH 28/60] more complex tree sturture --- tpot/base.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/tpot/base.py b/tpot/base.py index 129af482..753d4372 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -355,7 +355,7 @@ def _setup_template(self, template): self._max = 1 for comp in self.template_comp: self._min += 1 - if comp == 'RandomTree': + if comp == 'RandomTree' or 'CombineDFs': self._max += 2 else: self._max += 1 @@ -496,20 +496,22 @@ def _add_operators(self): self._pset.addPrimitive(CombineDFs(), [step_in_type, step_in_type], step_in_type) else: for idx, step in enumerate(self.template_comp): - if idx < len(self.template_comp) - 1: - # create an empty for returning class for strongly-type GP - step_ret_type_name = 'Ret_{}'.format(idx) - step_ret_type = type(step_ret_type_name, (object,), {}) - ret_types.append(step_ret_type) - else: - step_ret_type = Output_Array # input class in each step if idx: - step_in_type = ret_types[idx-1] + step_in_type = ret_types[-1] else: step_in_type = np.ndarray - if step == 'CombineDFs': - self._pset.addPrimitive(CombineDFs(), [step_in_type, step_in_type], step_ret_type) + if step != 'CombineDFs': + if idx < len(self.template_comp) - 1: + # create an empty for returning class for strongly-type GP + step_ret_type_name = 'Ret_{}'.format(idx) + step_ret_type = type(step_ret_type_name, (np.ndarray,), {}) + ret_types.append(step_ret_type) + else: + step_ret_type = Output_Array + + if step == 'CombineDFs': # somehows CombineDFs only accept np.ndarray as input/ret + self._pset.addPrimitive(CombineDFs(), [step_in_type, step_in_type], step_in_type) elif main_type.count(step): # if the step is a main type for operator in self.operators: arg_types = 
operator.parameter_types()[0][1:] From 0024a78816ae8f37d50e13b4983bee80f0505b0d Mon Sep 17 00:00:00 2001 From: weixuanfu2016 Date: Tue, 8 May 2018 16:13:33 -0400 Subject: [PATCH 29/60] more complex tree stucture --- tpot/base.py | 53 +++++++++++++++++++++++++--------------------- tpot/decorators.py | 11 +++++++--- 2 files changed, 37 insertions(+), 27 deletions(-) diff --git a/tpot/base.py b/tpot/base.py index 753d4372..6a34de12 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -348,17 +348,22 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, self._setup_pset() self._setup_toolbox() + def _setup_template(self, template): self.template = template - self.template_comp = self.template.split('-') - self._min = 0 - self._max = 1 - for comp in self.template_comp: - self._min += 1 - if comp == 'RandomTree' or 'CombineDFs': - self._max += 2 - else: - self._max += 1 + self.template_comp = template.split('-') + if self.template == 'RandomTree': + self._min = 1 + self._max = 3 + else: + self._min = 0 + self._max = 1 + for comp in self.template_comp: + if comp == 'CombineDFs': + self._max += 2 + else: + self._max += 1 + self._min += 1 if self._max - self._min == 1: self.tree_structure = False else: @@ -439,6 +444,7 @@ def _setup_config(self, config_dict): else: self.config_dict = self.default_config_dict + def _read_config_file(self, config_path): if os.path.isfile(config_path): try: @@ -459,6 +465,7 @@ def _read_config_file(self, config_path): '{}'.format(config_path) ) + def _setup_pset(self): if self.random_state is not None: random.seed(self.random_state) @@ -475,7 +482,7 @@ def _setup_pset(self): def _add_operators(self): main_type = ["Classifier", "Regressor", "Selector", "Transformer"] ret_types = [] - op_list = [] + self.op_list = [] if self.template == "RandomTree": # default step_in_type = np.ndarray step_ret_type = Output_Array @@ -489,12 +496,10 @@ def _add_operators(self): self._pset.addPrimitive(operator, *p_types) tree_p_types = ([step_in_type] + arg_types, step_in_type) self._pset.addPrimitive(operator, *tree_p_types) - if not op_list.count(operator.__name__): - self._import_hash(operator) - self._add_terminals(arg_types) - op_list.append(operator.__name__) + self._import_hash_and_add_terminals(operator, arg_types) self._pset.addPrimitive(CombineDFs(), [step_in_type, step_in_type], step_in_type) else: + gp_types = {} for idx, step in enumerate(self.template_comp): # input class in each step if idx: @@ -509,8 +514,7 @@ def _add_operators(self): ret_types.append(step_ret_type) else: step_ret_type = Output_Array - - if step == 'CombineDFs': # somehows CombineDFs only accept np.ndarray as input/ret + if step == 'CombineDFs': self._pset.addPrimitive(CombineDFs(), [step_in_type, step_in_type], step_in_type) elif main_type.count(step): # if the step is a main type for operator in self.operators: @@ -518,23 +522,24 @@ def _add_operators(self): if operator.type() == step: p_types = ([step_in_type] + arg_types, step_ret_type) self._pset.addPrimitive(operator, *p_types) - if not op_list.count(operator.__name__): - self._import_hash(operator) - self._add_terminals(arg_types) - op_list.append(operator.__name__) + self._import_hash_and_add_terminals(operator, arg_types) else: # is the step is a specific operator for operator in self.operators: arg_types = operator.parameter_types()[0][1:] if operator.__name__ == step: p_types = ([step_in_type] + arg_types, step_ret_type) self._pset.addPrimitive(operator, *p_types) - if not op_list.count(operator.__name__): - 
self._import_hash(operator) - self._add_terminals(arg_types) - op_list.append(operator.__name__) + self._import_hash_and_add_terminals(operator, arg_types) self.ret_types = [np.ndarray, Output_Array] + ret_types + def _import_hash_and_add_terminals(self, operator, arg_types): + if not self.op_list.count(operator.__name__): + self._import_hash(operator) + self._add_terminals(arg_types) + self.op_list.append(operator.__name__) + + def _import_hash(self, operator): # Import required modules into local namespace so that pipelines # may be evaluated directly diff --git a/tpot/decorators.py b/tpot/decorators.py index e8b6a17e..43c05a14 100644 --- a/tpot/decorators.py +++ b/tpot/decorators.py @@ -64,12 +64,17 @@ def check_pipeline(self, *args, **kwargs): # clone individual before each func call so it is not altered for # the possible next cycle loop args = [self._toolbox.clone(arg) if isinstance(arg, creator.Individual) else arg for arg in args] - try: with warnings.catch_warnings(): warnings.simplefilter('ignore') - - expr = func(self, *args, **kwargs) + expr = None + num_test_expr = 0 + while not expr and num_test_expr < int(NUM_TESTS/2): + try: + expr = func(self, *args, **kwargs) + except: + num_test_expr += 1 + pass # mutation operator returns tuple (ind,); crossover operator # returns tuple of (ind1, ind2) expr_tuple = expr if isinstance(expr, tuple) else (expr,) From d81e3c9e6b02291a069b80c985bc6962ee79971d Mon Sep 17 00:00:00 2001 From: weixuanfu2016 Date: Mon, 18 Jun 2018 14:51:29 -0400 Subject: [PATCH 30/60] update base for pd df input --- tpot/base.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tpot/base.py b/tpot/base.py index 6a34de12..f112f238 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -40,6 +40,7 @@ from shutil import rmtree import numpy as np +from pandas import DataFrame from scipy import sparse import deap from deap import base, creator, tools, gp @@ -1102,7 +1103,11 @@ def _check_dataset(self, features, target): features = self._impute_values(features) try: X, y = check_X_y(features, target, accept_sparse=True, dtype=None) - return X, y + if isinstance(features, DataFrame): + return features, target + else: + return X, y + except (AssertionError, ValueError): raise ValueError( 'Error: Input data is not in a valid format. Please confirm ' From dcda20d091644f960a2451f1738c38f3a22fc77a Mon Sep 17 00:00:00 2001 From: weixuanfu2016 Date: Thu, 28 Jun 2018 11:38:50 -0400 Subject: [PATCH 31/60] add support for CLI for template --- docs_sources/using.md | 8 ++++++++ tests/driver_tests.py | 3 +++ tpot/driver.py | 12 ++++++++++++ 3 files changed, 23 insertions(+) diff --git a/docs_sources/using.md b/docs_sources/using.md index eda558aa..9d949317 100644 --- a/docs_sources/using.md +++ b/docs_sources/using.md @@ -282,6 +282,14 @@ See the built-in configurations +-template +TEMPLATE +String +Template for pipeline structure. +

+For example: -template 'Selector-Transformer-Classifier' + + -memory MEMORY String or file path diff --git a/tests/driver_tests.py b/tests/driver_tests.py index c39bfba8..cd37c124 100644 --- a/tests/driver_tests.py +++ b/tests/driver_tests.py @@ -242,6 +242,7 @@ def test_default_param(self): self.assertEqual(args.SUBSAMPLE, 1.0) self.assertEqual(args.SCORING_FN, None) self.assertEqual(args.TARGET_NAME, 'class') + self.assertEqual(args.TEMPLATE, 'RandomTree') self.assertEqual(args.TPOT_MODE, 'classification') self.assertEqual(args.VERBOSITY, 1) @@ -278,6 +279,7 @@ def test_print_args(self): SCORING_FN = accuracy SUBSAMPLE = 1.0 TARGET_NAME = class +TEMPLATE = RandomTree TPOT_MODE = classification VERBOSITY = 1 @@ -320,6 +322,7 @@ def test_print_args_2(self): SCORING_FN = neg_mean_squared_error SUBSAMPLE = 1.0 TARGET_NAME = class +TEMPLATE = RandomTree TPOT_MODE = regression VERBOSITY = 1 diff --git a/tpot/driver.py b/tpot/driver.py index 8ca54764..d854d183 100755 --- a/tpot/driver.py +++ b/tpot/driver.py @@ -354,6 +354,17 @@ def _get_arg_parser(): ) ) + parser.add_argument( + '-template', + action='store', + dest='TEMPLATE', + default='RandomTree', + type=str, + help=( + 'Template for pipeline structure' + ) + ) + parser.add_argument( '-memory', @@ -523,6 +534,7 @@ def tpot_driver(args): max_eval_time_mins=args.MAX_EVAL_MINS, random_state=args.RANDOM_STATE, config_dict=args.CONFIG_FILE, + template=args.TEMPLATE, memory=args.MEMORY, periodic_checkpoint_folder=args.CHECKPOINT_FOLDER, early_stop=args.EARLY_STOP, From cb3e6e6ce7dd457c31e9966765a2e960771acb10 Mon Sep 17 00:00:00 2001 From: weixuanfu2016 Date: Thu, 28 Jun 2018 11:41:30 -0400 Subject: [PATCH 32/60] clean docs --- docs_sources/using.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs_sources/using.md b/docs_sources/using.md index 9d949317..6c2d6e3b 100644 --- a/docs_sources/using.md +++ b/docs_sources/using.md @@ -287,7 +287,7 @@ See the
built-in configurations String Template for pipeline structure.

-For example: -template 'Selector-Transformer-Classifier' +For example: -template Selector-Transformer-Classifier -memory From c1f0095c196ce5cf358eca95d69b8a016410be84 Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Tue, 7 Aug 2018 12:21:41 -0400 Subject: [PATCH 33/60] fix fail pipeline during init and mutation --- tpot/base.py | 10 ++++++++++ tpot/builtins/dataset_selector.py | 5 +++++ tpot/decorators.py | 19 +++++-------------- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/tpot/base.py b/tpot/base.py index 90c6d661..b2217534 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -521,6 +521,7 @@ def _add_operators(self): else: gp_types = {} for idx, step in enumerate(self.template_comp): + # input class in each step if idx: step_in_type = ret_types[-1] @@ -642,6 +643,8 @@ def fit(self, features, target, sample_weight=None, groups=None): features, target = self._check_dataset(features, target, sample_weight) + self.pretest_X, _, self.pretest_y, _ = train_test_split(features, target, train_size=min(50, features.shape[0]), random_state=self.random_state) + # Randomly collect a subsample of training samples for pipeline optimization process. if self.subsample < 1.0: features, _, target, _ = train_test_split(features, target, train_size=self.subsample, random_state=self.random_state) @@ -1343,6 +1346,12 @@ def _preprocess_individuals(self, individuals): # Disallow certain combinations of operators because they will take too long or take up too much RAM # This is a fairly hacky way to prevent TPOT from getting stuck on bad pipelines and should be improved in a future release individual_str = str(individual) + if not len(individual): # a pipeline cannot be randomly generated + self.evaluated_individuals_[individual_str] = self._combine_individual_stats(5000., + -float('inf'), + individual.statistics) + self._update_pbar(pbar_msg='Invalid pipeline encountered. Skipping its evaluation.') + continue sklearn_pipeline_str = generate_pipeline_code(expr_to_tree(individual, self._pset), self.operators) if sklearn_pipeline_str.count('PolynomialFeatures') > 1: self.evaluated_individuals_[individual_str] = self._combine_individual_stats(5000., @@ -1540,6 +1549,7 @@ def condition(height, depth, type_): return self._generate(pset, min_, max_, condition, type_) + def _operator_count(self, individual): """Count the number of pipeline operators as a measure of pipeline complexity. diff --git a/tpot/builtins/dataset_selector.py b/tpot/builtins/dataset_selector.py index a649a701..fcbe6de5 100644 --- a/tpot/builtins/dataset_selector.py +++ b/tpot/builtins/dataset_selector.py @@ -14,6 +14,11 @@ class DatasetSelector(BaseEstimator, TransformerMixin): """Select predefined data subsets.""" + @property + def __name__(self): + """Instance name is the same as the class name.""" + return self.__class__.__name__ + def __init__(self, subset_dir, sel_subset_fname): """Create a DatasetSelector object. 
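The `__name__` property added to DatasetSelector above is a small pattern worth noting: functions and classes carry a `__name__` attribute but instances normally do not, so exposing the class name at instance level keeps such objects usable wherever a `__name__` lookup is expected. A self-contained sketch (class name is illustrative):

```python
# Instance-level __name__ mirroring the class name, as done for DatasetSelector.
class DatasetSelectorLike:
    @property
    def __name__(self):
        """Instance name is the same as the class name."""
        return self.__class__.__name__

print(DatasetSelectorLike().__name__)  # 'DatasetSelectorLike'
```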
diff --git a/tpot/decorators.py b/tpot/decorators.py index 43c05a14..adf519eb 100644 --- a/tpot/decorators.py +++ b/tpot/decorators.py @@ -26,18 +26,11 @@ from __future__ import print_function from functools import wraps import warnings -from sklearn.datasets import make_classification, make_regression from .export_utils import expr_to_tree, generate_pipeline_code from deap import creator NUM_TESTS = 10 -# generate a small data set for a new pipeline, in order to check if the pipeline -# has unsuppported combinations in params -pretest_X, pretest_y = make_classification(n_samples=100, n_features=10, random_state=42) -pretest_X_reg, pretest_y_reg = make_regression(n_samples=100, n_features=10, random_state=42) - - def _pre_test(func): """Check if the wrapped function works with a pretest data set. @@ -67,9 +60,10 @@ def check_pipeline(self, *args, **kwargs): try: with warnings.catch_warnings(): warnings.simplefilter('ignore') - expr = None + expr = [] num_test_expr = 0 - while not expr and num_test_expr < int(NUM_TESTS/2): + # to ensure a pipeline can be generated or mutated. + while not expr and num_test_expr < NUM_TESTS/2: try: expr = func(self, *args, **kwargs) except: @@ -86,10 +80,8 @@ def check_pipeline(self, *args, **kwargs): ) sklearn_pipeline = eval(pipeline_code, self.operators_context) - if self.classification: - sklearn_pipeline.fit(pretest_X, pretest_y) - else: - sklearn_pipeline.fit(pretest_X_reg, pretest_y_reg) + sklearn_pipeline.fit(self.pretest_X, self.pretest_y) + bad_pipeline = False except BaseException as e: message = '_pre_test decorator: {fname}: num_test={n} {e}'.format( @@ -101,7 +93,6 @@ def check_pipeline(self, *args, **kwargs): self._update_pbar(pbar_num=0, pbar_msg=message) finally: num_test += 1 - return expr return check_pipeline From 38e6a559d579ecdbed3ef9a8276b95a92e0dba3c Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Tue, 7 Aug 2018 12:28:58 -0400 Subject: [PATCH 34/60] fix unit tests --- tests/export_tests.py | 9 +++++---- tpot/decorators.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/export_tests.py b/tests/export_tests.py index a7ee1791..b5232623 100644 --- a/tests/export_tests.py +++ b/tests/export_tests.py @@ -62,10 +62,10 @@ def test_export_random_ind(): pipeline = tpot_obj._toolbox.individual() expected_code = """import numpy as np import pandas as pd -from sklearn.feature_selection import SelectPercentile, f_classif +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.feature_selection import VarianceThreshold from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline -from sklearn.tree import DecisionTreeClassifier # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) @@ -74,13 +74,14 @@ def test_export_random_ind(): train_test_split(features, tpot_data['target'].values, random_state=42) exported_pipeline = make_pipeline( - SelectPercentile(score_func=f_classif, percentile=65), - DecisionTreeClassifier(criterion="gini", max_depth=7, min_samples_leaf=4, min_samples_split=18) + VarianceThreshold(threshold=0.05), + GradientBoostingClassifier(learning_rate=0.01, max_depth=5, max_features=0.9000000000000001, min_samples_leaf=11, min_samples_split=17, n_estimators=100, subsample=0.25) ) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) """ + assert expected_code == export_pipeline(pipeline, 
tpot_obj.operators, tpot_obj._pset) diff --git a/tpot/decorators.py b/tpot/decorators.py index adf519eb..6bdc2983 100644 --- a/tpot/decorators.py +++ b/tpot/decorators.py @@ -63,7 +63,7 @@ def check_pipeline(self, *args, **kwargs): expr = [] num_test_expr = 0 # to ensure a pipeline can be generated or mutated. - while not expr and num_test_expr < NUM_TESTS/2: + while not len(expr) and num_test_expr < int(NUM_TESTS/2): try: expr = func(self, *args, **kwargs) except: From 7d04263434496098e147f01b69facce9e455b8f1 Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Tue, 7 Aug 2018 12:33:51 -0400 Subject: [PATCH 35/60] clean codes --- tests/tpot_tests.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/tpot_tests.py b/tests/tpot_tests.py index a417de9c..2528fdf9 100644 --- a/tests/tpot_tests.py +++ b/tests/tpot_tests.py @@ -30,7 +30,6 @@ from tpot.gp_deap import mutNodeReplacement, _wrapped_cross_val_score, pick_two_individuals_eligible_for_crossover, cxOnePoint, varOr, initialize_stats_dict from tpot.metrics import balanced_accuracy, SCORERS from tpot.operator_utils import TPOTOperatorClassFactory, set_sample_weight -from tpot.decorators import pretest_X, pretest_y from tpot.config.classifier import classifier_config_dict from tpot.config.classifier_light import classifier_config_dict_light @@ -54,7 +53,7 @@ from tempfile import mkdtemp from shutil import rmtree -from sklearn.datasets import load_digits, load_boston +from sklearn.datasets import load_digits, load_boston, make_classification from sklearn.model_selection import train_test_split, cross_val_score, GroupKFold from sklearn.externals.joblib import Memory from sklearn.metrics import make_scorer, roc_auc_score @@ -92,6 +91,9 @@ def closing(arg): training_features_r, testing_features_r, training_target_r, testing_target_r = \ train_test_split(boston_data.data, boston_data.target, random_state=42) +# Set up a small test dataset + +pretest_X, pretest_y = make_classification(n_samples=100, n_features=10, random_state=42) # Set up pandas DataFrame for testing input_data = pd.read_csv( @@ -1764,7 +1766,7 @@ def test_PolynomialFeatures_exception(): initialize_stats_dict(pipeline) fitness_scores = tpot_obj._evaluate_individuals(pipelines, pretest_X, pretest_y) - + assert fitness_scores[0][0] == 2 assert fitness_scores[1][0] == 5000.0 From f152452892583488be8c17bc0657bf6c6cd3a4de Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Tue, 7 Aug 2018 13:08:40 -0400 Subject: [PATCH 36/60] clean codes --- tpot/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tpot/base.py b/tpot/base.py index b2217534..6a32033a 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -643,11 +643,11 @@ def fit(self, features, target, sample_weight=None, groups=None): features, target = self._check_dataset(features, target, sample_weight) - self.pretest_X, _, self.pretest_y, _ = train_test_split(features, target, train_size=min(50, features.shape[0]), random_state=self.random_state) + self.pretest_X, _, self.pretest_y, _ = train_test_split(features, target, train_size=min(50, features.shape[0]), test_size=None, random_state=self.random_state) # Randomly collect a subsample of training samples for pipeline optimization process. 
if self.subsample < 1.0: - features, _, target, _ = train_test_split(features, target, train_size=self.subsample, random_state=self.random_state) + features, _, target, _ = train_test_split(features, target, train_size=self.subsample, test_size=None, random_state=self.random_state) # Raise a warning message if the training size is less than 1500 when subsample is not default value if features.shape[0] < 1500: print( From 5330515828bcce7a9a169005efd4979d6b757c73 Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Wed, 22 Aug 2018 13:24:10 -0400 Subject: [PATCH 37/60] refine return from dataset selector and use object for ret type --- tpot/base.py | 2 +- tpot/builtins/dataset_selector.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tpot/base.py b/tpot/base.py index 6a32033a..911f5157 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -531,7 +531,7 @@ def _add_operators(self): if idx < len(self.template_comp) - 1: # create an empty for returning class for strongly-type GP step_ret_type_name = 'Ret_{}'.format(idx) - step_ret_type = type(step_ret_type_name, (np.ndarray,), {}) + step_ret_type = type(step_ret_type_name, (object,), {}) ret_types.append(step_ret_type) else: step_ret_type = Output_Array diff --git a/tpot/builtins/dataset_selector.py b/tpot/builtins/dataset_selector.py index fcbe6de5..3125b5bd 100644 --- a/tpot/builtins/dataset_selector.py +++ b/tpot/builtins/dataset_selector.py @@ -81,7 +81,7 @@ def transform(self, X): The transformed feature set. """ if isinstance(X, pd.DataFrame): - X_transformed = X[self.feat_list].values + X_transformed = X[self.feat_list] elif isinstance(X, np.ndarray): X_transformed = X[:, self.feat_list] From ca3fdab4819d2f3b2d4e4c1869efac526f9c9acd Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Wed, 12 Sep 2018 11:11:07 -0400 Subject: [PATCH 38/60] fix unit test --- tests/test_dask_based.py | 4 ++-- tests/tpot_tests.py | 6 +++++- tpot/base.py | 10 ++++++++-- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/tests/test_dask_based.py b/tests/test_dask_based.py index b042edab..0b2f2c89 100644 --- a/tests/test_dask_based.py +++ b/tests/test_dask_based.py @@ -28,7 +28,7 @@ def test_dask_matches(self): cv=3, random_state=0, n_jobs=n_jobs, - use_dask=False, + use_dask=False ) b = TPOTClassifier( generations=2, @@ -36,7 +36,7 @@ def test_dask_matches(self): cv=3, random_state=0, n_jobs=n_jobs, - use_dask=True, + use_dask=True ) b.fit(X, y) a.fit(X, y) diff --git a/tests/tpot_tests.py b/tests/tpot_tests.py index 54cb2b07..15300410 100644 --- a/tests/tpot_tests.py +++ b/tests/tpot_tests.py @@ -1563,6 +1563,7 @@ def test_check_dataset(): verbosity=0, config_dict='TPOT light' ) + tpot_obj._fit_init() ret_features, ret_target = tpot_obj._check_dataset(training_features, training_target) assert np.allclose(ret_features, training_features) @@ -1579,6 +1580,7 @@ def test_check_dataset_2(): verbosity=0, config_dict='TPOT light' ) + tpot_obj._fit_init() test_sample_weight = list(range(1, len(training_target)+1)) ret_features, ret_target = tpot_obj._check_dataset(training_features, training_target, test_sample_weight) test_sample_weight[0] = 'opps' @@ -1596,6 +1598,7 @@ def test_check_dataset_3(): verbosity=0, config_dict='TPOT light' ) + tpot_obj._fit_init() test_sample_weight = list(range(1, len(training_target)+1)) ret_features, ret_target = tpot_obj._check_dataset(training_features, training_target, test_sample_weight) test_sample_weight[0] = np.nan @@ -1613,6 +1616,7 @@ def test_check_dataset_4(): verbosity=0, config_dict='TPOT light' ) + 
tpot_obj._fit_init() test_sample_weight = list(range(1, len(training_target))) assert_raises(ValueError, tpot_obj._check_dataset, training_features, training_target, test_sample_weight) @@ -1627,7 +1631,7 @@ def test_check_dataset_5(): verbosity=0, config_dict='TPOT light' ) - + tpot_obj._fit_init() ret_features = tpot_obj._check_dataset(training_features, target=None) assert np.allclose(ret_features, training_features) diff --git a/tpot/base.py b/tpot/base.py index 4c84492f..44832031 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -1145,8 +1145,14 @@ def _check_dataset(self, features, target, sample_weight=None): 'customized config dictionary supports sparse matriies.' ) else: - if np.any(np.isnan(features)): - self._imputed = True + if isinstance(features, np.ndarray): + if np.any(np.isnan(features)): + self._imputed = True + elif isinstance(features, DataFrame): + if features.isnull().values.any(): + self._imputed = True + + if self._imputed: features = self._impute_values(features) try: From b77a5b6b0adefa6f45f0a6ca98c65898c5693ca0 Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Wed, 12 Sep 2018 11:13:41 -0400 Subject: [PATCH 39/60] change random set for fixing a unit test --- tests/test_dask_based.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_dask_based.py b/tests/test_dask_based.py index 0b2f2c89..6ac9c481 100644 --- a/tests/test_dask_based.py +++ b/tests/test_dask_based.py @@ -26,7 +26,7 @@ def test_dask_matches(self): generations=2, population_size=5, cv=3, - random_state=0, + random_state=42, n_jobs=n_jobs, use_dask=False ) @@ -34,7 +34,7 @@ def test_dask_matches(self): generations=2, population_size=5, cv=3, - random_state=0, + random_state=42, n_jobs=n_jobs, use_dask=True ) From 3b348e961310a7434d9359bd69fa2131e644f500 Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Mon, 17 Sep 2018 16:34:37 -0400 Subject: [PATCH 40/60] fix bug in decorators for mut/xo --- tpot/base.py | 25 ++++++++------ tpot/builtins/dataset_selector.py | 6 ++-- tpot/decorators.py | 54 ++++++++++++++++++------------- tpot/gp_deap.py | 4 +-- 4 files changed, 53 insertions(+), 36 deletions(-) diff --git a/tpot/base.py b/tpot/base.py index 44832031..636fee09 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -466,11 +466,12 @@ def _add_operators(self): self._pset.addPrimitive(CombineDFs(), [step_in_type, step_in_type], step_in_type) elif main_type.count(step): # if the step is a main type for operator in self.operators: - arg_types = operator.parameter_types()[0][1:] - if operator.type() == step: - p_types = ([step_in_type] + arg_types, step_ret_type) - self._pset.addPrimitive(operator, *p_types) - self._import_hash_and_add_terminals(operator, arg_types) + if operator.__name__ not in self.op_list: + arg_types = operator.parameter_types()[0][1:] + if operator.type() == step: + p_types = ([step_in_type] + arg_types, step_ret_type) + self._pset.addPrimitive(operator, *p_types) + self._import_hash_and_add_terminals(operator, arg_types) else: # is the step is a specific operator for operator in self.operators: arg_types = operator.parameter_types()[0][1:] @@ -657,9 +658,9 @@ def fit(self, features, target, sample_weight=None, groups=None): """ self._fit_init() - features, target = self._check_dataset(features, target, sample_weight) + self.pretest_X, _, self.pretest_y, _ = train_test_split(features, target, train_size=min(50, features.shape[0]), test_size=None, random_state=self.random_state) # Randomly collect a subsample of training samples for pipeline optimization process. 
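The `_check_dataset` change above needs two different missing-value checks because NumPy arrays and pandas DataFrames expose missingness differently. A stand-alone sketch of that branch (the helper name is illustrative, not TPOT API):

```python
# np.isnan covers numeric ndarrays; isnull() is the safer check for a DataFrame.
import numpy as np
import pandas as pd

def needs_imputation(features):
    if isinstance(features, np.ndarray):
        return bool(np.any(np.isnan(features)))
    if isinstance(features, pd.DataFrame):
        return bool(features.isnull().values.any())
    return False

print(needs_imputation(np.array([[1.0, np.nan]])))         # True
print(needs_imputation(pd.DataFrame({'a': [1.0, None]})))  # True
```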
@@ -1151,17 +1152,23 @@ def _check_dataset(self, features, target, sample_weight=None): elif isinstance(features, DataFrame): if features.isnull().values.any(): self._imputed = True - + if self._imputed: features = self._impute_values(features) try: if target is not None: X, y = check_X_y(features, target, accept_sparse=True, dtype=None) - return X, y + if self._imputed: + return X, y + else: + return features, target else: X = check_array(features, accept_sparse=True, dtype=None) - return X + if self._imputed: + return X + else: + return features except (AssertionError, ValueError): raise ValueError( 'Error: Input data is not in a valid format. Please confirm ' diff --git a/tpot/builtins/dataset_selector.py b/tpot/builtins/dataset_selector.py index 3125b5bd..79f06073 100644 --- a/tpot/builtins/dataset_selector.py +++ b/tpot/builtins/dataset_selector.py @@ -65,6 +65,8 @@ def fit(self, X, y=None): self.feature_names = list(range(X.shape[1])) feature_i = [int(val) for val in features_i_df.values.flatten()] self.feat_list = list(set(feature_i).intersection(set(self.feature_names))) + if not len(self.feat_list): + raise ValueError('No feature is found on the subset list!') return self def transform(self, X): @@ -81,8 +83,8 @@ def transform(self, X): The transformed feature set. """ if isinstance(X, pd.DataFrame): - X_transformed = X[self.feat_list] + X_transformed = X[self.feat_list].values elif isinstance(X, np.ndarray): X_transformed = X[:, self.feat_list] - return X_transformed + return X_transformed.astype(np.float64) diff --git a/tpot/decorators.py b/tpot/decorators.py index 6bdc2983..f03ac5e7 100644 --- a/tpot/decorators.py +++ b/tpot/decorators.py @@ -58,41 +58,49 @@ def check_pipeline(self, *args, **kwargs): # the possible next cycle loop args = [self._toolbox.clone(arg) if isinstance(arg, creator.Individual) else arg for arg in args] try: - with warnings.catch_warnings(): - warnings.simplefilter('ignore') - expr = [] - num_test_expr = 0 - # to ensure a pipeline can be generated or mutated. - while not len(expr) and num_test_expr < int(NUM_TESTS/2): - try: - expr = func(self, *args, **kwargs) - except: - num_test_expr += 1 - pass - # mutation operator returns tuple (ind,); crossover operator - # returns tuple of (ind1, ind2) - expr_tuple = expr if isinstance(expr, tuple) else (expr,) - - for expr_test in expr_tuple: - pipeline_code = generate_pipeline_code( - expr_to_tree(expr_test, self._pset), - self.operators - ) - sklearn_pipeline = eval(pipeline_code, self.operators_context) + if func.__name__ == "_generate": + expr = [] + else: + expr = tuple(args) + pass_gen = False + num_test_expr = 0 + # to ensure a pipeline can be generated or mutated. 
+ while not pass_gen and num_test_expr < int(NUM_TESTS/2): + try: + expr = func(self, *args, **kwargs) + pass_gen = True + except: + num_test_expr += 1 + pass + # mutation operator returns tuple (ind,); crossover operator + # returns tuple of (ind1, ind2) + + expr_tuple = expr if isinstance(expr, tuple) else (expr,) + for expr_test in expr_tuple: + pipeline_code = generate_pipeline_code( + expr_to_tree(expr_test, self._pset), + self.operators + ) + sklearn_pipeline = eval(pipeline_code, self.operators_context) + with warnings.catch_warnings(): + warnings.simplefilter('ignore') sklearn_pipeline.fit(self.pretest_X, self.pretest_y) - bad_pipeline = False + bad_pipeline = False except BaseException as e: - message = '_pre_test decorator: {fname}: num_test={n} {e}'.format( + message = '_pre_test decorator: {fname}: num_test={n} {e}.'.format( n=num_test, fname=func.__name__, e=e + ) # Use the pbar output stream if it's active self._update_pbar(pbar_num=0, pbar_msg=message) finally: num_test += 1 + return expr + return check_pipeline diff --git a/tpot/gp_deap.py b/tpot/gp_deap.py index 6757ef52..10ecd4f3 100644 --- a/tpot/gp_deap.py +++ b/tpot/gp_deap.py @@ -242,10 +242,10 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar, # after each population save a periodic pipeline if per_generation_function is not None: per_generation_function() - # Vary the population offspring = varOr(population, toolbox, lambda_, cxpb, mutpb) + # Update generation statistic for all individuals which have invalid 'generation' stats # This hold for individuals that have been altered in the varOr function for ind in population: @@ -349,7 +349,7 @@ def mutNodeReplacement(individual, pset): index = np.random.randint(0, len(individual)) node = individual[index] slice_ = individual.searchSubtree(index) - + if node.arity == 0: # Terminal term = np.random.choice(pset.terminals[node.ret]) if isclass(term): From b614de9b66bf2f92406f620a407e20033170509f Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Tue, 18 Sep 2018 09:25:50 -0400 Subject: [PATCH 41/60] better pset --- tpot/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpot/base.py b/tpot/base.py index 636fee09..a40c9d69 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -466,7 +466,7 @@ def _add_operators(self): self._pset.addPrimitive(CombineDFs(), [step_in_type, step_in_type], step_in_type) elif main_type.count(step): # if the step is a main type for operator in self.operators: - if operator.__name__ not in self.op_list: + if operator.__name__ != 'DatasetSelector': arg_types = operator.parameter_types()[0][1:] if operator.type() == step: p_types = ([step_in_type] + arg_types, step_ret_type) From 37f268eeed160994bb8ceab752b8b11d058cdccf Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Tue, 18 Sep 2018 09:45:22 -0400 Subject: [PATCH 42/60] better min and max --- tpot/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tpot/base.py b/tpot/base.py index a40c9d69..72d4db32 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -308,6 +308,7 @@ def _setup_template(self, template): for comp in self.template_comp: if comp == 'CombineDFs': self._max += 2 + self._min += 1 else: self._max += 1 self._min += 1 @@ -466,7 +467,7 @@ def _add_operators(self): self._pset.addPrimitive(CombineDFs(), [step_in_type, step_in_type], step_in_type) elif main_type.count(step): # if the step is a main type for operator in self.operators: - if operator.__name__ != 'DatasetSelector': + if operator.__name__ != 'DatasetSelector': # dataset 
selector is not considered as a main type arg_types = operator.parameter_types()[0][1:] if operator.type() == step: p_types = ([step_in_type] + arg_types, step_ret_type) From ef29cd9bde8695645d63ae7f568fb298809e7b19 Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Mon, 15 Oct 2018 14:01:49 -0400 Subject: [PATCH 43/60] refine dataset selector --- tests/dataset_selector_tests.py | 37 +++++++++++- tests/subset_test.csv | 3 + tests/test_subset_dir/test_subset_1.snp | 6 -- tests/test_subset_dir/test_subset_2.snp | 7 --- tpot/builtins/dataset_selector.py | 75 ++++++++++++++++++------- 5 files changed, 94 insertions(+), 34 deletions(-) create mode 100644 tests/subset_test.csv delete mode 100644 tests/test_subset_dir/test_subset_1.snp delete mode 100644 tests/test_subset_dir/test_subset_2.snp diff --git a/tests/dataset_selector_tests.py b/tests/dataset_selector_tests.py index 3cb1cb8f..d4c3b4b5 100644 --- a/tests/dataset_selector_tests.py +++ b/tests/dataset_selector_tests.py @@ -33,7 +33,7 @@ def test_DatasetSelector_1(): """Assert that the StackingEstimator returns transformed X based on test feature list 1.""" - ds = DatasetSelector(subset_dir="tests/test_subset_dir", sel_subset_fname="test_subset_1.snp") + ds = DatasetSelector(subset_list="tests/subset_test.csv", sel_subset="test_subset_1") ds.fit(test_X, y=None) transformed_X = ds.transform(test_X) @@ -44,7 +44,7 @@ def test_DatasetSelector_1(): def test_DatasetSelector_2(): """Assert that the StackingEstimator returns transformed X based on test feature list 2.""" - ds = DatasetSelector(subset_dir="tests/test_subset_dir", sel_subset_fname="test_subset_2.snp") + ds = DatasetSelector(subset_list="tests/subset_test.csv", sel_subset="test_subset_2") ds.fit(test_X, y=None) transformed_X = ds.transform(test_X) @@ -52,3 +52,36 @@ def test_DatasetSelector_2(): assert transformed_X.shape[1] != test_X.shape[1] assert transformed_X.shape[1] == 6 assert np.array_equal(transformed_X, test_X[ds.feat_list].values) + +def test_DatasetSelector_3(): + """Assert that the StackingEstimator returns transformed X based on 2 subsets' names""" + ds = DatasetSelector(subset_list="tests/subset_test.csv", sel_subset=["test_subset_1", "test_subset_2"]) + ds.fit(test_X, y=None) + transformed_X = ds.transform(test_X) + + assert transformed_X.shape[0] == test_X.shape[0] + assert transformed_X.shape[1] != test_X.shape[1] + assert transformed_X.shape[1] == 7 + assert np.array_equal(transformed_X, test_X[ds.feat_list].values) + +def test_DatasetSelector_4(): + """Assert that the StackingEstimator returns transformed X based on 2 subsets' indexs""" + ds = DatasetSelector(subset_list="tests/subset_test.csv", sel_subset=[0, 1]) + ds.fit(test_X, y=None) + transformed_X = ds.transform(test_X) + + assert transformed_X.shape[0] == test_X.shape[0] + assert transformed_X.shape[1] != test_X.shape[1] + assert transformed_X.shape[1] == 7 + assert np.array_equal(transformed_X, test_X[ds.feat_list].values) + +def test_DatasetSelector_5(): + """Assert that the StackingEstimator returns transformed X seleced based on test feature list 1's index.""" + ds = DatasetSelector(subset_list="tests/subset_test.csv", sel_subset=0) + ds.fit(test_X, y=None) + transformed_X = ds.transform(test_X) + + assert transformed_X.shape[0] == test_X.shape[0] + assert transformed_X.shape[1] != test_X.shape[1] + assert transformed_X.shape[1] == 5 + assert np.array_equal(transformed_X, test_X[ds.feat_list].values) diff --git a/tests/subset_test.csv b/tests/subset_test.csv new file mode 100644 index 00000000..2b86a402 
--- /dev/null +++ b/tests/subset_test.csv @@ -0,0 +1,3 @@ +Subset,Size,Features +test_subset_1,5,2;4;9;11;14 +test_subset_2,6,2;3;4;5;9;11 diff --git a/tests/test_subset_dir/test_subset_1.snp b/tests/test_subset_dir/test_subset_1.snp deleted file mode 100644 index ff28bcf0..00000000 --- a/tests/test_subset_dir/test_subset_1.snp +++ /dev/null @@ -1,6 +0,0 @@ -test_list_1 -2 -4 -9 -11 -14 diff --git a/tests/test_subset_dir/test_subset_2.snp b/tests/test_subset_dir/test_subset_2.snp deleted file mode 100644 index 0b28d022..00000000 --- a/tests/test_subset_dir/test_subset_2.snp +++ /dev/null @@ -1,7 +0,0 @@ -test_list_2 -2 -3 -4 -5 -9 -11 diff --git a/tpot/builtins/dataset_selector.py b/tpot/builtins/dataset_selector.py index 79f06073..763c61c1 100644 --- a/tpot/builtins/dataset_selector.py +++ b/tpot/builtins/dataset_selector.py @@ -1,9 +1,25 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -""" -Created on Fri Mar 23 2018 +"""This file is part of the TPOT library. + +TPOT was primarily developed at the University of Pennsylvania by: + - Randal S. Olson (rso@randalolson.com) + - Weixuan Fu (weixuanf@upenn.edu) + - Daniel Angell (dpa34@drexel.edu) + - and many more generous open source contributors + +TPOT is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as +published by the Free Software Foundation, either version 3 of +the License, or (at your option) any later version. -@author: grixor +TPOT is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with TPOT. If not, see . """ import numpy as np import pandas as pd @@ -19,26 +35,31 @@ def __name__(self): """Instance name is the same as the class name.""" return self.__class__.__name__ - def __init__(self, subset_dir, sel_subset_fname): + def __init__(self, subset_list, sel_subset): """Create a DatasetSelector object. Parameters ---------- - subset_dir: directory, required - Path to folder that stores the feature list files. Currently, - each file needs to be a .csv with one header row. The feature - names in these files must match those in the (training and + subset_list: string, required + Path to a file that indicates all the subset lists. Currently, + this file needs to be a .csv with one header row. + There should be 3 columns on the table, including subset names (Subset), + number of features (Size) and features in the subset (Features). + The feature names or indexs of input features + should be seprated by ';' on the 3rd column of the file. + The feature names in the files must match those in the (training and testing) dataset. 
- sel_subset_fname: string, required - File name of subset - + sel_subset: int or string or list + int: index of subset in subset file + string: subset name of subset + list: list of int or string for indexs or subset names Returns ------- None """ - self.subset_dir = subset_dir - self.sel_subset_fname = sel_subset_fname + self.subset_list = subset_list + self.sel_subset = sel_subset def fit(self, X, y=None): """Fit DatasetSelector for feature selection @@ -55,16 +76,32 @@ def fit(self, X, y=None): self: object Returns a copy of the estimator """ - subset_files = os.listdir(self.subset_dir) - self.subset_i = self.subset_dir + "/" + self.sel_subset_fname - features_i_df = pd.read_csv(self.subset_i, sep='\t', header=0) + subset_df = pd.read_csv(self.subset_list, header=0, index_col=0) + + if isinstance(self.sel_subset, int): + self.sel_subset_name = subset_df.index[self.sel_subset] + elif isinstance(self.sel_subset, list): + self.sel_subset_name = [] + for s in self.sel_subset: + if isinstance(s, int): + self.sel_subset_name.append(subset_df.index[s]) + else: + self.sel_subset_name.append(s) + else: # self.sel_subset is a string + self.sel_subset_name = self.sel_subset + + sel_features = subset_df.loc[self.sel_subset_name, 'Features'] + if not isinstance(sel_features, str): + sel_features = ";".join(sel_features.tolist()) + + sel_uniq_features = set(sel_features.split(';')) + if isinstance(X, pd.DataFrame): # use columns' names self.feature_names = list(X.columns.values) - feature_i = [str(val) for val in features_i_df.values.flatten()] elif isinstance(X, np.ndarray): # use index self.feature_names = list(range(X.shape[1])) - feature_i = [int(val) for val in features_i_df.values.flatten()] - self.feat_list = list(set(feature_i).intersection(set(self.feature_names))) + sel_uniq_features = [int(val) for val in sel_uniq_features] + self.feat_list = list(set(sel_uniq_features).intersection(set(self.feature_names))) if not len(self.feat_list): raise ValueError('No feature is found on the subset list!') return self From fc6545ebd6cd1e5ec1ee981ebc6a3efd3b7bd561 Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Mon, 15 Oct 2018 15:41:07 -0400 Subject: [PATCH 44/60] fix a bug --- tests/subset_test.csv | 1 + tpot/builtins/dataset_selector.py | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/subset_test.csv b/tests/subset_test.csv index 2b86a402..d6ba014d 100644 --- a/tests/subset_test.csv +++ b/tests/subset_test.csv @@ -1,3 +1,4 @@ Subset,Size,Features test_subset_1,5,2;4;9;11;14 test_subset_2,6,2;3;4;5;9;11 +test_subset_3,4,12;7;9;11 diff --git a/tpot/builtins/dataset_selector.py b/tpot/builtins/dataset_selector.py index 763c61c1..728657be 100644 --- a/tpot/builtins/dataset_selector.py +++ b/tpot/builtins/dataset_selector.py @@ -49,10 +49,10 @@ def __init__(self, subset_list, sel_subset): should be seprated by ';' on the 3rd column of the file. The feature names in the files must match those in the (training and testing) dataset. 
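As a usage illustration of the reworked selector: the subset file below is the `tests/subset_test.csv` fixture added in this patch, and the random DataFrame is only a stand-in whose column names happen to match the feature names listed there:

```python
# Hedged usage sketch: keep only the columns listed under 'test_subset_1'.
import numpy as np
import pandas as pd
from tpot.builtins import DatasetSelector

rng = np.random.RandomState(42)
X = pd.DataFrame(rng.rand(20, 15), columns=[str(i) for i in range(15)])

ds = DatasetSelector(subset_list='tests/subset_test.csv',
                     sel_subset='test_subset_1')
X_sub = ds.fit(X).transform(X)

print(ds.feat_list)  # intersection of the subset's features and X's columns
print(X_sub.shape)   # (20, 5)
```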
- sel_subset: int or string or list + sel_subset: int or string or list or tuple int: index of subset in subset file string: subset name of subset - list: list of int or string for indexs or subset names + list or tuple: list of int or string for indexs or subset names Returns ------- None @@ -80,15 +80,16 @@ def fit(self, X, y=None): if isinstance(self.sel_subset, int): self.sel_subset_name = subset_df.index[self.sel_subset] - elif isinstance(self.sel_subset, list): + elif isinstance(self.sel_subset, str): + self.sel_subset_name = self.sel_subset + else: # list or tuple self.sel_subset_name = [] for s in self.sel_subset: if isinstance(s, int): self.sel_subset_name.append(subset_df.index[s]) else: self.sel_subset_name.append(s) - else: # self.sel_subset is a string - self.sel_subset_name = self.sel_subset + sel_features = subset_df.loc[self.sel_subset_name, 'Features'] if not isinstance(sel_features, str): From d9a668c5086a05199db70f1cdff8b7a0acb5d9cb Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Thu, 6 Dec 2018 10:16:17 -0500 Subject: [PATCH 45/60] add sorted feat_list --- tpot/builtins/dataset_selector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpot/builtins/dataset_selector.py b/tpot/builtins/dataset_selector.py index 728657be..096cd5e8 100644 --- a/tpot/builtins/dataset_selector.py +++ b/tpot/builtins/dataset_selector.py @@ -102,7 +102,7 @@ def fit(self, X, y=None): elif isinstance(X, np.ndarray): # use index self.feature_names = list(range(X.shape[1])) sel_uniq_features = [int(val) for val in sel_uniq_features] - self.feat_list = list(set(sel_uniq_features).intersection(set(self.feature_names))) + self.feat_list = sorted(list(set(sel_uniq_features).intersection(set(self.feature_names)))) if not len(self.feat_list): raise ValueError('No feature is found on the subset list!') return self From 2587a1e040db905ef054cd97bc05494c48d71767 Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Thu, 6 Dec 2018 10:31:51 -0500 Subject: [PATCH 46/60] fix a unit test --- tests/export_tests.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tests/export_tests.py b/tests/export_tests.py index 89a2ce5d..3f7d4596 100644 --- a/tests/export_tests.py +++ b/tests/export_tests.py @@ -68,10 +68,8 @@ def test_export_random_ind(): pipeline = tpot_obj._toolbox.individual() expected_code = """import numpy as np import pandas as pd -from sklearn.ensemble import GradientBoostingClassifier -from sklearn.feature_selection import VarianceThreshold from sklearn.model_selection import train_test_split -from sklearn.pipeline import make_pipeline +from sklearn.naive_bayes import MultinomialNB # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) @@ -79,10 +77,7 @@ def test_export_random_ind(): training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'].values, random_state=39) -exported_pipeline = make_pipeline( - VarianceThreshold(threshold=0.05), - GradientBoostingClassifier(learning_rate=0.01, max_depth=5, max_features=0.9000000000000001, min_samples_leaf=11, min_samples_split=17, n_estimators=100, subsample=0.25) -) +exported_pipeline = MultinomialNB(alpha=0.1, fit_prior=True) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) From 334b9ee7edc7e60068e22362ceab8754f3ae455a Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Tue, 18 Dec 2018 11:02:53 -0500 
Subject: [PATCH 47/60] support eli5 --- tpot/builtins/dataset_selector.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tpot/builtins/dataset_selector.py b/tpot/builtins/dataset_selector.py index 096cd5e8..dd059834 100644 --- a/tpot/builtins/dataset_selector.py +++ b/tpot/builtins/dataset_selector.py @@ -99,10 +99,14 @@ def fit(self, X, y=None): if isinstance(X, pd.DataFrame): # use columns' names self.feature_names = list(X.columns.values) + self.feat_list = sorted(list(set(sel_uniq_features).intersection(set(self.feature_names)))) + self.feat_list_idx = [list(X.columns).index(feat_name) for feat_name in self.feat_list] elif isinstance(X, np.ndarray): # use index self.feature_names = list(range(X.shape[1])) sel_uniq_features = [int(val) for val in sel_uniq_features] - self.feat_list = sorted(list(set(sel_uniq_features).intersection(set(self.feature_names)))) + self.feat_list = sorted(list(set(sel_uniq_features).intersection(set(self.feature_names)))) + self.feat_list_idx = self.feat_list + if not len(self.feat_list): raise ValueError('No feature is found on the subset list!') return self @@ -123,6 +127,6 @@ def transform(self, X): if isinstance(X, pd.DataFrame): X_transformed = X[self.feat_list].values elif isinstance(X, np.ndarray): - X_transformed = X[:, self.feat_list] + X_transformed = X[:, self.feat_list_idx] return X_transformed.astype(np.float64) From 814a3a23e0aad442be789a68ceae596d26a963e0 Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Thu, 14 Mar 2019 14:57:30 -0400 Subject: [PATCH 48/60] fix a bug --- tpot/gp_deap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpot/gp_deap.py b/tpot/gp_deap.py index c342d511..f170c4fc 100644 --- a/tpot/gp_deap.py +++ b/tpot/gp_deap.py @@ -233,7 +233,7 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar, for gen in range(1, ngen + 1): # after each population save a periodic pipeline if per_generation_function is not None: - per_generation_function() + per_generation_function(gen) # Vary the population offspring = varOr(population, toolbox, lambda_, cxpb, mutpb) From a0836f59a732e531cf9e24ad2a813036b677531a Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Mon, 18 Mar 2019 09:22:03 -0400 Subject: [PATCH 49/60] fix a bug when sample size is less than 50 --- tpot/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tpot/base.py b/tpot/base.py index 29bcd132..d3ba05d7 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -647,7 +647,9 @@ def fit(self, features, target, sample_weight=None, groups=None): features, target = self._check_dataset(features, target, sample_weight) - self.pretest_X, _, self.pretest_y, _ = train_test_split(features, target, train_size=min(50, features.shape[0]), test_size=None, random_state=self.random_state) + self.pretest_X, _, self.pretest_y, _ = train_test_split(features, + target, train_size=min(50, int(0.9*features.shape[0])), + test_size=None, random_state=self.random_state) # Randomly collect a subsample of training samples for pipeline optimization process. 
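# (Editor's illustration, not part of the patch above: the new train_size caps the pretest
# split at 90% of the data as well as at 50 rows, so a small dataset no longer asks for more
# rows than it can spare -- e.g. 30 samples give min(50, int(0.9*30)) = min(50, 27) = 27
# pretest rows instead of all 30, which is what "fix a bug when sample size is less than 50"
# refers to; datasets with 56 or more samples keep the old cap of 50.)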
if self.subsample < 1.0: From 40b97c344c6facf81c2ac7abe383647e74dc171e Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Thu, 4 Apr 2019 09:16:30 -0400 Subject: [PATCH 50/60] update version number --- tpot/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tpot/_version.py b/tpot/_version.py index ab687bdd..cc571e81 100644 --- a/tpot/_version.py +++ b/tpot/_version.py @@ -23,4 +23,4 @@ """ -__version__ = '0.9.6' +__version__ = '0.10.0' From 9173699453fc87772f52a556f207f4852b51690c Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Thu, 4 Apr 2019 09:41:54 -0400 Subject: [PATCH 51/60] add 3 unit tests for template --- tests/tpot_tests.py | 61 +++++++++++++++++++++++++++++++++++++++++++++ tpot/base.py | 2 +- 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/tests/tpot_tests.py b/tests/tpot_tests.py index ba76043f..d4091d86 100644 --- a/tests/tpot_tests.py +++ b/tests/tpot_tests.py @@ -57,6 +57,8 @@ from sklearn.model_selection import train_test_split, cross_val_score, GroupKFold from sklearn.externals.joblib import Memory from sklearn.metrics import make_scorer, roc_auc_score +from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin +from sklearn.feature_selection.base import SelectorMixin from deap import creator, gp from deap.tools import ParetoFront from nose.tools import assert_raises, assert_not_equal, assert_greater_equal, assert_equal, assert_in @@ -597,6 +599,65 @@ def test_sample_weight_func(): assert np.allclose(known_score, score) +def test_template_1(): + """Assert that TPOT template option generates pipeline when each step is a type of operator.""" + + tpot_obj = TPOTClassifier( + random_state=42, + verbosity=0, + template='Selector-Transformer-Classifier' + ) + tpot_obj._fit_init() + pop = tpot_obj._toolbox.population(n=10) + for deap_pipeline in pop: + operator_count = tpot_obj._operator_count(deap_pipeline) + sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline) + assert operator_count == 3 + assert issubclass(sklearn_pipeline.steps[0][1].__class__, SelectorMixin) + assert issubclass(sklearn_pipeline.steps[1][1].__class__, TransformerMixin) + assert issubclass(sklearn_pipeline.steps[2][1].__class__, ClassifierMixin) + + +def test_template_2(): + """Assert that TPOT template option generates pipeline when each step is operator type with a duplicate main type.""" + + tpot_obj = TPOTClassifier( + random_state=42, + verbosity=0, + template='Selector-Selector-Transformer-Classifier' + ) + tpot_obj._fit_init() + pop = tpot_obj._toolbox.population(n=10) + for deap_pipeline in pop: + operator_count = tpot_obj._operator_count(deap_pipeline) + sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline) + assert operator_count == 4 + assert issubclass(sklearn_pipeline.steps[0][1].__class__, SelectorMixin) + assert issubclass(sklearn_pipeline.steps[1][1].__class__, SelectorMixin) + assert issubclass(sklearn_pipeline.steps[2][1].__class__, TransformerMixin) + assert issubclass(sklearn_pipeline.steps[3][1].__class__, ClassifierMixin) + + +def test_template_3(): + """Assert that TPOT template option generates pipeline when one of steps is a specific operator.""" + + tpot_obj = TPOTClassifier( + random_state=42, + verbosity=0, + template='SelectPercentile-Transformer-Classifier' + ) + tpot_obj._fit_init() + pop = tpot_obj._toolbox.population(n=10) + for deap_pipeline in pop: + operator_count = tpot_obj._operator_count(deap_pipeline) + sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline) + assert 
operator_count == 3 + assert sklearn_pipeline.steps[0][0] == 'SelectPercentile'.lower() + assert issubclass(sklearn_pipeline.steps[0][1].__class__, SelectorMixin) + assert issubclass(sklearn_pipeline.steps[1][1].__class__, TransformerMixin) + assert issubclass(sklearn_pipeline.steps[2][1].__class__, ClassifierMixin) + + def test_fit_GroupKFold(): """Assert that TPOT properly handles the group parameter when using GroupKFold.""" # This check tests if the darker MNIST images would generalize to the lighter ones. diff --git a/tpot/base.py b/tpot/base.py index d3ba05d7..efcc2413 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -648,7 +648,7 @@ def fit(self, features, target, sample_weight=None, groups=None): self.pretest_X, _, self.pretest_y, _ = train_test_split(features, - target, train_size=min(50, int(0.9*features.shape[0])), + target, train_size=min(50, int(0.9*features.shape[0])), test_size=None, random_state=self.random_state) # Randomly collect a subsample of training samples for pipeline optimization process. From 0219032a7c191f89ce600110e81ecf7aea667255 Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Thu, 4 Apr 2019 10:07:34 -0400 Subject: [PATCH 52/60] make dataselector selctor as selector instead of transformer --- tests/dataset_selector_tests.py | 11 +++++++++++ tpot/builtins/dataset_selector.py | 22 ++++++++++++++++++++-- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/tests/dataset_selector_tests.py b/tests/dataset_selector_tests.py index d4c3b4b5..9f4c6ced 100644 --- a/tests/dataset_selector_tests.py +++ b/tests/dataset_selector_tests.py @@ -85,3 +85,14 @@ def test_DatasetSelector_5(): assert transformed_X.shape[1] != test_X.shape[1] assert transformed_X.shape[1] == 5 assert np.array_equal(transformed_X, test_X[ds.feat_list].values) + +def test_DatasetSelector_6(): + """Assert that the _get_support_mask function returns correct mask.""" + ds = DatasetSelector(subset_list="tests/subset_test.csv", sel_subset="test_subset_1") + ds.fit(test_X, y=None) + mask = ds._get_support_mask() + get_mask = ds.get_support() + + assert mask.shape[0] == 30 + assert np.count_nonzero(mask) == 5 + assert np.array_equal(get_mask, mask) diff --git a/tpot/builtins/dataset_selector.py b/tpot/builtins/dataset_selector.py index dd059834..e5e5adc2 100644 --- a/tpot/builtins/dataset_selector.py +++ b/tpot/builtins/dataset_selector.py @@ -24,10 +24,12 @@ import numpy as np import pandas as pd import os, os.path -from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.base import BaseEstimator +from sklearn.feature_selection.base import SelectorMixin +from sklearn.utils.validation import check_is_fitted -class DatasetSelector(BaseEstimator, TransformerMixin): +class DatasetSelector(BaseEstimator, SelectorMixin): """Select predefined data subsets.""" @property @@ -130,3 +132,19 @@ def transform(self, X): X_transformed = X[:, self.feat_list_idx] return X_transformed.astype(np.float64) + + def _get_support_mask(self): + """ + Get the boolean mask indicating which features are selected + Returns + ------- + support : boolean array of shape [# input features] + An element is True iff its corresponding feature is selected for + retention. 
+ """ + check_is_fitted(self, 'feat_list_idx') + n_features = len(self.feature_names) + mask = np.zeros(n_features, dtype=bool) + mask[np.asarray(self.feat_list_idx)] = True + + return mask From 4ba9a076dd45b61bae1271a687e19374f3320dc8 Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Thu, 11 Apr 2019 09:38:53 -0400 Subject: [PATCH 53/60] rename DS to FeatureSetSelector --- tests/dataset_selector_tests.py | 26 +++++++++++++------------- tpot/base.py | 2 +- tpot/builtins/__init__.py | 2 +- tpot/builtins/dataset_selector.py | 8 ++++---- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/tests/dataset_selector_tests.py b/tests/dataset_selector_tests.py index 9f4c6ced..a02a22a6 100644 --- a/tests/dataset_selector_tests.py +++ b/tests/dataset_selector_tests.py @@ -25,15 +25,15 @@ import numpy as np import pandas as pd -from tpot.builtins import DatasetSelector +from tpot.builtins import FeatureSetSelector test_data = pd.read_csv("tests/tests.csv") test_X = test_data.drop("class", axis=1) -def test_DatasetSelector_1(): +def test_FeatureSetSelector_1(): """Assert that the StackingEstimator returns transformed X based on test feature list 1.""" - ds = DatasetSelector(subset_list="tests/subset_test.csv", sel_subset="test_subset_1") + ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset="test_subset_1") ds.fit(test_X, y=None) transformed_X = ds.transform(test_X) @@ -42,9 +42,9 @@ def test_DatasetSelector_1(): assert transformed_X.shape[1] == 5 assert np.array_equal(transformed_X, test_X[ds.feat_list].values) -def test_DatasetSelector_2(): +def test_FeatureSetSelector_2(): """Assert that the StackingEstimator returns transformed X based on test feature list 2.""" - ds = DatasetSelector(subset_list="tests/subset_test.csv", sel_subset="test_subset_2") + ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset="test_subset_2") ds.fit(test_X, y=None) transformed_X = ds.transform(test_X) @@ -53,9 +53,9 @@ def test_DatasetSelector_2(): assert transformed_X.shape[1] == 6 assert np.array_equal(transformed_X, test_X[ds.feat_list].values) -def test_DatasetSelector_3(): +def test_FeatureSetSelector_3(): """Assert that the StackingEstimator returns transformed X based on 2 subsets' names""" - ds = DatasetSelector(subset_list="tests/subset_test.csv", sel_subset=["test_subset_1", "test_subset_2"]) + ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset=["test_subset_1", "test_subset_2"]) ds.fit(test_X, y=None) transformed_X = ds.transform(test_X) @@ -64,9 +64,9 @@ def test_DatasetSelector_3(): assert transformed_X.shape[1] == 7 assert np.array_equal(transformed_X, test_X[ds.feat_list].values) -def test_DatasetSelector_4(): +def test_FeatureSetSelector_4(): """Assert that the StackingEstimator returns transformed X based on 2 subsets' indexs""" - ds = DatasetSelector(subset_list="tests/subset_test.csv", sel_subset=[0, 1]) + ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset=[0, 1]) ds.fit(test_X, y=None) transformed_X = ds.transform(test_X) @@ -75,9 +75,9 @@ def test_DatasetSelector_4(): assert transformed_X.shape[1] == 7 assert np.array_equal(transformed_X, test_X[ds.feat_list].values) -def test_DatasetSelector_5(): +def test_FeatureSetSelector_5(): """Assert that the StackingEstimator returns transformed X seleced based on test feature list 1's index.""" - ds = DatasetSelector(subset_list="tests/subset_test.csv", sel_subset=0) + ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset=0) ds.fit(test_X, y=None) 
transformed_X = ds.transform(test_X) @@ -86,9 +86,9 @@ def test_DatasetSelector_5(): assert transformed_X.shape[1] == 5 assert np.array_equal(transformed_X, test_X[ds.feat_list].values) -def test_DatasetSelector_6(): +def test_FeatureSetSelector_6(): """Assert that the _get_support_mask function returns correct mask.""" - ds = DatasetSelector(subset_list="tests/subset_test.csv", sel_subset="test_subset_1") + ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset="test_subset_1") ds.fit(test_X, y=None) mask = ds._get_support_mask() get_mask = ds.get_support() diff --git a/tpot/base.py b/tpot/base.py index efcc2413..390e0792 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -451,7 +451,7 @@ def _add_operators(self): self._pset.addPrimitive(CombineDFs(), [step_in_type, step_in_type], step_in_type) elif main_type.count(step): # if the step is a main type for operator in self.operators: - if operator.__name__ != 'DatasetSelector': # dataset selector is not considered as a main type + if operator.__name__ != 'FeatureSetSelector': # dataset selector is not considered as a main type arg_types = operator.parameter_types()[0][1:] if operator.type() == step: p_types = ([step_in_type] + arg_types, step_ret_type) diff --git a/tpot/builtins/__init__.py b/tpot/builtins/__init__.py index e22d080f..1a7367f5 100644 --- a/tpot/builtins/__init__.py +++ b/tpot/builtins/__init__.py @@ -28,4 +28,4 @@ from .stacking_estimator import StackingEstimator from .one_hot_encoder import OneHotEncoder, auto_select_categorical_features, _transform_selected from .feature_transformers import CategoricalSelector, ContinuousSelector -from .dataset_selector import DatasetSelector +from .dataset_selector import FeatureSetSelector diff --git a/tpot/builtins/dataset_selector.py b/tpot/builtins/dataset_selector.py index e5e5adc2..428a60ac 100644 --- a/tpot/builtins/dataset_selector.py +++ b/tpot/builtins/dataset_selector.py @@ -29,8 +29,8 @@ from sklearn.utils.validation import check_is_fitted -class DatasetSelector(BaseEstimator, SelectorMixin): - """Select predefined data subsets.""" +class FeatureSetSelector(BaseEstimator, SelectorMixin): + """Select predefined feature subsets.""" @property def __name__(self): @@ -38,7 +38,7 @@ def __name__(self): return self.__class__.__name__ def __init__(self, subset_list, sel_subset): - """Create a DatasetSelector object. + """Create a FeatureSetSelector object. Parameters ---------- @@ -64,7 +64,7 @@ def __init__(self, subset_list, sel_subset): self.sel_subset = sel_subset def fit(self, X, y=None): - """Fit DatasetSelector for feature selection + """Fit FeatureSetSelector for feature selection Parameters ---------- From 08aee66dacfce29e1156b823269bf52add9aab70 Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Thu, 11 Apr 2019 10:33:11 -0400 Subject: [PATCH 54/60] refine n_jobs parameter to support n_jobs < -2 #846 --- docs_sources/api.md | 4 ++-- docs_sources/using.md | 6 +++--- tests/tpot_tests.py | 19 +++++++++++++++++++ tpot/base.py | 21 +++++++++++++++------ tpot/driver.py | 3 ++- 5 files changed, 41 insertions(+), 12 deletions(-) diff --git a/docs_sources/api.md b/docs_sources/api.md index 55c95646..4ffecb2b 100644 --- a/docs_sources/api.md +++ b/docs_sources/api.md @@ -107,7 +107,7 @@ Setting subsample=0.5 tells TPOT to use a random subsample of half of t
Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process.

-Setting n_jobs=-1 will use as many cores as available on the computer. Beware that using multiple processes on the same machine may cause memory issues for large datasets +Setting n_jobs=-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets.
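(Editor's illustration of the rule stated above, not part of the patch; `effective_n_jobs` is a hypothetical helper name, but the logic mirrors the check this series adds to `tpot/base.py`.)

```Python
from multiprocessing import cpu_count

def effective_n_jobs(n_jobs):
    # -1 means all CPUs; for n_jobs < -1 the count is cpu_count() + 1 + n_jobs; 0 is invalid
    if n_jobs == 0:
        raise ValueError('n_jobs == 0 has no meaning')
    if n_jobs < 0:
        return cpu_count() + 1 + n_jobs
    return n_jobs

# On an 8-CPU machine: effective_n_jobs(-1) == 8, effective_n_jobs(-2) == 7
```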
max_time_mins: integer or None, optional (default=None) @@ -590,7 +590,7 @@ Setting subsample=0.5 tells TPOT to use a random subsample of half of t
Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process.

-Setting n_jobs=-1 will use as many cores as available on the computer. Beware that using multiple processes on the same machine may cause memory issues for large datasets +Setting n_jobs=-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets.
max_time_mins: integer or None, optional (default=None) diff --git a/docs_sources/using.md b/docs_sources/using.md index c47ee7ab..2a8c27da 100644 --- a/docs_sources/using.md +++ b/docs_sources/using.md @@ -240,7 +240,7 @@ See the section on
scoring functions for more d Any positive integer or -1 Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process.

-Assigning this to -1 will use as many cores as available on the computer. +Assigning this to -1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. -maxtime @@ -589,10 +589,10 @@ For large problems or working on Jupyter notebook, we highly recommend that you The [dask-examples binder](https://mybinder.org/v2/gh/dask/dask-examples/master?filepath=machine-learning%2Ftpot.ipynb) has a runnable example with a small dask cluster. -To use your Dask cluster to fit a TPOT model, specify the ``use_dask`` keyword when you create the TPOT estimator. **Note: if `use_dask=True`, TPOT will use as many cores as available on the your Dask cluster regardless of whether `n_jobs` is specified.** +To use your Dask cluster to fit a TPOT model, specify the ``use_dask`` keyword when you create the TPOT estimator. **Note: if `use_dask=True`, TPOT will use as many cores as available on your Dask cluster. If `n_jobs` is specified, then it will control the chunk size (10*`n_jobs` if it is less than offspring size) of parallel training.** ```python -estimator = TPOTEstimator(use_dask=True) +estimator = TPOTEstimator(use_dask=True, n_jobs=-1) ``` This will use all the workers on your cluster to do the training, and use [Dask-ML's pipeline rewriting](https://dask-ml.readthedocs.io/en/latest/hyper-parameter-search.html#avoid-repeated-work) to avoid re-fitting estimators multiple times on the same set of data. diff --git a/tests/tpot_tests.py b/tests/tpot_tests.py index d4091d86..d2fb4d80 100644 --- a/tests/tpot_tests.py +++ b/tests/tpot_tests.py @@ -283,6 +283,8 @@ def test_init_n_jobs(): """Assert that the TPOT init stores current number of processes.""" tpot_obj = TPOTClassifier(n_jobs=2) assert tpot_obj.n_jobs == 2 + tpot_obj._fit_init() + assert tpot_obj._n_jobs == 2 tpot_obj = TPOTClassifier(n_jobs=-1) assert tpot_obj.n_jobs == -1 @@ -290,6 +292,23 @@ assert tpot_obj._n_jobs == cpu_count() +def test_init_n_jobs_2(): + """Assert that the TPOT init assigns n_jobs correctly when n_jobs is negative.""" + tpot_obj = TPOTClassifier(n_jobs=-2) + assert tpot_obj.n_jobs == -2 + + tpot_obj._fit_init() + assert tpot_obj._n_jobs == cpu_count() - 1 + + +def test_init_n_jobs_3(): + """Assert that the TPOT init raises ValueError if n_jobs=0.""" + tpot_obj = TPOTClassifier(n_jobs=0) + assert tpot_obj.n_jobs == 0 + + assert_raises(ValueError, tpot_obj._fit_init) + + def test_timeout(): """Assert that _wrapped_cross_val_score return Timeout in a time limit.""" tpot_obj = TPOTRegressor(scoring='neg_mean_squared_error') diff --git a/tpot/base.py b/tpot/base.py index 390e0792..11375b56 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -173,7 +173,8 @@ n_jobs: int, optional (default: 1) Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process. Assigning this to -1 will use as many cores as available - on the computer. + on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. + Thus for n_jobs = -2, all CPUs but one are used. max_time_mins: int, optional (default: None) How many minutes TPOT has to optimize the pipeline.
If provided, this setting will override the "generations" parameter and allow @@ -205,7 +206,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, TPOT uses a configuration dictionary with a one-hot-encoder and the operators normally included in TPOT that also support sparse matrices. template: string (default: 'RandomTree') - A template for pipeline structure + A template for predefined pipeline structure. warm_start: bool, optional (default: False) Flag indicating whether the TPOT instance will reuse the population from previous calls to fit(). @@ -596,8 +597,12 @@ def _fit_init(self): 'The subsample ratio of the training instance must be in the range (0.0, 1.0].' ) - if self.n_jobs == -1: - self._n_jobs = cpu_count() + if self.n_jobs == 0: + raise ValueError( + 'The value 0 of n_jobs is invalid.' + ) + elif self.n_jobs < 0: + self._n_jobs = cpu_count() + 1 + self.n_jobs else: self._n_jobs = self.n_jobs @@ -1321,8 +1326,12 @@ def _evaluate_individuals(self, population, features, target, sample_weight=None result_score_list = self._update_val(val, result_score_list) else: # chunk size for pbar update - # chunk size is min of cpu_count * 2 and n_jobs * 4 - chunk_size = min(cpu_count()*2, self._n_jobs*4) + if self.use_dask: + # chunk size is min of _lambda and n_jobs * 10 + chunk_size = min(self._lambda, self._n_jobs*10) + else: + # chunk size is min of cpu_count * 2 and n_jobs * 4 + chunk_size = min(cpu_count()*2, self._n_jobs*4) for chunk_idx in range(0, len(sklearn_pipeline_list), chunk_size): self._stop_by_max_time_mins() if self.use_dask: diff --git a/tpot/driver.py b/tpot/driver.py index d854d183..88d6b550 100755 --- a/tpot/driver.py +++ b/tpot/driver.py @@ -296,7 +296,8 @@ def _get_arg_parser(): help=( 'Number of CPUs for evaluating pipelines in parallel during the ' 'TPOT optimization process. Assigning this to -1 will use as many ' - 'cores as available on the computer.' + 'cores as available on the computer. For n_jobs below -1, ' + '(n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used.' ) ) From a2264b0f43e7ac980430c53e247b2f446b1ff180 Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Thu, 11 Apr 2019 13:54:23 -0400 Subject: [PATCH 55/60] refine template docs --- tpot/base.py | 25 ++++++++++++++++--------- tpot/driver.py | 10 +++++++++- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/tpot/base.py b/tpot/base.py index 11375b56..f96ffcab 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -205,8 +205,16 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, String 'TPOT sparse': TPOT uses a configuration dictionary with a one-hot-encoder and the operators normally included in TPOT that also support sparse matrices. - template: string (default: 'RandomTree') - A template for predefined pipeline structure. + template: string (default: "RandomTree") + Template of predefined pipeline structure. The option specify a desired structure + for the machine learning pipeline evaluated in tpot. So far this option only supports + linear pipeline structure. Each step in the pipeline should be a main class of operators + (Selector, Transformer, Classifier or Regressor) or a specific operator + (e.g. SelectPercentile) defined in TPOT operator configuration. If one step is a main class, + TPOT will randomly assign all subclass operators (subclasses of SelectorMixin, + TransformerMixin, ClassifierMixin or RegressorMixin in scikit-learn) to that step. + Steps in the template are splited by "-", e.g. 
"SelectPercentile-Transformer-Classifier". + By default value of template is "RandomTree", TPOT generates tree-based pipeline randomly. warm_start: bool, optional (default: False) Flag indicating whether the TPOT instance will reuse the population from previous calls to fit(). @@ -416,7 +424,7 @@ def _add_operators(self): main_type = ["Classifier", "Regressor", "Selector", "Transformer"] ret_types = [] self.op_list = [] - if self.template == "RandomTree": # default + if self.template == "RandomTree": # default pipeline structure step_in_type = np.ndarray step_ret_type = Output_Array for operator in self.operators: @@ -452,12 +460,11 @@ def _add_operators(self): self._pset.addPrimitive(CombineDFs(), [step_in_type, step_in_type], step_in_type) elif main_type.count(step): # if the step is a main type for operator in self.operators: - if operator.__name__ != 'FeatureSetSelector': # dataset selector is not considered as a main type - arg_types = operator.parameter_types()[0][1:] - if operator.type() == step: - p_types = ([step_in_type] + arg_types, step_ret_type) - self._pset.addPrimitive(operator, *p_types) - self._import_hash_and_add_terminals(operator, arg_types) + arg_types = operator.parameter_types()[0][1:] + if operator.type() == step: + p_types = ([step_in_type] + arg_types, step_ret_type) + self._pset.addPrimitive(operator, *p_types) + self._import_hash_and_add_terminals(operator, arg_types) else: # is the step is a specific operator for operator in self.operators: arg_types = operator.parameter_types()[0][1:] diff --git a/tpot/driver.py b/tpot/driver.py index 88d6b550..79972fca 100755 --- a/tpot/driver.py +++ b/tpot/driver.py @@ -362,7 +362,15 @@ def _get_arg_parser(): default='RandomTree', type=str, help=( - 'Template for pipeline structure' + 'Template of predefined pipeline structure. The option specify a desired structure' + 'for the machine learning pipeline evaluated in tpot. So far this option only supports' + 'linear pipeline structure. Each step in the pipeline should be a main class of operators' + '(Selector, Transformer, Classifier or Regressor) or a specific operator' + '(e.g. SelectPercentile) defined in TPOT operator configuration. If one step is a main class,' + 'TPOT will randomly assign all subclass operators (subclasses of SelectorMixin,' + 'TransformerMixin, ClassifierMixin or RegressorMixin in scikit-learn) to that step.' + 'Steps in the template are splited by "-", e.g. "SelectPercentile-Transformer-Classifier".' + 'By default value of template is "RandomTree", TPOT generates tree-based pipeline randomly.' ) ) From 7823ab597368b2d8608f4bcc2451eed9c1aea9ff Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Thu, 11 Apr 2019 13:54:39 -0400 Subject: [PATCH 56/60] add teimplate examples --- docs_sources/api.md | 20 ++++++++++++++++++++ docs_sources/using.md | 20 +++++++++++++++++--- tpot/base.py | 4 ++-- tpot/driver.py | 4 ++-- 4 files changed, 41 insertions(+), 7 deletions(-) diff --git a/docs_sources/api.md b/docs_sources/api.md index 4ffecb2b..1613e69c 100644 --- a/docs_sources/api.md +++ b/docs_sources/api.md @@ -7,6 +7,7 @@ subsample=1.0, n_jobs=1, max_time_mins=None, max_eval_time_mins=5, random_state=None, config_dict=None, + template="RandomTree", warm_start=False, memory=None, use_dask=False, @@ -146,6 +147,15 @@ Possible inputs are: See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. 
+template: string (default="RandomTree") +
+Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. +

+So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) in scikit-learn) to that step. Steps in the template are splited by "-", e.g. "SelectPercentile-Transformer-Classifier". By default value of template is "RandomTree", TPOT generates tree-based pipeline randomly. + +See the template option in tpot section for more details. +
+ warm_start: boolean, optional (default=False)
Flag indicating whether the TPOT instance will reuse the population from previous calls to fit(). @@ -489,6 +499,7 @@ Does not return anything subsample=1.0, n_jobs=1, max_time_mins=None, max_eval_time_mins=5, random_state=None, config_dict=None, + template="RandomTree", warm_start=False, memory=None, use_dask=False, @@ -629,6 +640,15 @@ Possible inputs are: See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations.
+template: string (default="RandomTree") +
+Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. +

+So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are splited by "-", e.g. "SelectPercentile-Transformer-Regressor". By default value of template is "RandomTree", TPOT generates tree-based pipeline randomly. + +See the template option in tpot section for more details. +
+ warm_start: boolean, optional (default=False)
Flag indicating whether the TPOT instance will reuse the population from previous calls to fit(). diff --git a/docs_sources/using.md b/docs_sources/using.md index 2a8c27da..46af49a5 100644 --- a/docs_sources/using.md +++ b/docs_sources/using.md @@ -285,9 +285,7 @@ See the built-in configurations -template TEMPLATE String -Template for pipeline structure. -

-For example: -template Selector-Transformer-Classifier +Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are splited by "-", e.g. "SelectPercentile-Transformer-Classifier". By default value of template is "RandomTree", TPOT generates tree-based pipeline randomly. -memory @@ -534,6 +532,22 @@ For more detailed examples of how to customize TPOT's operator configuration, se Note that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers. + +# Template option in TPOT + +Template option is added into TPOT since v0.10.0. It provides a way to specify a desired structure for machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. Current implementation only supports linear pipelines. + +Below is a simple example to use `template` option. The pipelines generated/evaluated in TPOT will follow this structure: 1st step is a feature selector (a subclass of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17)), 2nd step is a feature transformer (a subclass of [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html)) and 3rd step is a classifier for classification (a subclass of [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html)). The last step must be `Classifier` for `TPOTClassifier`'s template but `Regressor` for `TPOTRegressor`. **Note: although `SelectorMixin` is a subclass of `TransformerMixin` in scikit-learn, `Transformer` in this option excludes those subclasses of `SelectorMixin`.** + +```Python +tpot_obj = TPOTClassifier( + template='Selector-Transformer-Classifier' + ) +``` + +If a specific operator, e.g. `SelectPercentile`, is preferred for the 1st step of the pipeline, the template can be defined like 'SelectPercentile-Transformer-Classifier'. + + # Pipeline caching in TPOT With the `memory` parameter, pipelines can cache the results of each transformer after fitting them. This feature is used to avoid repeated computation by transformers within a pipeline if the parameters and input data are identical to another fitted pipeline during optimization process.
TPOT allows users to specify a custom directory path or [`sklearn.external.joblib.Memory`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/externals/joblib/memory.py#L847) in case they want to re-use the memory cache in future TPOT runs (or a `warm_start` run). diff --git a/tpot/base.py b/tpot/base.py index f96ffcab..9c007c24 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -206,8 +206,8 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, TPOT uses a configuration dictionary with a one-hot-encoder and the operators normally included in TPOT that also support sparse matrices. template: string (default: "RandomTree") - Template of predefined pipeline structure. The option specify a desired structure - for the machine learning pipeline evaluated in tpot. So far this option only supports + Template of predefined pipeline structure. The option is for specifying a desired structure + for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier or Regressor) or a specific operator (e.g. SelectPercentile) defined in TPOT operator configuration. If one step is a main class, diff --git a/tpot/driver.py b/tpot/driver.py index 79972fca..cb4b8ef2 100755 --- a/tpot/driver.py +++ b/tpot/driver.py @@ -362,8 +362,8 @@ def _get_arg_parser(): default='RandomTree', type=str, help=( - 'Template of predefined pipeline structure. The option specify a desired structure' - 'for the machine learning pipeline evaluated in tpot. So far this option only supports' + 'Template of predefined pipeline structure. The option is for specifying a desired structure' + 'for the machine learning pipeline evaluated in TPOT. So far this option only supports' 'linear pipeline structure. Each step in the pipeline should be a main class of operators' '(Selector, Transformer, Classifier or Regressor) or a specific operator' '(e.g. SelectPercentile) defined in TPOT operator configuration. If one step is a main class,' From 83c641da91d205cef403b980633fc4a31726f48d Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Thu, 11 Apr 2019 14:18:40 -0400 Subject: [PATCH 57/60] add FFS example --- docs_sources/using.md | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/docs_sources/using.md b/docs_sources/using.md index 46af49a5..dbe9143e 100644 --- a/docs_sources/using.md +++ b/docs_sources/using.md @@ -285,7 +285,10 @@ See the
built-in configurations -template TEMPLATE String -Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are splited by "-", e.g. "SelectPercentile-Transformer-Classifier". By default value of template is "RandomTree", TPOT generates tree-based pipeline randomly. +Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are splited by "-", e.g. "SelectPercentile-Transformer-Classifier". By default value of template is "RandomTree", TPOT generates tree-based pipeline randomly. + +See the template option in tpot section for more details. + -memory @@ -535,7 +538,7 @@ Note that you must have all of the corresponding packages for the operators inst # Template option in TPOT -Template option is added into TPOT since v0.10.0. It provides a way to specify a desired structure for machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. Current implementation only supports linear pipelines. +Template option is added into TPOT v0.10.0. It provides a way to specify a desired structure for machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. Current implementation only supports linear pipelines. Below is a simple example to use `template` option. 
The pipelines generated/evaluated in TPOT will follow this structure: 1st step is a feature selector (a subclass of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17)), 2nd step is a feature transformer (a subclass of [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html)) and 3rd step is a classifier for classification (a subclass of [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html)). The last step must be `Classifier` for `TPOTClassifier`'s template but `Regressor` for `TPOTRegressor`. **Note: although `SelectorMixin` is a subclass of `TransformerMixin` in scikit-learn, `Transformer` in this option excludes those subclasses of `SelectorMixin`.** ```Python tpot_obj = TPOTClassifier( template='Selector-Transformer-Classifier' ) ``` If a specific operator, e.g. `SelectPercentile`, is preferred for the 1st step of the pipeline, the template can be defined like 'SelectPercentile-Transformer-Classifier'. +# FeatureSetSelector in TPOT + +`FeatureSetSelector` is a special new operator in TPOT. This operator enables feature selection based on *a priori* expert knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) sets based on GO (Gene Ontology) terms or annotated gene sets from the Molecular Signatures Database ([MSigDB](http://software.broadinstitute.org/gsea/msigdb/index.jsp)) to reduce dimensions and save computing time. Below is an example of using this operator in TPOT. +```Python +from tpot import TPOTClassifier +import numpy as np +import pandas as pd +from tpot.config import classifier_config_dict +test_data = pd.read_csv("https://raw.githubusercontent.com/weixuanfu/tpot/master/tests/tests.csv") +test_X = test_data.drop("class", axis=1) +test_y = test_data['class'] + +# add FeatureSetSelector into tpot configuration +classifier_config_dict['tpot.builtins.FeatureSetSelector'] = { + 'subset_list': ['https://raw.githubusercontent.com/weixuanfu/tpot/master/tests/subset_test.csv'], + 'sel_subset': [0,1] # select only one feature set; each value is the index of a subset in the file above + #'sel_subset': list(combinations(range(3), 2)) # select two feature sets (needs `from itertools import combinations`) +} + + +tpot = TPOTClassifier(generations=5, + population_size=50, verbosity=2, + template='FeatureSetSelector-Transformer-Classifier', + config_dict=classifier_config_dict) +tpot.fit(test_X, test_y) +``` + # Pipeline caching in TPOT With the `memory` parameter, pipelines can cache the results of each transformer after fitting them. This feature is used to avoid repeated computation by transformers within a pipeline if the parameters and input data are identical to another fitted pipeline during optimization process. TPOT allows users to specify a custom directory path or [`sklearn.external.joblib.Memory`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/externals/joblib/memory.py#L847) in case they want to re-use the memory cache in future TPOT runs (or a `warm_start` run).
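Editor's illustration (not part of the patch series): the `FeatureSetSelector` introduced above can also be used on its own, outside of a TPOT search. A minimal sketch, assuming a TPOT build that already contains these patches and the `tests/subset_test.csv` layout shown earlier, where `test_subset_1` lists features 2;4;9;11;14.

```Python
import numpy as np
from tpot.builtins import FeatureSetSelector

X = np.random.rand(20, 30)  # 30 unnamed features, so the integer-index branch of fit() is used
fss = FeatureSetSelector(
    subset_list="https://raw.githubusercontent.com/weixuanfu/tpot/master/tests/subset_test.csv",
    sel_subset="test_subset_1",  # features 2;4;9;11;14
)
fss.fit(X)
X_sel = fss.transform(X)  # shape (20, 5): columns 2, 4, 9, 11 and 14, cast to float64
mask = fss.get_support()  # boolean mask of length 30, True only for the kept columns
```

Because the class now derives from `SelectorMixin` (patch 52 above), `get_support` and the rest of scikit-learn's selector API behave the same way as for the built-in feature selectors.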
From 909981e068399c7986ad49162dd628096b1367f2 Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Thu, 11 Apr 2019 14:30:22 -0400 Subject: [PATCH 58/60] generate html pages --- docs/api/index.html | 24 +++++++++++++-- docs/index.html | 2 +- docs/search/search_index.json | 22 ++++++++++---- docs/sitemap.xml | 20 ++++++------- docs/using/index.html | 55 +++++++++++++++++++++++++++++++++-- docs_sources/using.md | 8 ++--- 6 files changed, 105 insertions(+), 26 deletions(-) diff --git a/docs/api/index.html b/docs/api/index.html index 38be9290..18f93e12 100644 --- a/docs/api/index.html +++ b/docs/api/index.html @@ -149,6 +149,7 @@

Classification

subsample=1.0, n_jobs=1, max_time_mins=None, max_eval_time_mins=5, random_state=None, config_dict=None, + template="RandomTree", warm_start=False, memory=None, use_dask=False, @@ -246,7 +247,7 @@

Classification

Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process.

-Setting n_jobs=-1 will use as many cores as available on the computer. Beware that using multiple processes on the same machine may cause memory issues for large datasets +Setting n_jobs=-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets.
max_time_mins: integer or None, optional (default=None) @@ -285,6 +286,15 @@

Classification

See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations.
+template: string (default="RandomTree") +
+Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. +

+So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) in scikit-learn) to that step. Steps in the template are splited by "-", e.g. "SelectPercentile-Transformer-Classifier". By default value of template is "RandomTree", TPOT generates tree-based pipeline randomly. + +See the template option in tpot section for more details. +
+ warm_start: boolean, optional (default=False)
Flag indicating whether the TPOT instance will reuse the population from previous calls to fit(). @@ -611,6 +621,7 @@

Regression

subsample=1.0, n_jobs=1, max_time_mins=None, max_eval_time_mins=5, random_state=None, config_dict=None, + template="RandomTree", warm_start=False, memory=None, use_dask=False, @@ -709,7 +720,7 @@

Regression

Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process.

-Setting n_jobs=-1 will use as many cores as available on the computer. Beware that using multiple processes on the same machine may cause memory issues for large datasets +Setting n_jobs=-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets.
max_time_mins: integer or None, optional (default=None) @@ -748,6 +759,15 @@

Regression

See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations.
+template: string (default="RandomTree") +
+Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. +

+So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are splited by "-", e.g. "SelectPercentile-Transformer-Regressor". By default value of template is "RandomTree", TPOT generates tree-based pipeline randomly. + +See the template option in tpot section for more details. +
+ warm_start: boolean, optional (default=False)
Flag indicating whether the TPOT instance will reuse the population from previous calls to fit(). diff --git a/docs/index.html b/docs/index.html index 014d4776..437e5eae 100644 --- a/docs/index.html +++ b/docs/index.html @@ -213,5 +213,5 @@ diff --git a/docs/search/search_index.json b/docs/search/search_index.json index 31701985..183bf6ca 100644 --- a/docs/search/search_index.json +++ b/docs/search/search_index.json @@ -12,7 +12,7 @@ }, { "location": "/using/", - "text": "What to expect from AutoML software\n\n\nAutomated machine learning (AutoML) takes a higher-level approach to machine learning than most practitioners are used to,\nso we've gathered a handful of guidelines on what to expect when running AutoML software such as TPOT.\n\n\nAutoML algorithms aren't intended to run for only a few minutes\n\n\n\nOf course, you \ncan\n run TPOT for only a few minutes and it will find a reasonably good pipeline for your dataset.\nHowever, if you don't run TPOT for long enough, it may not find the best possible pipeline for your dataset. It may even not\nfind any suitable pipeline at all, in which case a \nRuntimeError('A pipeline has not yet been optimized. Please call fit() first.')\n\nwill be raised.\nOften it is worthwhile to run multiple instances of TPOT in parallel for a long time (hours to days) to allow TPOT to thoroughly search\nthe pipeline space for your dataset.\n\n\nAutoML algorithms can take a long time to finish their search\n\n\n\nAutoML algorithms aren't as simple as fitting one model on the dataset; they are considering multiple machine learning algorithms\n(random forests, linear models, SVMs, etc.) in a pipeline with multiple preprocessing steps (missing value imputation, scaling,\nPCA, feature selection, etc.), the hyperparameters for all of the models and preprocessing steps, as well as multiple ways\nto ensemble or stack the algorithms within the pipeline.\n\n\nAs such, TPOT will take a while to run on larger datasets, but it's important to realize why. With the default TPOT settings\n(100 generations with 100 population size), TPOT will evaluate 10,000 pipeline configurations before finishing.\nTo put this number into context, think about a grid search of 10,000 hyperparameter combinations for a machine learning algorithm\nand how long that grid search will take. That is 10,000 model configurations to evaluate with 10-fold cross-validation,\nwhich means that roughly 100,000 models are fit and evaluated on the training data in one grid search.\nThat's a time-consuming procedure, even for simpler models like decision trees.\n\n\nTypical TPOT runs will take hours to days to finish (unless it's a small dataset), but you can always interrupt\nthe run partway through and see the best results so far. TPOT also \nprovides\n a \nwarm_start\n parameter that\nlets you restart a TPOT run from where it left off.\n\n\nAutoML algorithms can recommend different solutions for the same dataset\n\n\n\nIf you're working with a reasonably complex dataset or run TPOT for a short amount of time, different TPOT runs\nmay result in different pipeline recommendations. TPOT's optimization algorithm is stochastic in nature, which means\nthat it uses randomness (in part) to search the possible pipeline space. 
When two TPOT runs recommend different\npipelines, this means that the TPOT runs didn't converge due to lack of time \nor\n that multiple pipelines\nperform more-or-less the same on your dataset.\n\n\nThis is actually an advantage over fixed grid search techniques: TPOT is meant to be an assistant that gives\nyou ideas on how to solve a particular machine learning problem by exploring pipeline configurations that you\nmight have never considered, then leaves the fine-tuning to more constrained parameter tuning techniques such\nas grid search.\n\n\nTPOT with code\n\n\nWe've taken care to design the TPOT interface to be as similar as possible to scikit-learn.\n\n\nTPOT can be imported just like any regular Python module. To import TPOT, type:\n\n\nfrom tpot import TPOTClassifier\n\n\n\n\nthen create an instance of TPOT as follows:\n\n\npipeline_optimizer = TPOTClassifier()\n\n\n\n\nIt's also possible to use TPOT for regression problems with the \nTPOTRegressor\n class. Other than the class name,\na \nTPOTRegressor\n is used the same way as a \nTPOTClassifier\n. You can read more about the \nTPOTClassifier\n and \nTPOTRegressor\n classes in the \nAPI documentation\n.\n\n\nSome example code with custom TPOT parameters might look like:\n\n\npipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,\n random_state=42, verbosity=2)\n\n\n\n\nNow TPOT is ready to optimize a pipeline for you. You can tell TPOT to optimize a pipeline based on a data set with the \nfit\n function:\n\n\npipeline_optimizer.fit(X_train, y_train)\n\n\n\n\nThe \nfit\n function initializes the genetic programming algorithm to find the highest-scoring pipeline based on average k-fold cross-validation\nThen, the pipeline is trained on the entire set of provided samples, and the TPOT instance can be used as a fitted model.\n\n\nYou can then proceed to evaluate the final pipeline on the testing set with the \nscore\n function:\n\n\nprint(pipeline_optimizer.score(X_test, y_test))\n\n\n\n\nFinally, you can tell TPOT to export the corresponding Python code for the optimized pipeline to a text file with the \nexport\n function:\n\n\npipeline_optimizer.export('tpot_exported_pipeline.py')\n\n\n\n\nOnce this code finishes running, \ntpot_exported_pipeline.py\n will contain the Python code for the optimized pipeline.\n\n\nBelow is a full example script using TPOT to optimize a pipeline, score it, and export the best pipeline to a file.\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\npipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,\n random_state=42, verbosity=2)\npipeline_optimizer.fit(X_train, y_train)\nprint(pipeline_optimizer.score(X_test, y_test))\npipeline_optimizer.export('tpot_exported_pipeline.py')\n\n\n\n\nCheck our \nexamples\n to see TPOT applied to some specific data sets.\n\n\nTPOT on the command line\n\n\nTo use TPOT via the command line, enter the following command with a path to the data file:\n\n\ntpot /path_to/data_file.csv\n\n\n\n\nAn example command-line call to TPOT may look like:\n\n\ntpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2\n\n\n\n\nTPOT offers several arguments that can be provided at the command line. 
To see brief descriptions of these arguments,\nenter the following command:\n\n\ntpot --help\n\n\n\n\nDetailed descriptions of the command-line arguments are below.\n\n\n\n\n\n\nArgument\n\n\nParameter\n\n\nValid values\n\n\nEffect\n\n\n\n\n\n\n-is\n\n\nINPUT_SEPARATOR\n\n\nAny string\n\n\nCharacter used to separate columns in the input file.\n\n\n\n\n\n\n-target\n\n\nTARGET_NAME\n\n\nAny string\n\n\nName of the target column in the input file.\n\n\n\n\n\n\n-mode\n\n\nTPOT_MODE\n\n\n['classification', 'regression']\n\n\nWhether TPOT is being used for a supervised classification or regression problem.\n\n\n\n\n\n\n-o\n\n\nOUTPUT_FILE\n\n\nString path to a file\n\n\nFile to export the code for the final optimized pipeline.\n\n\n\n\n\n\n-g\n\n\nGENERATIONS\n\n\nAny positive integer\n\n\nNumber of iterations to run the pipeline optimization process. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline.\n\n\nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.\n\n\n\n\n\n\n-p\n\n\nPOPULATION_SIZE\n\n\nAny positive integer\n\n\nNumber of individuals to retain in the GP population every generation. Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline.\n\n\nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.\n\n\n\n\n\n\n-os\n\n\nOFFSPRING_SIZE\n\n\nAny positive integer\n\n\nNumber of offspring to produce in each GP generation.\n\n\nBy default, OFFSPRING_SIZE = POPULATION_SIZE.\n\n\n\n\n\n\n-mr\n\n\nMUTATION_RATE\n\n\n[0.0, 1.0]\n\n\nGP mutation rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to apply random changes to every generation.\n\n\nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms.\n\n\n\n\n\n\n-xr\n\n\nCROSSOVER_RATE\n\n\n[0.0, 1.0]\n\n\nGP crossover rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to \"breed\" every generation.\n\n\nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms.\n\n\n\n\n\n\n-scoring\n\n\nSCORING_FN\n\n\n'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy',\n'f1',\n'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error',\n'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro',\n'precision_samples', 'precision_weighted',\n'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples',\n'recall_weighted', 'roc_auc', 'my_module.scorer_name*'\n\n\nFunction used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression.\n\n\nTPOT assumes that any function with \"error\" or \"loss\" in the name is meant to be minimized, whereas any other functions will be maximized.\n\n\nmy_module.scorer_name: You can also specify your own function or a full python path to an existing one.\n\n\nSee the section on \nscoring functions\n for more details.\n\n\n\n\n\n\n-cv\n\n\nCV\n\n\nAny integer > 1\n\n\nNumber of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process.\n\n\n\n\n-sub\n\n\nSUBSAMPLE\n\n\n(0.0, 1.0]\n\n\nSubsample ratio of the training instance. 
Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process.\n\n\n\n\n\n\n-njobs\n\n\nNUM_JOBS\n\n\nAny positive integer or -1\n\n\nNumber of CPUs for evaluating pipelines in parallel during the TPOT optimization process.\n\n\nAssigning this to -1 will use as many cores as available on the computer.\n\n\n\n\n\n\n-maxtime\n\n\nMAX_TIME_MINS\n\n\nAny positive integer\n\n\nHow many minutes TPOT has to optimize the pipeline.\n\n\nIf provided, this setting will override the \"generations\" parameter and allow TPOT to run until it runs out of time.\n\n\n\n\n\n\n-maxeval\n\n\nMAX_EVAL_MINS\n\n\nAny positive float\n\n\nHow many minutes TPOT has to evaluate a single pipeline.\n\n\nSetting this parameter to higher values will allow TPOT to consider more complex pipelines but will also allow TPOT to run longer.\n\n\n\n\n\n\n-s\n\n\nRANDOM_STATE\n\n\nAny positive integer\n\n\nRandom number generator seed for reproducibility.\n\n\nSet this seed if you want your TPOT run to be reproducible with the same seed and data set in the future.\n\n\n\n\n\n\n-config\n\n\nCONFIG_FILE\n\n\nString or file path\n\n\nOperators and parameter configurations in TPOT:\n\n\n\n\n\nPath for configuration file: TPOT will use the path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process\n\n\nstring 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors\n\n\nstring 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies\n\n\nstring 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices.\n\n\n\nSee the \nbuilt-in configurations\n section for the list of configurations included with TPOT, and the \ncustom configuration\n section for more information and examples of how to create your own TPOT configurations.\n\n\n\n\n\n\n\n-memory\n\n\nMEMORY\n\n\nString or file path\n\n\nIf supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. 
Memory caching mode in TPOT:\n\n\n\n\n\nPath for a caching directory: TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown.\n\n\nstring 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown.\n\n\n\n\n\n\n\n\n\n\n-cf\n\n\nCHECKPOINT_FOLDER\n\n\nFolder path\n\n\n\nIf supplied, a folder you created, in which tpot will periodically save pipelines in pareto front so far while optimizing.\n\n\nThis is useful in multiple cases:\n\n\n\nsudden death before tpot could save an optimized pipeline\n\n\nprogress tracking\n\n\ngrabbing a pipeline while tpot is working\n\n\n\n\n\nExample:\n\n\nmkdir my_checkpoints\n\n\n-cf ./my_checkpoints\n\n\n\n\n\n-es\n\n\nEARLY_STOP\n\n\nAny positive integer\n\n\n\nHow many generations TPOT checks whether there is no improvement in optimization process.\n\n\nEnd optimization process if there is no improvement in the set number of generations.\n\n\n\n\n\n-v\n\n\nVERBOSITY\n\n\n{0, 1, 2, 3}\n\n\nHow much information TPOT communicates while it is running.\n\n\n0 = none, 1 = minimal, 2 = high, 3 = all.\n\n\nA setting of 2 or higher will add a progress bar during the optimization procedure.\n\n\n\n\n\n\n--no-update-check\n\n\nFlag indicating whether the TPOT version checker should be disabled.\n\n\n\n\n\n\n--version\n\n\nShow TPOT's version number and exit.\n\n\n\n\n\n\n--help\n\n\nShow TPOT's help documentation and exit.\n\n\n\n\n\n\n\nScoring functions\n\n\nTPOT makes use of \nsklearn.model_selection.cross_val_score\n for evaluating pipelines, and as such offers the same support for scoring functions. There are two ways to make use of scoring functions with TPOT:\n\n\n\n\n\n\nYou can pass in a string to the \nscoring\n parameter from the list above. Any other strings will cause TPOT to throw an exception.\n\n\n\n\n\n\nYou can pass the callable object/function with signature \nscorer(estimator, X, y)\n, where \nestimator\n is trained estimator to use for scoring, \nX\n are features that will be passed to \nestimator.predict\n and \ny\n are target values for \nX\n. To do this, you should implement your own function. See the example below for further explanation.\n\n\n\n\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics.scorer import make_scorer\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n# Make a custom metric function\ndef my_custom_accuracy(y_true, y_pred):\n return float(sum(y_pred == y_true)) / len(y_true)\n\n# Make a custom a scorer from the custom metric function\n# Note: greater_is_better=False in make_scorer below would mean that the scoring function should be minimized.\nmy_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True)\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n scoring=my_custom_scorer)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\n\n\n\n\nYou can pass a metric function with the signature \nscore_func(y_true, y_pred)\n (e.g. \nmy_custom_accuracy\n in the example above), where \ny_true\n are the true target values and \ny_pred\n are the predicted target values from an estimator. To do this, you should implement your own function. See the example above for further explanation. 
TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized (\ngreater_is_better=False\n in \nmake_scorer\n), whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11.\n\n\n\n\n\n\nmy_module.scorer_name\n: You can also use a custom \nscore_func(y_true, y_pred)\n or \nscorer(estimator, X, y)\n function through the command line by adding the argument \n-scoring my_module.scorer\n to your command-line call. TPOT will import your module and use the custom scoring function from there. TPOT will include your current working directory when importing the module, so you can place it in the same directory where you are going to run TPOT.\nExample: \n-scoring sklearn.metrics.auc\n will use the function auc from sklearn.metrics module.\n\n\n\n\n\n\nBuilt-in TPOT configurations\n\n\nTPOT comes with a handful of default operators and parameter configurations that we believe work well for optimizing machine learning pipelines. Below is a list of the current built-in configurations that come with TPOT.\n\n\n\n\n\n\nConfiguration Name\n\n\nDescription\n\n\nOperators\n\n\n\n\n\n\n\nDefault TPOT\n\n\nTPOT will search over a broad range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Some of these operators are complex and may take a long time to run, especially on larger datasets.\n\n\n\nNote: This is the default configuration for TPOT.\n To use this configuration, use the default value (None) for the config_dict parameter.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\nTPOT light\n\n\nTPOT will search over a restricted range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Only simpler and fast-running operators will be used in these pipelines, so TPOT light is useful for finding quick and simple pipelines for a classification or regression problem.\n\n\nThis configuration works for both the TPOTClassifier and TPOTRegressor.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\nTPOT MDR\n\n\nTPOT will search over a series of feature selectors and \nMultifactor Dimensionality Reduction\n models to find a series of operators that maximize prediction accuracy. The TPOT MDR configuration is specialized for \ngenome-wide association studies (GWAS)\n, and is described in detail online \nhere\n.\n\n\nNote that TPOT MDR may be slow to run because the feature selection routines are computationally expensive, especially on large datasets.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\nTPOT sparse\n\n\nTPOT uses a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices.\n\n\nThis configuration works for both the TPOTClassifier and TPOTRegressor.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\n\nTo use any of these configurations, simply pass the string name of the configuration to the \nconfig_dict\n parameter (or \n-config\n on the command line). 
For example, to use the \"TPOT light\" configuration:\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n config_dict='TPOT light')\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\n\nCustomizing TPOT's operators and parameters\n\n\nBeyond the default configurations that come with TPOT, in some cases it is useful to limit the algorithms and parameters that TPOT considers. For that reason, we allow users to provide TPOT with a custom configuration for its operators and parameters.\n\n\nThe custom TPOT configuration must be in nested dictionary format, where the first level key is the path and name of the operator (e.g., \nsklearn.naive_bayes.MultinomialNB\n) and the second level key is the corresponding parameter name for that operator (e.g., \nfit_prior\n). The second level key should point to a list of parameter values for that parameter, e.g., \n'fit_prior': [True, False]\n.\n\n\nFor a simple example, the configuration could be:\n\n\ntpot_config = {\n 'sklearn.naive_bayes.GaussianNB': {\n },\n\n 'sklearn.naive_bayes.BernoulliNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n },\n\n 'sklearn.naive_bayes.MultinomialNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n }\n}\n\n\n\n\nin which case TPOT would only consider pipelines containing \nGaussianNB\n, \nBernoulliNB\n, \nMultinomialNB\n, and tune those algorithm's parameters in the ranges provided. This dictionary can be passed directly within the code to the \nTPOTClassifier\n/\nTPOTRegressor\n \nconfig_dict\n parameter, described above. For example:\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot_config = {\n 'sklearn.naive_bayes.GaussianNB': {\n },\n\n 'sklearn.naive_bayes.BernoulliNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n },\n\n 'sklearn.naive_bayes.MultinomialNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n }\n}\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n config_dict=tpot_config)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\nCommand-line users must create a separate \n.py\n file with the custom configuration and provide the path to the file to the \ntpot\n call. For example, if the simple example configuration above is saved in \ntpot_classifier_config.py\n, that configuration could be used on the command line with the command:\n\n\ntpot data/mnist.csv -is , -target class -config tpot_classifier_config.py -g 5 -p 20 -v 2 -o tpot_exported_pipeline.py\n\n\n\n\nWhen using the command-line interface, the configuration file specified in the \n-config\n parameter \nmust\n name its custom TPOT configuration \ntpot_config\n. 
Otherwise, TPOT will not be able to locate the configuration dictionary.\n\n\nFor more detailed examples of how to customize TPOT's operator configuration, see the default configurations for \nclassification\n and \nregression\n in TPOT's source code.\n\n\nNote that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers.\n\n\nPipeline caching in TPOT\n\n\nWith the \nmemory\n parameter, pipelines can cache the results of each transformer after fitting them. This feature is used to avoid repeated computation by transformers within a pipeline if the parameters and input data are identical to another fitted pipeline during optimization process. TPOT allows users to specify a custom directory path or \nsklearn.external.joblib.Memory\n in case they want to re-use the memory cache in future TPOT runs (or a \nwarm_start\n run).\n\n\nThere are three methods for enabling memory caching in TPOT:\n\n\nfrom tpot import TPOTClassifier\nfrom tempfile import mkdtemp\nfrom sklearn.externals.joblib import Memory\nfrom shutil import rmtree\n\n# Method 1, auto mode: TPOT uses memory caching with a temporary directory and cleans it up upon shutdown\ntpot = TPOTClassifier(memory='auto')\n\n# Method 2, with a custom directory for memory caching\ntpot = TPOTClassifier(memory='/to/your/path')\n\n# Method 3, with a Memory object\ncachedir = mkdtemp() # Create a temporary folder\nmemory = Memory(cachedir=cachedir, verbose=0)\ntpot = TPOTClassifier(memory=memory)\n\n# Clear the cache directory when you don't need it anymore\nrmtree(cachedir)\n\n\n\n\nNote: TPOT does NOT clean up memory caches if users set a custom directory path or Memory object. We recommend that you clean up the memory caches when you don't need it anymore.\n\n\nCrash/freeze issue with n_jobs > 1 under OSX or Linux\n\n\nInternally, TPOT uses \njoblib\n to fit estimators in parallel.\nThis is the same parallelization framework used by scikit-learn. But it may crash/freeze with n_jobs > 1 under OSX or Linux \nas scikit-learn does\n, especially with large datasets.\n\n\nOne solution is to configure Python's \nmultiprocessing\n module to use the \nforkserver\n start method (instead of the default \nfork\n) to manage the process pools. You can enable the \nforkserver\n mode globally for your program by putting the following codes into your main script:\n\n\nimport multiprocessing\n\n# other imports, custom code, load data, define model...\n\nif __name__ == '__main__':\n multiprocessing.set_start_method('forkserver')\n\n # call scikit-learn utils or tpot utils with n_jobs > 1 here\n\n\n\n\nMore information about these start methods can be found in the \nmultiprocessing documentation\n.\n\n\nParallel Training with Dask\n\n\nFor large problems or working on Jupyter notebook, we highly recommend that you can distribute the work on a \nDask\n cluster.\nThe \ndask-examples binder\n has a runnable example\nwith a small dask cluster.\n\n\nTo use your Dask cluster to fit a TPOT model, specify the \nuse_dask\n keyword when you create the TPOT estimator. 
\nNote: if \nuse_dask=True\n, TPOT will use as many cores as available on the your Dask cluster regardless of whether \nn_jobs\n is specified.\n\n\nestimator = TPOTEstimator(use_dask=True)\n\n\n\n\nThis will use use all the workers on your cluster to do the training, and use \nDask-ML's pipeline rewriting\n to avoid re-fitting estimators multiple times on the same set of data.\nIt will also provide fine-grained diagnostics in the \ndistributed scheduler UI\n.\n\n\nAlternatively, Dask implements a joblib backend.\nYou can instruct TPOT to use the distribued backend during training by specifying a \njoblib.parallel_backend\n:\n\n\nfrom sklearn.externals import joblib\nimport distributed.joblib\nfrom dask.distributed import Client\n\n# connect to the cluster\nclient = Client('schedueler-address')\n\n# create the estimator normally\nestimator = TPOTClassifier(n_jobs=-1)\n\n# perform the fit in this context manager\nwith joblib.parallel_backend(\"dask\"):\n estimator.fit(X, y)\n\n\n\n\nSee \ndask's distributed joblib integration\n for more.", + "text": "What to expect from AutoML software\n\n\nAutomated machine learning (AutoML) takes a higher-level approach to machine learning than most practitioners are used to,\nso we've gathered a handful of guidelines on what to expect when running AutoML software such as TPOT.\n\n\nAutoML algorithms aren't intended to run for only a few minutes\n\n\n\nOf course, you \ncan\n run TPOT for only a few minutes and it will find a reasonably good pipeline for your dataset.\nHowever, if you don't run TPOT for long enough, it may not find the best possible pipeline for your dataset. It may even not\nfind any suitable pipeline at all, in which case a \nRuntimeError('A pipeline has not yet been optimized. Please call fit() first.')\n\nwill be raised.\nOften it is worthwhile to run multiple instances of TPOT in parallel for a long time (hours to days) to allow TPOT to thoroughly search\nthe pipeline space for your dataset.\n\n\nAutoML algorithms can take a long time to finish their search\n\n\n\nAutoML algorithms aren't as simple as fitting one model on the dataset; they are considering multiple machine learning algorithms\n(random forests, linear models, SVMs, etc.) in a pipeline with multiple preprocessing steps (missing value imputation, scaling,\nPCA, feature selection, etc.), the hyperparameters for all of the models and preprocessing steps, as well as multiple ways\nto ensemble or stack the algorithms within the pipeline.\n\n\nAs such, TPOT will take a while to run on larger datasets, but it's important to realize why. With the default TPOT settings\n(100 generations with 100 population size), TPOT will evaluate 10,000 pipeline configurations before finishing.\nTo put this number into context, think about a grid search of 10,000 hyperparameter combinations for a machine learning algorithm\nand how long that grid search will take. That is 10,000 model configurations to evaluate with 10-fold cross-validation,\nwhich means that roughly 100,000 models are fit and evaluated on the training data in one grid search.\nThat's a time-consuming procedure, even for simpler models like decision trees.\n\n\nTypical TPOT runs will take hours to days to finish (unless it's a small dataset), but you can always interrupt\nthe run partway through and see the best results so far. 
TPOT also \nprovides\n a \nwarm_start\n parameter that\nlets you restart a TPOT run from where it left off.\n\n\nAutoML algorithms can recommend different solutions for the same dataset\n\n\n\nIf you're working with a reasonably complex dataset or run TPOT for a short amount of time, different TPOT runs\nmay result in different pipeline recommendations. TPOT's optimization algorithm is stochastic in nature, which means\nthat it uses randomness (in part) to search the possible pipeline space. When two TPOT runs recommend different\npipelines, this means that the TPOT runs didn't converge due to lack of time \nor\n that multiple pipelines\nperform more-or-less the same on your dataset.\n\n\nThis is actually an advantage over fixed grid search techniques: TPOT is meant to be an assistant that gives\nyou ideas on how to solve a particular machine learning problem by exploring pipeline configurations that you\nmight have never considered, then leaves the fine-tuning to more constrained parameter tuning techniques such\nas grid search.\n\n\nTPOT with code\n\n\nWe've taken care to design the TPOT interface to be as similar as possible to scikit-learn.\n\n\nTPOT can be imported just like any regular Python module. To import TPOT, type:\n\n\nfrom tpot import TPOTClassifier\n\n\n\n\nthen create an instance of TPOT as follows:\n\n\npipeline_optimizer = TPOTClassifier()\n\n\n\n\nIt's also possible to use TPOT for regression problems with the \nTPOTRegressor\n class. Other than the class name,\na \nTPOTRegressor\n is used the same way as a \nTPOTClassifier\n. You can read more about the \nTPOTClassifier\n and \nTPOTRegressor\n classes in the \nAPI documentation\n.\n\n\nSome example code with custom TPOT parameters might look like:\n\n\npipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,\n random_state=42, verbosity=2)\n\n\n\n\nNow TPOT is ready to optimize a pipeline for you. 
You can tell TPOT to optimize a pipeline based on a data set with the \nfit\n function:\n\n\npipeline_optimizer.fit(X_train, y_train)\n\n\n\n\nThe \nfit\n function initializes the genetic programming algorithm to find the highest-scoring pipeline based on average k-fold cross-validation\nThen, the pipeline is trained on the entire set of provided samples, and the TPOT instance can be used as a fitted model.\n\n\nYou can then proceed to evaluate the final pipeline on the testing set with the \nscore\n function:\n\n\nprint(pipeline_optimizer.score(X_test, y_test))\n\n\n\n\nFinally, you can tell TPOT to export the corresponding Python code for the optimized pipeline to a text file with the \nexport\n function:\n\n\npipeline_optimizer.export('tpot_exported_pipeline.py')\n\n\n\n\nOnce this code finishes running, \ntpot_exported_pipeline.py\n will contain the Python code for the optimized pipeline.\n\n\nBelow is a full example script using TPOT to optimize a pipeline, score it, and export the best pipeline to a file.\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\npipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,\n random_state=42, verbosity=2)\npipeline_optimizer.fit(X_train, y_train)\nprint(pipeline_optimizer.score(X_test, y_test))\npipeline_optimizer.export('tpot_exported_pipeline.py')\n\n\n\n\nCheck our \nexamples\n to see TPOT applied to some specific data sets.\n\n\nTPOT on the command line\n\n\nTo use TPOT via the command line, enter the following command with a path to the data file:\n\n\ntpot /path_to/data_file.csv\n\n\n\n\nAn example command-line call to TPOT may look like:\n\n\ntpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2\n\n\n\n\nTPOT offers several arguments that can be provided at the command line. To see brief descriptions of these arguments,\nenter the following command:\n\n\ntpot --help\n\n\n\n\nDetailed descriptions of the command-line arguments are below.\n\n\n\n\n\n\nArgument\n\n\nParameter\n\n\nValid values\n\n\nEffect\n\n\n\n\n\n\n-is\n\n\nINPUT_SEPARATOR\n\n\nAny string\n\n\nCharacter used to separate columns in the input file.\n\n\n\n\n\n\n-target\n\n\nTARGET_NAME\n\n\nAny string\n\n\nName of the target column in the input file.\n\n\n\n\n\n\n-mode\n\n\nTPOT_MODE\n\n\n['classification', 'regression']\n\n\nWhether TPOT is being used for a supervised classification or regression problem.\n\n\n\n\n\n\n-o\n\n\nOUTPUT_FILE\n\n\nString path to a file\n\n\nFile to export the code for the final optimized pipeline.\n\n\n\n\n\n\n-g\n\n\nGENERATIONS\n\n\nAny positive integer\n\n\nNumber of iterations to run the pipeline optimization process. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline.\n\n\nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.\n\n\n\n\n\n\n-p\n\n\nPOPULATION_SIZE\n\n\nAny positive integer\n\n\nNumber of individuals to retain in the GP population every generation. 
Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline.\n\n\nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.\n\n\n\n\n\n\n-os\n\n\nOFFSPRING_SIZE\n\n\nAny positive integer\n\n\nNumber of offspring to produce in each GP generation.\n\n\nBy default, OFFSPRING_SIZE = POPULATION_SIZE.\n\n\n\n\n\n\n-mr\n\n\nMUTATION_RATE\n\n\n[0.0, 1.0]\n\n\nGP mutation rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to apply random changes to every generation.\n\n\nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms.\n\n\n\n\n\n\n-xr\n\n\nCROSSOVER_RATE\n\n\n[0.0, 1.0]\n\n\nGP crossover rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to \"breed\" every generation.\n\n\nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms.\n\n\n\n\n\n\n-scoring\n\n\nSCORING_FN\n\n\n'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy',\n'f1',\n'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error',\n'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro',\n'precision_samples', 'precision_weighted',\n'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples',\n'recall_weighted', 'roc_auc', 'my_module.scorer_name*'\n\n\nFunction used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression.\n\n\nTPOT assumes that any function with \"error\" or \"loss\" in the name is meant to be minimized, whereas any other functions will be maximized.\n\n\nmy_module.scorer_name: You can also specify your own function or a full python path to an existing one.\n\n\nSee the section on \nscoring functions\n for more details.\n\n\n\n\n\n\n-cv\n\n\nCV\n\n\nAny integer > 1\n\n\nNumber of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process.\n\n\n\n\n-sub\n\n\nSUBSAMPLE\n\n\n(0.0, 1.0]\n\n\nSubsample ratio of the training instance. Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process.\n\n\n\n\n\n\n-njobs\n\n\nNUM_JOBS\n\n\nAny positive integer or -1\n\n\nNumber of CPUs for evaluating pipelines in parallel during the TPOT optimization process.\n\n\nAssigning this to -1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. 
Thus for n_jobs = -2, all CPUs but one are used.\n\n\n\n\n\n\n-maxtime\n\n\nMAX_TIME_MINS\n\n\nAny positive integer\n\n\nHow many minutes TPOT has to optimize the pipeline.\n\n\nIf provided, this setting will override the \"generations\" parameter and allow TPOT to run until it runs out of time.\n\n\n\n\n\n\n-maxeval\n\n\nMAX_EVAL_MINS\n\n\nAny positive float\n\n\nHow many minutes TPOT has to evaluate a single pipeline.\n\n\nSetting this parameter to higher values will allow TPOT to consider more complex pipelines but will also allow TPOT to run longer.\n\n\n\n\n\n\n-s\n\n\nRANDOM_STATE\n\n\nAny positive integer\n\n\nRandom number generator seed for reproducibility.\n\n\nSet this seed if you want your TPOT run to be reproducible with the same seed and data set in the future.\n\n\n\n\n\n\n-config\n\n\nCONFIG_FILE\n\n\nString or file path\n\n\nOperators and parameter configurations in TPOT:\n\n\n\n\n\nPath for configuration file: TPOT will use the path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process\n\n\nstring 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors\n\n\nstring 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies\n\n\nstring 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices.\n\n\n\nSee the \nbuilt-in configurations\n section for the list of configurations included with TPOT, and the \ncustom configuration\n section for more information and examples of how to create your own TPOT configurations.\n\n\n\n\n\n\n\n-template\n\n\nTEMPLATE\n\n\nString\n\n\nTemplate of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are split by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default, the value of template is \"RandomTree\" and TPOT generates tree-based pipelines randomly.\n\nSee the \n template option in tpot\n section for more details.\n\n\n\n\n\n\n\n-memory\n\n\nMEMORY\n\n\nString or file path\n\n\nIf supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. 
Memory caching mode in TPOT:\n\n\n\n\n\nPath for a caching directory: TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown.\n\n\nstring 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown.\n\n\n\n\n\n\n\n\n\n\n-cf\n\n\nCHECKPOINT_FOLDER\n\n\nFolder path\n\n\n\nIf supplied, a folder you created, in which tpot will periodically save pipelines in pareto front so far while optimizing.\n\n\nThis is useful in multiple cases:\n\n\n\nsudden death before tpot could save an optimized pipeline\n\n\nprogress tracking\n\n\ngrabbing a pipeline while tpot is working\n\n\n\n\n\nExample:\n\n\nmkdir my_checkpoints\n\n\n-cf ./my_checkpoints\n\n\n\n\n\n-es\n\n\nEARLY_STOP\n\n\nAny positive integer\n\n\n\nHow many generations TPOT checks whether there is no improvement in optimization process.\n\n\nEnd optimization process if there is no improvement in the set number of generations.\n\n\n\n\n\n-v\n\n\nVERBOSITY\n\n\n{0, 1, 2, 3}\n\n\nHow much information TPOT communicates while it is running.\n\n\n0 = none, 1 = minimal, 2 = high, 3 = all.\n\n\nA setting of 2 or higher will add a progress bar during the optimization procedure.\n\n\n\n\n\n\n--no-update-check\n\n\nFlag indicating whether the TPOT version checker should be disabled.\n\n\n\n\n\n\n--version\n\n\nShow TPOT's version number and exit.\n\n\n\n\n\n\n--help\n\n\nShow TPOT's help documentation and exit.\n\n\n\n\n\n\n\nScoring functions\n\n\nTPOT makes use of \nsklearn.model_selection.cross_val_score\n for evaluating pipelines, and as such offers the same support for scoring functions. There are two ways to make use of scoring functions with TPOT:\n\n\n\n\n\n\nYou can pass in a string to the \nscoring\n parameter from the list above. Any other strings will cause TPOT to throw an exception.\n\n\n\n\n\n\nYou can pass the callable object/function with signature \nscorer(estimator, X, y)\n, where \nestimator\n is trained estimator to use for scoring, \nX\n are features that will be passed to \nestimator.predict\n and \ny\n are target values for \nX\n. To do this, you should implement your own function. See the example below for further explanation.\n\n\n\n\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics.scorer import make_scorer\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n# Make a custom metric function\ndef my_custom_accuracy(y_true, y_pred):\n return float(sum(y_pred == y_true)) / len(y_true)\n\n# Make a custom a scorer from the custom metric function\n# Note: greater_is_better=False in make_scorer below would mean that the scoring function should be minimized.\nmy_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True)\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n scoring=my_custom_scorer)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\n\n\n\n\nYou can pass a metric function with the signature \nscore_func(y_true, y_pred)\n (e.g. \nmy_custom_accuracy\n in the example above), where \ny_true\n are the true target values and \ny_pred\n are the predicted target values from an estimator. To do this, you should implement your own function. See the example above for further explanation. 
TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized (\ngreater_is_better=False\n in \nmake_scorer\n), whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11.\n\n\n\n\n\n\nmy_module.scorer_name\n: You can also use a custom \nscore_func(y_true, y_pred)\n or \nscorer(estimator, X, y)\n function through the command line by adding the argument \n-scoring my_module.scorer\n to your command-line call. TPOT will import your module and use the custom scoring function from there. TPOT will include your current working directory when importing the module, so you can place it in the same directory where you are going to run TPOT.\nExample: \n-scoring sklearn.metrics.auc\n will use the function auc from sklearn.metrics module.\n\n\n\n\n\n\nBuilt-in TPOT configurations\n\n\nTPOT comes with a handful of default operators and parameter configurations that we believe work well for optimizing machine learning pipelines. Below is a list of the current built-in configurations that come with TPOT.\n\n\n\n\n\n\nConfiguration Name\n\n\nDescription\n\n\nOperators\n\n\n\n\n\n\n\nDefault TPOT\n\n\nTPOT will search over a broad range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Some of these operators are complex and may take a long time to run, especially on larger datasets.\n\n\n\nNote: This is the default configuration for TPOT.\n To use this configuration, use the default value (None) for the config_dict parameter.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\nTPOT light\n\n\nTPOT will search over a restricted range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Only simpler and fast-running operators will be used in these pipelines, so TPOT light is useful for finding quick and simple pipelines for a classification or regression problem.\n\n\nThis configuration works for both the TPOTClassifier and TPOTRegressor.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\nTPOT MDR\n\n\nTPOT will search over a series of feature selectors and \nMultifactor Dimensionality Reduction\n models to find a series of operators that maximize prediction accuracy. The TPOT MDR configuration is specialized for \ngenome-wide association studies (GWAS)\n, and is described in detail online \nhere\n.\n\n\nNote that TPOT MDR may be slow to run because the feature selection routines are computationally expensive, especially on large datasets.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\nTPOT sparse\n\n\nTPOT uses a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices.\n\n\nThis configuration works for both the TPOTClassifier and TPOTRegressor.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\n\nTo use any of these configurations, simply pass the string name of the configuration to the \nconfig_dict\n parameter (or \n-config\n on the command line). 
For example, to use the \"TPOT light\" configuration:\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n config_dict='TPOT light')\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\n\nCustomizing TPOT's operators and parameters\n\n\nBeyond the default configurations that come with TPOT, in some cases it is useful to limit the algorithms and parameters that TPOT considers. For that reason, we allow users to provide TPOT with a custom configuration for its operators and parameters.\n\n\nThe custom TPOT configuration must be in nested dictionary format, where the first level key is the path and name of the operator (e.g., \nsklearn.naive_bayes.MultinomialNB\n) and the second level key is the corresponding parameter name for that operator (e.g., \nfit_prior\n). The second level key should point to a list of parameter values for that parameter, e.g., \n'fit_prior': [True, False]\n.\n\n\nFor a simple example, the configuration could be:\n\n\ntpot_config = {\n 'sklearn.naive_bayes.GaussianNB': {\n },\n\n 'sklearn.naive_bayes.BernoulliNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n },\n\n 'sklearn.naive_bayes.MultinomialNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n }\n}\n\n\n\n\nin which case TPOT would only consider pipelines containing \nGaussianNB\n, \nBernoulliNB\n, \nMultinomialNB\n, and tune those algorithm's parameters in the ranges provided. This dictionary can be passed directly within the code to the \nTPOTClassifier\n/\nTPOTRegressor\n \nconfig_dict\n parameter, described above. For example:\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot_config = {\n 'sklearn.naive_bayes.GaussianNB': {\n },\n\n 'sklearn.naive_bayes.BernoulliNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n },\n\n 'sklearn.naive_bayes.MultinomialNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n }\n}\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n config_dict=tpot_config)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\nCommand-line users must create a separate \n.py\n file with the custom configuration and provide the path to the file to the \ntpot\n call. For example, if the simple example configuration above is saved in \ntpot_classifier_config.py\n, that configuration could be used on the command line with the command:\n\n\ntpot data/mnist.csv -is , -target class -config tpot_classifier_config.py -g 5 -p 20 -v 2 -o tpot_exported_pipeline.py\n\n\n\n\nWhen using the command-line interface, the configuration file specified in the \n-config\n parameter \nmust\n name its custom TPOT configuration \ntpot_config\n. 
Otherwise, TPOT will not be able to locate the configuration dictionary.\n\n\nFor more detailed examples of how to customize TPOT's operator configuration, see the default configurations for \nclassification\n and \nregression\n in TPOT's source code.\n\n\nNote that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers.\n\n\nTemplate option in TPOT\n\n\nThe template option provides a way to specify a desired structure for the machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. The current implementation only supports linear pipelines.\n\n\nBelow is a simple example of using the \ntemplate\n option. The pipelines generated/evaluated in TPOT will follow this structure: the 1st step is a feature selector (a subclass of \nSelectorMixin\n), the 2nd step is a feature transformer (a subclass of \nTransformerMixin\n) and the 3rd step is a classifier for classification (a subclass of \nClassifierMixin\n). The last step must be \nClassifier\n for \nTPOTClassifier\n's template but \nRegressor\n for \nTPOTRegressor\n. \nNote: although \nSelectorMixin\n is a subclass of \nTransformerMixin\n in scikit-learn, \nTransformer\n in this option excludes subclasses of \nSelectorMixin\n.\n\n\ntpot_obj = TPOTClassifier(\n template='Selector-Transformer-Classifier'\n )\n\n\n\n\nIf a specific operator, e.g. \nSelectPercentile\n, is preferred for the 1st step of the pipeline, the template can be defined as 'SelectPercentile-Transformer-Classifier'.\n\n\nFeatureSetSelector in TPOT\n\n\nFeatureSetSelector\n is a special new operator in TPOT. This operator enables feature selection based on \na priori\n expert knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets from the Molecular Signatures Database (\nMSigDB\n) in the 1st step of the pipeline via the \ntemplate\n option above, in order to reduce dimensionality and TPOT computation time. Below is an example of how to use this operator in TPOT.\n\n\nfrom tpot import TPOTClassifier\nimport numpy as np\nimport pandas as pd\nfrom tpot.config import classifier_config_dict\ntest_data = pd.read_csv(\"https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/tests.csv\")\ntest_X = test_data.drop(\"class\", axis=1)\ntest_y = test_data['class']\n\n# add FeatureSetSelector into tpot configuration\nclassifier_config_dict['tpot.builtins.FeatureSetSelector'] = {\n 'subset_list': ['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'],\n 'sel_subset': [0,1] # select only one feature set, a list of index of subset in the list above\n #'sel_subset': list(combinations(range(3), 2)) # select two feature sets\n}\n\n\ntpot = TPOTClassifier(generations=5,\n population_size=50, verbosity=2,\n template='FeatureSetSelector-Transformer-Classifier',\n config_dict=classifier_config_dict)\ntpot.fit(test_X, test_y)\n\n\n\n\nPipeline caching in TPOT\n\n\nWith the \nmemory\n parameter, pipelines can cache the results of each transformer after fitting them. This feature is used to avoid repeated computation by transformers within a pipeline if the parameters and input data are identical to another fitted pipeline during optimization process. 
TPOT allows users to specify a custom directory path or \nsklearn.externals.joblib.Memory\n in case they want to re-use the memory cache in future TPOT runs (or a \nwarm_start\n run).\n\n\nThere are three methods for enabling memory caching in TPOT:\n\n\nfrom tpot import TPOTClassifier\nfrom tempfile import mkdtemp\nfrom sklearn.externals.joblib import Memory\nfrom shutil import rmtree\n\n# Method 1, auto mode: TPOT uses memory caching with a temporary directory and cleans it up upon shutdown\ntpot = TPOTClassifier(memory='auto')\n\n# Method 2, with a custom directory for memory caching\ntpot = TPOTClassifier(memory='/to/your/path')\n\n# Method 3, with a Memory object\ncachedir = mkdtemp() # Create a temporary folder\nmemory = Memory(cachedir=cachedir, verbose=0)\ntpot = TPOTClassifier(memory=memory)\n\n# Clear the cache directory when you don't need it anymore\nrmtree(cachedir)\n\n\n\n\nNote: TPOT does NOT clean up memory caches if users set a custom directory path or Memory object. We recommend that you clean up the memory caches when you don't need them anymore.\n\n\nCrash/freeze issue with n_jobs > 1 under OSX or Linux\n\n\nInternally, TPOT uses \njoblib\n to fit estimators in parallel.\nThis is the same parallelization framework used by scikit-learn. But it may crash/freeze with n_jobs > 1 under OSX or Linux \nas scikit-learn does\n, especially with large datasets.\n\n\nOne solution is to configure Python's \nmultiprocessing\n module to use the \nforkserver\n start method (instead of the default \nfork\n) to manage the process pools. You can enable the \nforkserver\n mode globally for your program by putting the following code into your main script:\n\n\nimport multiprocessing\n\n# other imports, custom code, load data, define model...\n\nif __name__ == '__main__':\n multiprocessing.set_start_method('forkserver')\n\n # call scikit-learn utils or tpot utils with n_jobs > 1 here\n\n\n\n\nMore information about these start methods can be found in the \nmultiprocessing documentation\n.\n\n\nParallel Training with Dask\n\n\nFor large problems, or when working in a Jupyter notebook, we highly recommend distributing the work on a \nDask\n cluster.\nThe \ndask-examples binder\n has a runnable example\nwith a small dask cluster.\n\n\nTo use your Dask cluster to fit a TPOT model, specify the \nuse_dask\n keyword when you create the TPOT estimator. \nNote: if \nuse_dask=True\n, TPOT will use as many cores as available on your Dask cluster. If \nn_jobs\n is specified, then it will control the chunk size (10*\nn_jobs\n if it is less than the offspring size) of parallel training. 
\n\n\nestimator = TPOTEstimator(use_dask=True, n_jobs=-1)\n\n\n\n\nThis will use use all the workers on your cluster to do the training, and use \nDask-ML's pipeline rewriting\n to avoid re-fitting estimators multiple times on the same set of data.\nIt will also provide fine-grained diagnostics in the \ndistributed scheduler UI\n.\n\n\nAlternatively, Dask implements a joblib backend.\nYou can instruct TPOT to use the distribued backend during training by specifying a \njoblib.parallel_backend\n:\n\n\nfrom sklearn.externals import joblib\nimport distributed.joblib\nfrom dask.distributed import Client\n\n# connect to the cluster\nclient = Client('schedueler-address')\n\n# create the estimator normally\nestimator = TPOTClassifier(n_jobs=-1)\n\n# perform the fit in this context manager\nwith joblib.parallel_backend(\"dask\"):\n estimator.fit(X, y)\n\n\n\n\nSee \ndask's distributed joblib integration\n for more.", "title": "Using TPOT" }, { @@ -27,7 +27,7 @@ }, { "location": "/using/#tpot-on-the-command-line", - "text": "To use TPOT via the command line, enter the following command with a path to the data file: tpot /path_to/data_file.csv An example command-line call to TPOT may look like: tpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2 TPOT offers several arguments that can be provided at the command line. To see brief descriptions of these arguments,\nenter the following command: tpot --help Detailed descriptions of the command-line arguments are below. Argument Parameter Valid values Effect -is INPUT_SEPARATOR Any string Character used to separate columns in the input file. -target TARGET_NAME Any string Name of the target column in the input file. -mode TPOT_MODE ['classification', 'regression'] Whether TPOT is being used for a supervised classification or regression problem. -o OUTPUT_FILE String path to a file File to export the code for the final optimized pipeline. -g GENERATIONS Any positive integer Number of iterations to run the pipeline optimization process. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. \nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -p POPULATION_SIZE Any positive integer Number of individuals to retain in the GP population every generation. Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline. \nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -os OFFSPRING_SIZE Any positive integer Number of offspring to produce in each GP generation. \nBy default, OFFSPRING_SIZE = POPULATION_SIZE. -mr MUTATION_RATE [0.0, 1.0] GP mutation rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to apply random changes to every generation. \nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. -xr CROSSOVER_RATE [0.0, 1.0] GP crossover rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to \"breed\" every generation. \nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. 
-scoring SCORING_FN 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1',\n'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error',\n'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro',\n'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples',\n'recall_weighted', 'roc_auc', 'my_module.scorer_name*' Function used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression. \nTPOT assumes that any function with \"error\" or \"loss\" in the name is meant to be minimized, whereas any other functions will be maximized. \nmy_module.scorer_name: You can also specify your own function or a full python path to an existing one. \nSee the section on scoring functions for more details. -cv CV Any integer > 1 Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. -sub SUBSAMPLE (0.0, 1.0] Subsample ratio of the training instance. Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process. -njobs NUM_JOBS Any positive integer or -1 Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process. \nAssigning this to -1 will use as many cores as available on the computer. -maxtime MAX_TIME_MINS Any positive integer How many minutes TPOT has to optimize the pipeline. \nIf provided, this setting will override the \"generations\" parameter and allow TPOT to run until it runs out of time. -maxeval MAX_EVAL_MINS Any positive float How many minutes TPOT has to evaluate a single pipeline. \nSetting this parameter to higher values will allow TPOT to consider more complex pipelines but will also allow TPOT to run longer. -s RANDOM_STATE Any positive integer Random number generator seed for reproducibility. \nSet this seed if you want your TPOT run to be reproducible with the same seed and data set in the future. -config CONFIG_FILE String or file path Operators and parameter configurations in TPOT: Path for configuration file: TPOT will use the path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. \nSee the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. -memory MEMORY String or file path If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. Memory caching mode in TPOT: Path for a caching directory: TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown. string 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown. 
-cf CHECKPOINT_FOLDER Folder path \nIf supplied, a folder you created, in which tpot will periodically save pipelines in pareto front so far while optimizing. \nThis is useful in multiple cases: sudden death before tpot could save an optimized pipeline progress tracking grabbing a pipeline while tpot is working \nExample: \nmkdir my_checkpoints \n-cf ./my_checkpoints -es EARLY_STOP Any positive integer \nHow many generations TPOT checks whether there is no improvement in optimization process. \nEnd optimization process if there is no improvement in the set number of generations. -v VERBOSITY {0, 1, 2, 3} How much information TPOT communicates while it is running. \n0 = none, 1 = minimal, 2 = high, 3 = all. \nA setting of 2 or higher will add a progress bar during the optimization procedure. --no-update-check Flag indicating whether the TPOT version checker should be disabled. --version Show TPOT's version number and exit. --help Show TPOT's help documentation and exit.", + "text": "To use TPOT via the command line, enter the following command with a path to the data file: tpot /path_to/data_file.csv An example command-line call to TPOT may look like: tpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2 TPOT offers several arguments that can be provided at the command line. To see brief descriptions of these arguments,\nenter the following command: tpot --help Detailed descriptions of the command-line arguments are below. Argument Parameter Valid values Effect -is INPUT_SEPARATOR Any string Character used to separate columns in the input file. -target TARGET_NAME Any string Name of the target column in the input file. -mode TPOT_MODE ['classification', 'regression'] Whether TPOT is being used for a supervised classification or regression problem. -o OUTPUT_FILE String path to a file File to export the code for the final optimized pipeline. -g GENERATIONS Any positive integer Number of iterations to run the pipeline optimization process. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. \nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -p POPULATION_SIZE Any positive integer Number of individuals to retain in the GP population every generation. Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline. \nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -os OFFSPRING_SIZE Any positive integer Number of offspring to produce in each GP generation. \nBy default, OFFSPRING_SIZE = POPULATION_SIZE. -mr MUTATION_RATE [0.0, 1.0] GP mutation rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to apply random changes to every generation. \nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. -xr CROSSOVER_RATE [0.0, 1.0] GP crossover rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to \"breed\" every generation. \nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. 
-scoring SCORING_FN 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1',\n'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error',\n'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro',\n'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples',\n'recall_weighted', 'roc_auc', 'my_module.scorer_name*' Function used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression. \nTPOT assumes that any function with \"error\" or \"loss\" in the name is meant to be minimized, whereas any other functions will be maximized. \nmy_module.scorer_name: You can also specify your own function or a full python path to an existing one. \nSee the section on scoring functions for more details. -cv CV Any integer > 1 Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. -sub SUBSAMPLE (0.0, 1.0] Subsample ratio of the training instance. Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process. -njobs NUM_JOBS Any positive integer or -1 Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process. \nAssigning this to -1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. -maxtime MAX_TIME_MINS Any positive integer How many minutes TPOT has to optimize the pipeline. \nIf provided, this setting will override the \"generations\" parameter and allow TPOT to run until it runs out of time. -maxeval MAX_EVAL_MINS Any positive float How many minutes TPOT has to evaluate a single pipeline. \nSetting this parameter to higher values will allow TPOT to consider more complex pipelines but will also allow TPOT to run longer. -s RANDOM_STATE Any positive integer Random number generator seed for reproducibility. \nSet this seed if you want your TPOT run to be reproducible with the same seed and data set in the future. -config CONFIG_FILE String or file path Operators and parameter configurations in TPOT: Path for configuration file: TPOT will use the path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. \nSee the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. -template TEMPLATE String Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. 
If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are split by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default, the value of template is \"RandomTree\" and TPOT generates a tree-based pipeline randomly.\n\nSee the template option in tpot section for more details. -memory MEMORY String or file path If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during the optimization process. Memory caching mode in TPOT: Path for a caching directory: TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown. string 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown. -cf CHECKPOINT_FOLDER Folder path \nIf supplied, a folder you created, in which tpot will periodically save pipelines in pareto front so far while optimizing. \nThis is useful in multiple cases: sudden death before tpot could save an optimized pipeline progress tracking grabbing a pipeline while tpot is working \nExample: \nmkdir my_checkpoints \n-cf ./my_checkpoints -es EARLY_STOP Any positive integer \nHow many generations TPOT checks whether there is no improvement in the optimization process. \nEnds the optimization process if there is no improvement in the set number of generations. -v VERBOSITY {0, 1, 2, 3} How much information TPOT communicates while it is running. \n0 = none, 1 = minimal, 2 = high, 3 = all. \nA setting of 2 or higher will add a progress bar during the optimization procedure. --no-update-check Flag indicating whether the TPOT version checker should be disabled. --version Show TPOT's version number and exit. --help Show TPOT's help documentation and exit.",
    "title": "TPOT on the command line"
  },
  {
@@ -45,6 +45,16 @@
     "text": "Beyond the default configurations that come with TPOT, in some cases it is useful to limit the algorithms and parameters that TPOT considers. For that reason, we allow users to provide TPOT with a custom configuration for its operators and parameters. The custom TPOT configuration must be in nested dictionary format, where the first level key is the path and name of the operator (e.g., sklearn.naive_bayes.MultinomialNB ) and the second level key is the corresponding parameter name for that operator (e.g., fit_prior ). The second level key should point to a list of parameter values for that parameter, e.g., 'fit_prior': [True, False] . 
For a simple example, the configuration could be: tpot_config = {\n    'sklearn.naive_bayes.GaussianNB': {\n    },\n\n    'sklearn.naive_bayes.BernoulliNB': {\n        'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n        'fit_prior': [True, False]\n    },\n\n    'sklearn.naive_bayes.MultinomialNB': {\n        'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n        'fit_prior': [True, False]\n    }\n} in which case TPOT would only consider pipelines containing GaussianNB , BernoulliNB , MultinomialNB , and tune those algorithms' parameters in the ranges provided. This dictionary can be passed directly within the code to the TPOTClassifier / TPOTRegressor config_dict parameter, described above. For example: from tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n                                                    train_size=0.75, test_size=0.25)\n\ntpot_config = {\n    'sklearn.naive_bayes.GaussianNB': {\n    },\n\n    'sklearn.naive_bayes.BernoulliNB': {\n        'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n        'fit_prior': [True, False]\n    },\n\n    'sklearn.naive_bayes.MultinomialNB': {\n        'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n        'fit_prior': [True, False]\n    }\n}\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n                      config_dict=tpot_config)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py') Command-line users must create a separate .py file with the custom configuration and provide the path to the file to the tpot call. For example, if the simple example configuration above is saved in tpot_classifier_config.py , that configuration could be used on the command line with the command: tpot data/mnist.csv -is , -target class -config tpot_classifier_config.py -g 5 -p 20 -v 2 -o tpot_exported_pipeline.py When using the command-line interface, the configuration file specified in the -config parameter must name its custom TPOT configuration tpot_config . Otherwise, TPOT will not be able to locate the configuration dictionary. For more detailed examples of how to customize TPOT's operator configuration, see the default configurations for classification and regression in TPOT's source code. Note that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import or use XGBoost in the pipelines it considers.",
    "title": "Customizing TPOT's operators and parameters"
  },
+  {
+    "location": "/using/#template-option-in-tpot",
+    "text": "The template option provides a way to specify a desired structure for the machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. The current implementation only supports linear pipelines. Below is a simple example of using the template option. The pipelines generated/evaluated in TPOT will follow this structure: the 1st step is a feature selector (a subclass of SelectorMixin ), the 2nd step is a feature transformer (a subclass of TransformerMixin ) and the 3rd step is a classifier for classification (a subclass of ClassifierMixin ). The last step must be Classifier for a TPOTClassifier 's template and Regressor for a TPOTRegressor 's. Note: although SelectorMixin is a subclass of TransformerMixin in scikit-learn, Transformer in this option excludes subclasses of SelectorMixin. 
tpot_obj = TPOTClassifier(\n template='Selector-Transformer-Classifier'\n ) If a specific operator, e.g. SelectPercentile , is preferred for use in the 1st step of the pipeline, the template can be defined as 'SelectPercentile-Transformer-Classifier'.",
+    "title": "Template option in TPOT"
+  },
+  {
+    "location": "/using/#featuresetselector-in-tpot",
+    "text": "FeatureSetSelector is a special new operator in TPOT. This operator enables feature selection based on a priori expert knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets from the Molecular Signatures Database ( MSigDB ) in the 1st step of the pipeline via the template option above, in order to reduce dimensions and TPOT computation time. Below is an example of how to use this operator in TPOT. from tpot import TPOTClassifier\nimport numpy as np\nimport pandas as pd\nfrom tpot.config import classifier_config_dict\ntest_data = pd.read_csv(\"https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/tests.csv\")\ntest_X = test_data.drop(\"class\", axis=1)\ntest_y = test_data['class']\n\n# add FeatureSetSelector into tpot configuration\nclassifier_config_dict['tpot.builtins.FeatureSetSelector'] = {\n    'subset_list': ['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'],\n    'sel_subset': [0,1] # select only one feature set, a list of index of subset in the list above\n    #'sel_subset': list(combinations(range(3), 2)) # select two feature sets\n}\n\n\ntpot = TPOTClassifier(generations=5,\n                           population_size=50, verbosity=2,\n                           template='FeatureSetSelector-Transformer-Classifier',\n                           config_dict=classifier_config_dict)\ntpot.fit(test_X, test_y)",
+    "title": "FeatureSetSelector in TPOT"
+  },
   {
     "location": "/using/#pipeline-caching-in-tpot",
     "text": "With the memory parameter, pipelines can cache the results of each transformer after fitting them. This feature is used to avoid repeated computation by transformers within a pipeline if the parameters and input data are identical to another fitted pipeline during the optimization process. TPOT allows users to specify a custom directory path or sklearn.externals.joblib.Memory in case they want to re-use the memory cache in future TPOT runs (or a warm_start run). There are three methods for enabling memory caching in TPOT: from tpot import TPOTClassifier\nfrom tempfile import mkdtemp\nfrom sklearn.externals.joblib import Memory\nfrom shutil import rmtree\n\n# Method 1, auto mode: TPOT uses memory caching with a temporary directory and cleans it up upon shutdown\ntpot = TPOTClassifier(memory='auto')\n\n# Method 2, with a custom directory for memory caching\ntpot = TPOTClassifier(memory='/to/your/path')\n\n# Method 3, with a Memory object\ncachedir = mkdtemp() # Create a temporary folder\nmemory = Memory(cachedir=cachedir, verbose=0)\ntpot = TPOTClassifier(memory=memory)\n\n# Clear the cache directory when you don't need it anymore\nrmtree(cachedir) Note: TPOT does NOT clean up memory caches if users set a custom directory path or Memory object. We recommend that you clean up the memory caches when you don't need it anymore.",
@@ -57,22 +67,22 @@
   },
   {
     "location": "/using/#parallel-training-with-dask",
     "text": "For large problems, or when working in a Jupyter notebook, we highly recommend distributing the work on a Dask cluster.\nThe dask-examples binder has a runnable example\nwith a small dask cluster. 
To use your Dask cluster to fit a TPOT model, specify the use_dask keyword when you create the TPOT estimator. Note: if use_dask=True , TPOT will use as many cores as available on your Dask cluster. If n_jobs is specified, it will control the chunk size (10 * n_jobs if that is less than the offspring size) of parallel training. estimator = TPOTEstimator(use_dask=True, n_jobs=-1) This will use all the workers on your cluster to do the training, and use Dask-ML's pipeline rewriting to avoid re-fitting estimators multiple times on the same set of data.\nIt will also provide fine-grained diagnostics in the distributed scheduler UI . 
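For example, a minimal sketch of this workflow (the scheduler address, the X_train / y_train arrays and the installed dask.distributed / dask-ml stack are illustrative assumptions, not something TPOT provides): from dask.distributed import Client\nfrom tpot import TPOTClassifier\n\n# connect to an already-running Dask scheduler; the address is a placeholder\nclient = Client('scheduler-address')\n\n# any TPOT estimator accepts use_dask=True; pipeline evaluation then runs on the cluster\ntpot = TPOTClassifier(generations=5, population_size=20, use_dask=True, verbosity=2)\ntpot.fit(X_train, y_train) 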
Alternatively, Dask implements a joblib backend.\nYou can instruct TPOT to use the distributed backend during training by specifying a joblib.parallel_backend : from sklearn.externals import joblib\nimport distributed.joblib\nfrom dask.distributed import Client\n\n# connect to the cluster\nclient = Client('scheduler-address')\n\n# create the estimator normally\nestimator = TPOTClassifier(n_jobs=-1)\n\n# perform the fit in this context manager\nwith joblib.parallel_backend(\"dask\"):\n    estimator.fit(X, y) See dask's distributed joblib integration for more.",
     "title": "Parallel Training with Dask"
   },
   {
     "location": "/api/",
-    "text": "Classification\n\n\nclass\n tpot.\nTPOTClassifier\n(\ngenerations\n=100, \npopulation_size\n=100,\n 
This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation.\n\n\n\nmutation_rate\n + \ncrossover_rate\n cannot exceed 1.0.\n\n\nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms.\n\n\n\n\nscoring\n: string or callable, optional (default='accuracy')\n\n\nFunction used to evaluate the quality of a given pipeline for the classification problem. The following built-in scoring functions can be used:\n\n\n'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss','precision',\n'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc'\n\n\nIf you would like to use a custom scorer, you can pass the callable object/function with signature \nscorer(estimator, X, y)\n.\n\n\nIf you would like to use a metric function, you can pass the callable function to this parameter with the signature \nscore_func(y_true, y_pred)\n. TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11.\n\n\nSee the section on \nscoring functions\n for more details.\n\n\n\n\n\ncv\n: int, cross-validation generator, or an iterable, optional (default=5)\n\n\nCross-validation strategy used when evaluating pipelines.\n\n\nPossible inputs:\n\n\n\ninteger, to specify the number of folds in a StratifiedKFold,\n\n\nAn object to be used as a cross-validation generator, or\n\n\nAn iterable yielding train/test splits.\n\n\n\n\n\nsubsample\n: float, optional (default=1.0)\n\n\nFraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0].\n\n\nSetting \nsubsample\n=0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process.\n\n\n\n\nn_jobs\n: integer, optional (default=1)\n\n\nNumber of processes to use in parallel for evaluating pipelines during the TPOT optimization process.\n\n\nSetting \nn_jobs\n=-1 will use as many cores as available on the computer. Beware that using multiple processes on the same machine may cause memory issues for large datasets\n\n\n\n\nmax_time_mins\n: integer or None, optional (default=None)\n\n\nHow many minutes TPOT has to optimize the pipeline.\n\n\nIf not None, this setting will override the \ngenerations\n parameter and allow TPOT to run until \nmax_time_mins\n minutes elapse.\n\n\n\n\nmax_eval_time_mins\n: float, optional (default=5)\n\n\nHow many minutes TPOT has to evaluate a single pipeline.\n\n\nSetting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. 
Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines.\n\n\n\n\nrandom_state\n: integer or None, optional (default=None)\n\n\nThe seed of the pseudo random number generator used in TPOT.\n\n\nUse this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed.\n\n\n\n\nconfig_dict\n: Python dictionary, string, or None, optional (default=None)\n\n\nA configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process.\n\n\nPossible inputs are:\n\n\n\nPython dictionary, TPOT will use your custom configuration,\n\n\nstring 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or\n\n\nstring 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or\n\n\nstring 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or\n\n\nNone, TPOT will use the default TPOTClassifier configuration.\n\n\n\nSee the \nbuilt-in configurations\n section for the list of configurations included with TPOT, and the \ncustom configuration\n section for more information and examples of how to create your own TPOT configurations.\n\n\n\n\nwarm_start\n: boolean, optional (default=False)\n\n\nFlag indicating whether the TPOT instance will reuse the population from previous calls to \nfit()\n.\n\n\nSetting \nwarm_start\n=True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off.\n\n\n\n\nmemory\n: a sklearn.external.joblib.Memory object or string, optional (default=None)\n\n\nIf supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in \nscikit-learn documentation\n\n\n\nPossible inputs are:\n\n\n\nString 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or\n\n\nPath of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or\n\n\nMemory object, TPOT uses the instance of sklearn.external.joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or\n\n\nNone, TPOT does not use memory caching.\n\n\n\n\n\n\n\nuse_dask\n: boolean, optional (default: False)\n\n\nWhether to use Dask-ML's pipeline optimiziations. This avoid re-fitting\nthe same estimator on the same split of data multiple times. 
It\nwill also provide more detailed diagnostics when using Dask's\ndistributed scheduler.\n\n\nSee \navoid repeated work\n for more details.\n\n\n\n\nperiodic_checkpoint_folder\n: path string, optional (default: None)\n\n\nIf supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing.\n\nCurrently once per generation but not more often than once per 30 seconds.\n\nUseful in multiple cases:\n\n\n\nSudden death before TPOT could save optimized pipeline\n\n\nTrack its progress\n\n\nGrab pipelines while it's still optimizing\n\n\n\n\n\n\n\nearly_stop\n: integer, optional (default: None)\n\n\nHow many generations TPOT checks whether there is no improvement in optimization process.\n\n\nEnds the optimization process if there is no improvement in the given number of generations.\n\n\n\n\nverbosity\n: integer, optional (default=0)\n\n\nHow much information TPOT communicates while it's running.\n\n\nPossible inputs are:\n\n\n\n0, TPOT will print nothing,\n\n\n1, TPOT will print minimal information,\n\n\n2, TPOT will print more information and provide a progress bar, or\n\n\n3, TPOT will print everything and provide a progress bar.\n\n\n\n\n\n\n\ndisable_update_check\n: boolean, optional (default=False)\n\n\nFlag indicating whether the TPOT version checker should be disabled.\n\n\nThe update checker will tell you when a new version of TPOT has been released.\n\n\n\n\n\n\n\n\n\n\nAttributes:\n\n\n\n\nfitted_pipeline_\n: scikit-learn Pipeline object\n\n\nThe best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset.\n\n\n\n\npareto_front_fitted_pipelines_\n: Python dictionary\n\n\nDictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset.\n\n\nThe TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline.\n\n\nNote: \npareto_front_fitted_pipelines_\n is only available when \nverbosity\n=3.\n\n\n\n\nevaluated_individuals_\n: Python dictionary\n\n\nDictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline).\n\n\nThis attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated.\n\n\n\n\n\n\n\n\n\n\nExample\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTClassifier(generations=5, population_size=50, verbosity=2)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\nFunctions\n\n\n\n\n\n\nfit\n(features, classes[, sample_weight, groups])\n\n\nRun the TPOT optimization process on the given training data.\n\n\n\n\n\n\n\npredict\n(features)\n\n\nUse the optimized pipeline to predict the classes for a feature set.\n\n\n\n\n\n\n\npredict_proba\n(features)\n\n\nUse the optimized pipeline to estimate the class probabilities for a feature set.\n\n\n\n\n\n\n\nscore\n(testing_features, testing_classes)\n\n\nReturns the optimized 
pipeline's score on the given testing data using the user-specified scoring function.\n\n\n\n\n\n\n\nexport\n(output_file_name)\n\n\nExport the optimized pipeline as Python code.\n\n\n\n\n\n\n\n\n\nfit(features, classes, sample_weight=None, groups=None)\n\n\n\n\n\nRun the TPOT optimization process on the given training data.\n\n\nUses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validaton to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\nTPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values.\nAs such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed)\nusing \nmedian value imputation\n.\n\n\nIf you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT.\n\n\n\n\nclasses\n: array-like {n_samples}\n\n\nList of class labels for prediction\n\n\n\n\nsample_weight\n: array-like {n_samples}, optional\n\n\nPer-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines.\n\n\n\n\ngroups\n: array-like, with shape {n_samples, }, optional\n\n\nGroup labels for the samples used when performing cross-validation.\n\n\nThis parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as \nsklearn.model_selection.GroupKFold\n.\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\nself\n: object\n\n\nReturns a copy of the fitted TPOT object\n\n\n\n\n\n\n\n\n\n\n\n\n\n\npredict(features)\n\n\n\n\n\nUse the optimized pipeline to predict the classes for a feature set.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\npredictions\n: array-like {n_samples}\n\n\nPredicted classes for the samples in the feature matrix\n\n\n\n\n\n\n\n\n\n\n\n\n\n\npredict_proba(features)\n\n\n\n\n\nUse the optimized pipeline to estimate the class probabilities for a feature set.\n\n\nNote: This function will only work for pipelines whose final classifier supports the \npredict_proba\n function. 
TPOT will raise an error otherwise.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\npredictions\n: array-like {n_samples, n_classes}\n\n\nThe class probabilities of the input samples\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nscore(testing_features, testing_classes)\n\n\n\n\n\nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function.\n\n\nThe default scoring function for TPOTClassifier is 'accuracy'.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\ntesting_features\n: array-like {n_samples, n_features}\n\n\nFeature matrix of the testing set\n\n\n\n\ntesting_classes\n: array-like {n_samples}\n\n\nList of class labels for prediction in the testing set\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\naccuracy_score\n: float\n\n\nThe estimated test set accuracy according to the user-specified scoring function.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nexport(output_file_name)\n\n\n\n\n\nExport the optimized pipeline as Python code.\n\n\nSee the \nusage documentation\n for example usage of the export function.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\noutput_file_name\n: string\n\n\nString containing the path and file name of the desired output file\n\n\n\n\n\n\n\nReturns:\n\n\n\nDoes not return anything\n\n\n\n\n\n\n\n\n\n\nRegression\n\n\nclass\n tpot.\nTPOTRegressor\n(\ngenerations\n=100, \npopulation_size\n=100,\n \noffspring_size\n=None, \nmutation_rate\n=0.9,\n \ncrossover_rate\n=0.1,\n \nscoring\n='neg_mean_squared_error', \ncv\n=5,\n \nsubsample\n=1.0, \nn_jobs\n=1,\n \nmax_time_mins\n=None, \nmax_eval_time_mins\n=5,\n \nrandom_state\n=None, \nconfig_dict\n=None,\n \nwarm_start\n=False,\n \nmemory\n=None,\n \nuse_dask\n=False,\n \nperiodic_checkpoint_folder\n=None,\n \nearly_stop\n=None,\n \nverbosity\n=0,\n \ndisable_update_check\n=False\n)\n\n\n\nsource\n\n\n\nAutomated machine learning for supervised regression tasks.\n\n\nThe TPOTRegressor performs an intelligent search over machine learning pipelines that can contain supervised regression models,\npreprocessors, feature selection techniques, and any other estimator or transformer that follows the \nscikit-learn API\n.\nThe TPOTRegressor will also search over the hyperparameters of all objects in the pipeline.\n\n\nBy default, TPOTRegressor will search over a broad range of supervised regression models, transformers, and their hyperparameters.\nHowever, the models, transformers, and parameters that the TPOTRegressor searches over can be fully customized using the \nconfig_dict\n parameter.\n\n\nRead more in the \nUser Guide\n.\n\n\n\n\n\n\nParameters:\n\n\n\n\ngenerations\n: int, optional (default=100)\n\n\nNumber of iterations to the run pipeline optimization process. Must be a positive number.\n\n\nGenerally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline.\n\n\nTPOT will evaluate \npopulation_size\n + \ngenerations\n \u00d7 \noffspring_size\n pipelines in total.\n\n\n\n\npopulation_size\n: int, optional (default=100)\n\n\nNumber of individuals to retain in the genetic programming population every generation. Must be a positive number.\n\n\nGenerally, TPOT will work better when you give it more individuals with which to optimize the pipeline.\n\n\n\n\noffspring_size\n: int, optional (default=None)\n\n\nNumber of offspring to produce in each genetic programming generation. Must be a positive number. 
By default, the number of offspring is equal to the number of population size.\n\n\n\n\nmutation_rate\n: float, optional (default=0.9)\n\n\nMutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation.\n\n\n\nmutation_rate\n + \ncrossover_rate\n cannot exceed 1.0.\n\n\nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms.\n\n\n\n\ncrossover_rate\n: float, optional (default=0.1)\n\n\nCrossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation.\n\n\n\nmutation_rate\n + \ncrossover_rate\n cannot exceed 1.0.\n\n\nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms.\n\n\n\n\nscoring\n: string or callable, optional (default='neg_mean_squared_error')\n\n\nFunction used to evaluate the quality of a given pipeline for the regression problem. The following built-in scoring functions can be used:\n\n\n'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2'\n\n\nNote that we recommend using the \nneg\n version of mean squared error and related metrics so TPOT will minimize (instead of maximize) the metric.\n\n\nIf you would like to use a custom scorer, you can pass the callable object/function with signature \nscorer(estimator, X, y)\n.\n\n\nIf you would like to use a metric function, you can pass the callable function to this parameter with the signature \nscore_func(y_true, y_pred)\n. TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11.\n\n\nSee the section on \nscoring functions\n for more details.\n\n\n\n\ncv\n: int, cross-validation generator, or an iterable, optional (default=5)\n\n\nCross-validation strategy used when evaluating pipelines.\n\n\nPossible inputs:\n\n\n\ninteger, to specify the number of folds in a KFold,\n\n\nAn object to be used as a cross-validation generator, or\n\n\nAn iterable yielding train/test splits.\n\n\n\n\n\n\n\nsubsample\n: float, optional (default=1.0)\n\n\nFraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0].\n\n\nSetting \nsubsample\n=0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process.\n\n\n\n\nn_jobs\n: integer, optional (default=1)\n\n\nNumber of processes to use in parallel for evaluating pipelines during the TPOT optimization process.\n\n\nSetting \nn_jobs\n=-1 will use as many cores as available on the computer. Beware that using multiple processes on the same machine may cause memory issues for large datasets\n\n\n\n\nmax_time_mins\n: integer or None, optional (default=None)\n\n\nHow many minutes TPOT has to optimize the pipeline.\n\n\nIf not None, this setting will override the \ngenerations\n parameter and allow TPOT to run until \nmax_time_mins\n minutes elapse.\n\n\n\n\nmax_eval_time_mins\n: float, optional (default=5)\n\n\nHow many minutes TPOT has to evaluate a single pipeline.\n\n\nSetting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. 
Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines.\n\n\n\n\nrandom_state\n: integer or None, optional (default=None)\n\n\nThe seed of the pseudo random number generator used in TPOT.\n\n\nUse this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed.\n\n\n\n\nconfig_dict\n: Python dictionary, string, or None, optional (default=None)\n\n\nA configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process.\n\n\nPossible inputs are:\n\n\n\nPython dictionary, TPOT will use your custom configuration,\n\n\nstring 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or\n\n\nstring 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or\n\n\nstring 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or\n\n\nNone, TPOT will use the default TPOTRegressor configuration.\n\n\n\nSee the \nbuilt-in configurations\n section for the list of configurations included with TPOT, and the \ncustom configuration\n section for more information and examples of how to create your own TPOT configurations.\n\n\n\n\nwarm_start\n: boolean, optional (default=False)\n\n\nFlag indicating whether the TPOT instance will reuse the population from previous calls to \nfit()\n.\n\n\nSetting \nwarm_start\n=True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off.\n\n\n\n\nmemory\n: a sklearn.external.joblib.Memory object or string, optional (default=None)\n\n\nIf supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in \nscikit-learn documentation\n\n\n\nPossible inputs are:\n\n\n\nString 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or\n\n\nPath of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or\n\n\nMemory object, TPOT uses the instance of sklearn.external.joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or\n\n\nNone, TPOT does not use memory caching.\n\n\n\n\n\n\n\nuse_dask\n: boolean, optional (default: False)\n\n\nWhether to use Dask-ML's pipeline optimiziations. This avoid re-fitting\nthe same estimator on the same split of data multiple times. 
It\nwill also provide more detailed diagnostics when using Dask's\ndistributed scheduler.\n\n\nSee \navoid repeated work\n for more details.\n\n\n\n\nperiodic_checkpoint_folder\n: path string, optional (default: None)\n\n\nIf supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing.\n\nCurrently once per generation but not more often than once per 30 seconds.\n\nUseful in multiple cases:\n\n\n\nSudden death before TPOT could save optimized pipeline\n\n\nTrack its progress\n\n\nGrab pipelines while it's still optimizing\n\n\n\n\n\n\n\nearly_stop\n: integer, optional (default: None)\n\n\nHow many generations TPOT checks whether there is no improvement in optimization process.\n\n\nEnds the optimization process if there is no improvement in the given number of generations.\n\n\n\n\nverbosity\n: integer, optional (default=0)\n\n\nHow much information TPOT communicates while it's running.\n\n\nPossible inputs are:\n\n\n\n0, TPOT will print nothing,\n\n\n1, TPOT will print minimal information,\n\n\n2, TPOT will print more information and provide a progress bar, or\n\n\n3, TPOT will print everything and provide a progress bar.\n\n\n\n\n\n\n\ndisable_update_check\n: boolean, optional (default=False)\n\n\nFlag indicating whether the TPOT version checker should be disabled.\n\n\nThe update checker will tell you when a new version of TPOT has been released.\n\n\n\n\n\n\n\n\n\n\nAttributes:\n\n\n\n\nfitted_pipeline_\n: scikit-learn Pipeline object\n\n\nThe best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset.\n\n\n\n\npareto_front_fitted_pipelines_\n: Python dictionary\n\n\nDictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset.\n\n\nThe TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline.\n\n\nNote: \n_pareto_front_fitted_pipelines\n is only available when \nverbosity\n=3.\n\n\n\n\nevaluated_individuals_\n: Python dictionary\n\n\nDictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline).\n\n\nThis attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated.\n\n\n\n\n\n\n\n\n\n\nExample\n\n\nfrom tpot import TPOTRegressor\nfrom sklearn.datasets import load_boston\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_boston()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_boston_pipeline.py')\n\n\n\n\nFunctions\n\n\n\n\n\n\nfit\n(features, target[, sample_weight, groups])\n\n\nRun the TPOT optimization process on the given training data.\n\n\n\n\n\n\n\npredict\n(features)\n\n\nUse the optimized pipeline to predict the target values for a feature set.\n\n\n\n\n\n\n\nscore\n(testing_features, testing_target)\n\n\nReturns the optimized pipeline's score on the given testing data using the user-specified scoring 
function.\n\n\n\n\n\n\n\nexport\n(output_file_name)\n\n\nExport the optimized pipeline as Python code.\n\n\n\n\n\n\n\n\n\nfit(features, target, sample_weight=None, groups=None)\n\n\n\n\n\nRun the TPOT optimization process on the given training data.\n\n\nUses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validaton to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\nTPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values.\nAs such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed)\nusing \nmedian value imputation\n.\n\n\nIf you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT.\n\n\n\n\ntarget\n: array-like {n_samples}\n\n\nList of target labels for prediction\n\n\n\n\nsample_weight\n: array-like {n_samples}, optional\n\n\nPer-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines.\n\n\n\n\ngroups\n: array-like, with shape {n_samples, }, optional\n\n\nGroup labels for the samples used when performing cross-validation.\n\n\nThis parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as \nsklearn.model_selection.GroupKFold\n.\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\nself\n: object\n\n\nReturns a copy of the fitted TPOT object\n\n\n\n\n\n\n\n\n\n\n\n\n\n\npredict(features)\n\n\n\n\n\nUse the optimized pipeline to predict the target values for a feature set.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\npredictions\n: array-like {n_samples}\n\n\nPredicted target values for the samples in the feature matrix\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nscore(testing_features, testing_target)\n\n\n\n\n\nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function.\n\n\nThe default scoring function for TPOTClassifier is 'mean_squared_error'.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\ntesting_features\n: array-like {n_samples, n_features}\n\n\nFeature matrix of the testing set\n\n\n\n\ntesting_target\n: array-like {n_samples}\n\n\nList of target labels for prediction in the testing set\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\naccuracy_score\n: float\n\n\nThe estimated test set accuracy according to the user-specified scoring function.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nexport(output_file_name)\n\n\n\n\n\nExport the optimized pipeline as Python code.\n\n\nSee the \nusage documentation\n for example usage of the export function.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\noutput_file_name\n: string\n\n\nString containing the path and file name of the desired output file\n\n\n\n\n\n\n\nReturns:\n\n\n\nDoes not return anything", + "text": "Classification\n\n\nclass\n tpot.\nTPOTClassifier\n(\ngenerations\n=100, \npopulation_size\n=100,\n 
\noffspring_size\n=None, \nmutation_rate\n=0.9,\n \ncrossover_rate\n=0.1,\n \nscoring\n='accuracy', \ncv\n=5,\n \nsubsample\n=1.0, \nn_jobs\n=1,\n \nmax_time_mins\n=None, \nmax_eval_time_mins\n=5,\n \nrandom_state\n=None, \nconfig_dict\n=None,\n \ntemplate\n=\"RandomTree\",\n \nwarm_start\n=False,\n \nmemory\n=None,\n \nuse_dask\n=False,\n \nperiodic_checkpoint_folder\n=None,\n \nearly_stop\n=None,\n \nverbosity\n=0,\n \ndisable_update_check\n=False\n)\n\n\n\nsource\n\n\n\nAutomated machine learning for supervised classification tasks.\n\n\nThe TPOTClassifier performs an intelligent search over machine learning pipelines that can contain supervised classification models,\npreprocessors, feature selection techniques, and any other estimator or transformer that follows the \nscikit-learn API\n.\nThe TPOTClassifier will also search over the hyperparameters of all objects in the pipeline.\n\n\nBy default, TPOTClassifier will search over a broad range of supervised classification algorithms, transformers, and their parameters.\nHowever, the algorithms, transformers, and hyperparameters that the TPOTClassifier searches over can be fully customized using the \nconfig_dict\n parameter.\n\n\nRead more in the \nUser Guide\n.\n\n\n\n\n\n\nParameters:\n\n\n\n\ngenerations\n: int, optional (default=100)\n\n\nNumber of iterations to the run pipeline optimization process. Must be a positive number.\n\n\nGenerally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline.\n\n\nTPOT will evaluate \npopulation_size\n + \ngenerations\n \u00d7 \noffspring_size\n pipelines in total.\n\n\n\n\npopulation_size\n: int, optional (default=100)\n\n\nNumber of individuals to retain in the genetic programming population every generation. Must be a positive number.\n\n\nGenerally, TPOT will work better when you give it more individuals with which to optimize the pipeline.\n\n\n\n\noffspring_size\n: int, optional (default=None)\n\n\nNumber of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the number of population size.\n\n\n\n\nmutation_rate\n: float, optional (default=0.9)\n\n\nMutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation.\n\n\n\nmutation_rate\n + \ncrossover_rate\n cannot exceed 1.0.\n\n\nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms.\n\n\n\n\ncrossover_rate\n: float, optional (default=0.1)\n\n\nCrossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation.\n\n\n\nmutation_rate\n + \ncrossover_rate\n cannot exceed 1.0.\n\n\nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms.\n\n\n\n\nscoring\n: string or callable, optional (default='accuracy')\n\n\nFunction used to evaluate the quality of a given pipeline for the classification problem. 
The following built-in scoring functions can be used:\n\n\n'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss','precision',\n'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc'\n\n\nIf you would like to use a custom scorer, you can pass the callable object/function with signature \nscorer(estimator, X, y)\n.\n\n\nIf you would like to use a metric function, you can pass the callable function to this parameter with the signature \nscore_func(y_true, y_pred)\n. TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11.\n\n\nSee the section on \nscoring functions\n for more details.\n\n\n\n\n\ncv\n: int, cross-validation generator, or an iterable, optional (default=5)\n\n\nCross-validation strategy used when evaluating pipelines.\n\n\nPossible inputs:\n\n\n\ninteger, to specify the number of folds in a StratifiedKFold,\n\n\nAn object to be used as a cross-validation generator, or\n\n\nAn iterable yielding train/test splits.\n\n\n\n\n\nsubsample\n: float, optional (default=1.0)\n\n\nFraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0].\n\n\nSetting \nsubsample\n=0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process.\n\n\n\n\nn_jobs\n: integer, optional (default=1)\n\n\nNumber of processes to use in parallel for evaluating pipelines during the TPOT optimization process.\n\n\nSetting \nn_jobs\n=-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets.\n\n\n\n\nmax_time_mins\n: integer or None, optional (default=None)\n\n\nHow many minutes TPOT has to optimize the pipeline.\n\n\nIf not None, this setting will override the \ngenerations\n parameter and allow TPOT to run until \nmax_time_mins\n minutes elapse.\n\n\n\n\nmax_eval_time_mins\n: float, optional (default=5)\n\n\nHow many minutes TPOT has to evaluate a single pipeline.\n\n\nSetting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. 
Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines.\n\n\n\n\nrandom_state\n: integer or None, optional (default=None)\n\n\nThe seed of the pseudo random number generator used in TPOT.\n\n\nUse this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed.\n\n\n\n\nconfig_dict\n: Python dictionary, string, or None, optional (default=None)\n\n\nA configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process.\n\n\nPossible inputs are:\n\n\n\nPython dictionary, TPOT will use your custom configuration,\n\n\nstring 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or\n\n\nstring 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or\n\n\nstring 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or\n\n\nNone, TPOT will use the default TPOTClassifier configuration.\n\n\n\nSee the \nbuilt-in configurations\n section for the list of configurations included with TPOT, and the \ncustom configuration\n section for more information and examples of how to create your own TPOT configurations.\n\n\n\n\ntemplate\n: string (default=\"RandomTree\")\n\n\nTemplate of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT.\n\n\nSo far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) in scikit-learn) to that step. Steps in the template are splited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is \"RandomTree\", TPOT generates tree-based pipeline randomly.\n\nSee the \n template option in tpot\n section for more details.\n\n\n\n\nwarm_start\n: boolean, optional (default=False)\n\n\nFlag indicating whether the TPOT instance will reuse the population from previous calls to \nfit()\n.\n\n\nSetting \nwarm_start\n=True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off.\n\n\n\n\nmemory\n: a sklearn.external.joblib.Memory object or string, optional (default=None)\n\n\nIf supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. 
More details about memory caching in \nscikit-learn documentation\n\n\n\nPossible inputs are:\n\n\n\nString 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or\n\n\nPath of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or\n\n\nMemory object, TPOT uses the instance of sklearn.external.joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or\n\n\nNone, TPOT does not use memory caching.\n\n\n\n\n\n\n\nuse_dask\n: boolean, optional (default: False)\n\n\nWhether to use Dask-ML's pipeline optimiziations. This avoid re-fitting\nthe same estimator on the same split of data multiple times. It\nwill also provide more detailed diagnostics when using Dask's\ndistributed scheduler.\n\n\nSee \navoid repeated work\n for more details.\n\n\n\n\nperiodic_checkpoint_folder\n: path string, optional (default: None)\n\n\nIf supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing.\n\nCurrently once per generation but not more often than once per 30 seconds.\n\nUseful in multiple cases:\n\n\n\nSudden death before TPOT could save optimized pipeline\n\n\nTrack its progress\n\n\nGrab pipelines while it's still optimizing\n\n\n\n\n\n\n\nearly_stop\n: integer, optional (default: None)\n\n\nHow many generations TPOT checks whether there is no improvement in optimization process.\n\n\nEnds the optimization process if there is no improvement in the given number of generations.\n\n\n\n\nverbosity\n: integer, optional (default=0)\n\n\nHow much information TPOT communicates while it's running.\n\n\nPossible inputs are:\n\n\n\n0, TPOT will print nothing,\n\n\n1, TPOT will print minimal information,\n\n\n2, TPOT will print more information and provide a progress bar, or\n\n\n3, TPOT will print everything and provide a progress bar.\n\n\n\n\n\n\n\ndisable_update_check\n: boolean, optional (default=False)\n\n\nFlag indicating whether the TPOT version checker should be disabled.\n\n\nThe update checker will tell you when a new version of TPOT has been released.\n\n\n\n\n\n\n\n\n\n\nAttributes:\n\n\n\n\nfitted_pipeline_\n: scikit-learn Pipeline object\n\n\nThe best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset.\n\n\n\n\npareto_front_fitted_pipelines_\n: Python dictionary\n\n\nDictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset.\n\n\nThe TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline.\n\n\nNote: \npareto_front_fitted_pipelines_\n is only available when \nverbosity\n=3.\n\n\n\n\nevaluated_individuals_\n: Python dictionary\n\n\nDictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline).\n\n\nThis attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated.\n\n\n\n\n\n\n\n\n\n\nExample\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = 
load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTClassifier(generations=5, population_size=50, verbosity=2)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\nFunctions\n\n\n\n\n\n\nfit\n(features, classes[, sample_weight, groups])\n\n\nRun the TPOT optimization process on the given training data.\n\n\n\n\n\n\n\npredict\n(features)\n\n\nUse the optimized pipeline to predict the classes for a feature set.\n\n\n\n\n\n\n\npredict_proba\n(features)\n\n\nUse the optimized pipeline to estimate the class probabilities for a feature set.\n\n\n\n\n\n\n\nscore\n(testing_features, testing_classes)\n\n\nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function.\n\n\n\n\n\n\n\nexport\n(output_file_name)\n\n\nExport the optimized pipeline as Python code.\n\n\n\n\n\n\n\n\n\nfit(features, classes, sample_weight=None, groups=None)\n\n\n\n\n\nRun the TPOT optimization process on the given training data.\n\n\nUses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validaton to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\nTPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values.\nAs such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed)\nusing \nmedian value imputation\n.\n\n\nIf you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT.\n\n\n\n\nclasses\n: array-like {n_samples}\n\n\nList of class labels for prediction\n\n\n\n\nsample_weight\n: array-like {n_samples}, optional\n\n\nPer-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines.\n\n\n\n\ngroups\n: array-like, with shape {n_samples, }, optional\n\n\nGroup labels for the samples used when performing cross-validation.\n\n\nThis parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as \nsklearn.model_selection.GroupKFold\n.\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\nself\n: object\n\n\nReturns a copy of the fitted TPOT object\n\n\n\n\n\n\n\n\n\n\n\n\n\n\npredict(features)\n\n\n\n\n\nUse the optimized pipeline to predict the classes for a feature set.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\npredictions\n: array-like {n_samples}\n\n\nPredicted classes for the samples in the feature matrix\n\n\n\n\n\n\n\n\n\n\n\n\n\n\npredict_proba(features)\n\n\n\n\n\nUse the optimized pipeline to estimate the class probabilities for a feature set.\n\n\nNote: This function will only work for pipelines whose final classifier supports the \npredict_proba\n function. 
TPOT will raise an error otherwise.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\npredictions\n: array-like {n_samples, n_classes}\n\n\nThe class probabilities of the input samples\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nscore(testing_features, testing_classes)\n\n\n\n\n\nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function.\n\n\nThe default scoring function for TPOTClassifier is 'accuracy'.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\ntesting_features\n: array-like {n_samples, n_features}\n\n\nFeature matrix of the testing set\n\n\n\n\ntesting_classes\n: array-like {n_samples}\n\n\nList of class labels for prediction in the testing set\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\naccuracy_score\n: float\n\n\nThe estimated test set accuracy according to the user-specified scoring function.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nexport(output_file_name)\n\n\n\n\n\nExport the optimized pipeline as Python code.\n\n\nSee the \nusage documentation\n for example usage of the export function.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\noutput_file_name\n: string\n\n\nString containing the path and file name of the desired output file\n\n\n\n\n\n\n\nReturns:\n\n\n\nDoes not return anything\n\n\n\n\n\n\n\n\n\n\nRegression\n\n\nclass\n tpot.\nTPOTRegressor\n(\ngenerations\n=100, \npopulation_size\n=100,\n \noffspring_size\n=None, \nmutation_rate\n=0.9,\n \ncrossover_rate\n=0.1,\n \nscoring\n='neg_mean_squared_error', \ncv\n=5,\n \nsubsample\n=1.0, \nn_jobs\n=1,\n \nmax_time_mins\n=None, \nmax_eval_time_mins\n=5,\n \nrandom_state\n=None, \nconfig_dict\n=None,\n \ntemplate\n=\"RandomTree\",\n \nwarm_start\n=False,\n \nmemory\n=None,\n \nuse_dask\n=False,\n \nperiodic_checkpoint_folder\n=None,\n \nearly_stop\n=None,\n \nverbosity\n=0,\n \ndisable_update_check\n=False\n)\n\n\n\nsource\n\n\n\nAutomated machine learning for supervised regression tasks.\n\n\nThe TPOTRegressor performs an intelligent search over machine learning pipelines that can contain supervised regression models,\npreprocessors, feature selection techniques, and any other estimator or transformer that follows the \nscikit-learn API\n.\nThe TPOTRegressor will also search over the hyperparameters of all objects in the pipeline.\n\n\nBy default, TPOTRegressor will search over a broad range of supervised regression models, transformers, and their hyperparameters.\nHowever, the models, transformers, and parameters that the TPOTRegressor searches over can be fully customized using the \nconfig_dict\n parameter.\n\n\nRead more in the \nUser Guide\n.\n\n\n\n\n\n\nParameters:\n\n\n\n\ngenerations\n: int, optional (default=100)\n\n\nNumber of iterations to the run pipeline optimization process. Must be a positive number.\n\n\nGenerally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline.\n\n\nTPOT will evaluate \npopulation_size\n + \ngenerations\n \u00d7 \noffspring_size\n pipelines in total.\n\n\n\n\npopulation_size\n: int, optional (default=100)\n\n\nNumber of individuals to retain in the genetic programming population every generation. Must be a positive number.\n\n\nGenerally, TPOT will work better when you give it more individuals with which to optimize the pipeline.\n\n\n\n\noffspring_size\n: int, optional (default=None)\n\n\nNumber of offspring to produce in each genetic programming generation. Must be a positive number. 
By default, the number of offspring is equal to the number of population size.\n\n\n\n\nmutation_rate\n: float, optional (default=0.9)\n\n\nMutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation.\n\n\n\nmutation_rate\n + \ncrossover_rate\n cannot exceed 1.0.\n\n\nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms.\n\n\n\n\ncrossover_rate\n: float, optional (default=0.1)\n\n\nCrossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation.\n\n\n\nmutation_rate\n + \ncrossover_rate\n cannot exceed 1.0.\n\n\nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms.\n\n\n\n\nscoring\n: string or callable, optional (default='neg_mean_squared_error')\n\n\nFunction used to evaluate the quality of a given pipeline for the regression problem. The following built-in scoring functions can be used:\n\n\n'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2'\n\n\nNote that we recommend using the \nneg\n version of mean squared error and related metrics so TPOT will minimize (instead of maximize) the metric.\n\n\nIf you would like to use a custom scorer, you can pass the callable object/function with signature \nscorer(estimator, X, y)\n.\n\n\nIf you would like to use a metric function, you can pass the callable function to this parameter with the signature \nscore_func(y_true, y_pred)\n. TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11.\n\n\nSee the section on \nscoring functions\n for more details.\n\n\n\n\ncv\n: int, cross-validation generator, or an iterable, optional (default=5)\n\n\nCross-validation strategy used when evaluating pipelines.\n\n\nPossible inputs:\n\n\n\ninteger, to specify the number of folds in a KFold,\n\n\nAn object to be used as a cross-validation generator, or\n\n\nAn iterable yielding train/test splits.\n\n\n\n\n\n\n\nsubsample\n: float, optional (default=1.0)\n\n\nFraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0].\n\n\nSetting \nsubsample\n=0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process.\n\n\n\n\nn_jobs\n: integer, optional (default=1)\n\n\nNumber of processes to use in parallel for evaluating pipelines during the TPOT optimization process.\n\n\nSetting \nn_jobs\n=-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. 
Beware that using multiple processes on the same machine may cause memory issues for large datasets\n\n\n\n\nmax_time_mins\n: integer or None, optional (default=None)\n\n\nHow many minutes TPOT has to optimize the pipeline.\n\n\nIf not None, this setting will override the \ngenerations\n parameter and allow TPOT to run until \nmax_time_mins\n minutes elapse.\n\n\n\n\nmax_eval_time_mins\n: float, optional (default=5)\n\n\nHow many minutes TPOT has to evaluate a single pipeline.\n\n\nSetting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines.\n\n\n\n\nrandom_state\n: integer or None, optional (default=None)\n\n\nThe seed of the pseudo random number generator used in TPOT.\n\n\nUse this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed.\n\n\n\n\nconfig_dict\n: Python dictionary, string, or None, optional (default=None)\n\n\nA configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process.\n\n\nPossible inputs are:\n\n\n\nPython dictionary, TPOT will use your custom configuration,\n\n\nstring 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or\n\n\nstring 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or\n\n\nstring 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or\n\n\nNone, TPOT will use the default TPOTRegressor configuration.\n\n\n\nSee the \nbuilt-in configurations\n section for the list of configurations included with TPOT, and the \ncustom configuration\n section for more information and examples of how to create your own TPOT configurations.\n\n\n\n\ntemplate\n: string (default=\"RandomTree\")\n\n\nTemplate of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT.\n\n\nSo far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are splited by \"-\", e.g. \"SelectPercentile-Transformer-Regressor\". 
By default value of template is \"RandomTree\", TPOT generates tree-based pipeline randomly.\n\nSee the \n template option in tpot\n section for more details.\n\n\n\n\nwarm_start\n: boolean, optional (default=False)\n\n\nFlag indicating whether the TPOT instance will reuse the population from previous calls to \nfit()\n.\n\n\nSetting \nwarm_start\n=True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off.\n\n\n\n\nmemory\n: a sklearn.external.joblib.Memory object or string, optional (default=None)\n\n\nIf supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in \nscikit-learn documentation\n\n\n\nPossible inputs are:\n\n\n\nString 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or\n\n\nPath of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or\n\n\nMemory object, TPOT uses the instance of sklearn.external.joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or\n\n\nNone, TPOT does not use memory caching.\n\n\n\n\n\n\n\nuse_dask\n: boolean, optional (default: False)\n\n\nWhether to use Dask-ML's pipeline optimiziations. This avoid re-fitting\nthe same estimator on the same split of data multiple times. It\nwill also provide more detailed diagnostics when using Dask's\ndistributed scheduler.\n\n\nSee \navoid repeated work\n for more details.\n\n\n\n\nperiodic_checkpoint_folder\n: path string, optional (default: None)\n\n\nIf supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing.\n\nCurrently once per generation but not more often than once per 30 seconds.\n\nUseful in multiple cases:\n\n\n\nSudden death before TPOT could save optimized pipeline\n\n\nTrack its progress\n\n\nGrab pipelines while it's still optimizing\n\n\n\n\n\n\n\nearly_stop\n: integer, optional (default: None)\n\n\nHow many generations TPOT checks whether there is no improvement in optimization process.\n\n\nEnds the optimization process if there is no improvement in the given number of generations.\n\n\n\n\nverbosity\n: integer, optional (default=0)\n\n\nHow much information TPOT communicates while it's running.\n\n\nPossible inputs are:\n\n\n\n0, TPOT will print nothing,\n\n\n1, TPOT will print minimal information,\n\n\n2, TPOT will print more information and provide a progress bar, or\n\n\n3, TPOT will print everything and provide a progress bar.\n\n\n\n\n\n\n\ndisable_update_check\n: boolean, optional (default=False)\n\n\nFlag indicating whether the TPOT version checker should be disabled.\n\n\nThe update checker will tell you when a new version of TPOT has been released.\n\n\n\n\n\n\n\n\n\n\nAttributes:\n\n\n\n\nfitted_pipeline_\n: scikit-learn Pipeline object\n\n\nThe best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset.\n\n\n\n\npareto_front_fitted_pipelines_\n: Python dictionary\n\n\nDictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset.\n\n\nThe TPOT Pareto front provides a 
trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline.\n\n\nNote: \n_pareto_front_fitted_pipelines\n is only available when \nverbosity\n=3.\n\n\n\n\nevaluated_individuals_\n: Python dictionary\n\n\nDictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline).\n\n\nThis attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated.\n\n\n\n\n\n\n\n\n\n\nExample\n\n\nfrom tpot import TPOTRegressor\nfrom sklearn.datasets import load_boston\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_boston()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_boston_pipeline.py')\n\n\n\n\nFunctions\n\n\n\n\n\n\nfit\n(features, target[, sample_weight, groups])\n\n\nRun the TPOT optimization process on the given training data.\n\n\n\n\n\n\n\npredict\n(features)\n\n\nUse the optimized pipeline to predict the target values for a feature set.\n\n\n\n\n\n\n\nscore\n(testing_features, testing_target)\n\n\nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function.\n\n\n\n\n\n\n\nexport\n(output_file_name)\n\n\nExport the optimized pipeline as Python code.\n\n\n\n\n\n\n\n\n\nfit(features, target, sample_weight=None, groups=None)\n\n\n\n\n\nRun the TPOT optimization process on the given training data.\n\n\nUses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validaton to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\nTPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values.\nAs such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed)\nusing \nmedian value imputation\n.\n\n\nIf you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT.\n\n\n\n\ntarget\n: array-like {n_samples}\n\n\nList of target labels for prediction\n\n\n\n\nsample_weight\n: array-like {n_samples}, optional\n\n\nPer-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. 
By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines.\n\n\n\n\ngroups\n: array-like, with shape {n_samples, }, optional\n\n\nGroup labels for the samples used when performing cross-validation.\n\n\nThis parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as \nsklearn.model_selection.GroupKFold\n.\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\nself\n: object\n\n\nReturns a copy of the fitted TPOT object\n\n\n\n\n\n\n\n\n\n\n\n\n\n\npredict(features)\n\n\n\n\n\nUse the optimized pipeline to predict the target values for a feature set.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\npredictions\n: array-like {n_samples}\n\n\nPredicted target values for the samples in the feature matrix\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nscore(testing_features, testing_target)\n\n\n\n\n\nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function.\n\n\nThe default scoring function for TPOTRegressor is 'neg_mean_squared_error'.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\ntesting_features\n: array-like {n_samples, n_features}\n\n\nFeature matrix of the testing set\n\n\n\n\ntesting_target\n: array-like {n_samples}\n\n\nList of target labels for prediction in the testing set\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\naccuracy_score\n: float\n\n\nThe estimated test set accuracy according to the user-specified scoring function.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nexport(output_file_name)\n\n\n\n\n\nExport the optimized pipeline as Python code.\n\n\nSee the \nusage documentation\n for example usage of the export function.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\noutput_file_name\n: string\n\n\nString containing the path and file name of the desired output file\n\n\n\n\n\n\n\nReturns:\n\n\n\nDoes not return anything", "title": "TPOT API" }, { "location": "/api/#classification", - "text": "class tpot. TPOTClassifier ( generations =100, population_size =100,\n offspring_size =None, mutation_rate =0.9,\n crossover_rate =0.1,\n scoring ='accuracy', cv =5,\n subsample =1.0, n_jobs =1,\n max_time_mins =None, max_eval_time_mins =5,\n random_state =None, config_dict =None,\n warm_start =False,\n memory =None,\n use_dask =False,\n periodic_checkpoint_folder =None,\n early_stop =None,\n verbosity =0,\n disable_update_check =False ) source Automated machine learning for supervised classification tasks. The TPOTClassifier performs an intelligent search over machine learning pipelines that can contain supervised classification models,\npreprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API .\nThe TPOTClassifier will also search over the hyperparameters of all objects in the pipeline. By default, TPOTClassifier will search over a broad range of supervised classification algorithms, transformers, and their parameters.\nHowever, the algorithms, transformers, and hyperparameters that the TPOTClassifier searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int, optional (default=100) \nNumber of iterations to run the pipeline optimization process. Must be a positive number. \nGenerally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. \nTPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total.
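To make that evaluation budget concrete, here is a minimal sketch of the arithmetic implied by generations, population_size, and offspring_size (assuming, as stated above, that offspring_size falls back to population_size when left unset):

def estimated_evaluations(generations=100, population_size=100, offspring_size=None):
    # Sketch only: TPOT evaluates population_size + generations * offspring_size pipelines,
    # and offspring_size defaults to population_size when it is not specified.
    if offspring_size is None:
        offspring_size = population_size
    return population_size + generations * offspring_size

print(estimated_evaluations(generations=5, population_size=50))  # 50 + 5 * 50 = 300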
population_size : int, optional (default=100) \nNumber of individuals to retain in the genetic programming population every generation. Must be a positive number. \nGenerally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) \nNumber of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the number of population size. mutation_rate : float, optional (default=0.9) \nMutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. \nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) \nCrossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. \nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='accuracy') \nFunction used to evaluate the quality of a given pipeline for the classification problem. The following built-in scoring functions can be used: \n'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss','precision',\n'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc' \nIf you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . \nIf you would like to use a metric function, you can pass the callable function to this parameter with the signature score_func(y_true, y_pred) . TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. \nSee the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) \nCross-validation strategy used when evaluating pipelines. \nPossible inputs: integer, to specify the number of folds in a StratifiedKFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) \nFraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. \nSetting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) \nNumber of processes to use in parallel for evaluating pipelines during the TPOT optimization process. \nSetting n_jobs =-1 will use as many cores as available on the computer. Beware that using multiple processes on the same machine may cause memory issues for large datasets max_time_mins : integer or None, optional (default=None) \nHow many minutes TPOT has to optimize the pipeline. 
\nIf not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse. max_eval_time_mins : float, optional (default=5) \nHow many minutes TPOT has to evaluate a single pipeline. \nSetting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) \nThe seed of the pseudo random number generator used in TPOT. \nUse this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) \nA configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. \nPossible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTClassifier configuration. \nSee the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. warm_start : boolean, optional (default=False) \nFlag indicating whether the TPOT instance will reuse the population from previous calls to fit() . \nSetting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a sklearn.external.joblib.Memory object or string, optional (default=None) \nIf supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in scikit-learn documentation \nPossible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of sklearn.external.joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) \nWhether to use Dask-ML's pipeline optimiziations. This avoid re-fitting\nthe same estimator on the same split of data multiple times. It\nwill also provide more detailed diagnostics when using Dask's\ndistributed scheduler. \nSee avoid repeated work for more details. periodic_checkpoint_folder : path string, optional (default: None) \nIf supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing. \nCurrently once per generation but not more often than once per 30 seconds. 
\nUseful in multiple cases: Sudden death before TPOT could save optimized pipeline Track its progress Grab pipelines while it's still optimizing early_stop : integer, optional (default: None) \nHow many generations TPOT checks whether there is no improvement in optimization process. \nEnds the optimization process if there is no improvement in the given number of generations. verbosity : integer, optional (default=0) \nHow much information TPOT communicates while it's running. \nPossible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) \nFlag indicating whether the TPOT version checker should be disabled. \nThe update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object \nThe best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. pareto_front_fitted_pipelines_ : Python dictionary \nDictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. \nThe TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. \nNote: pareto_front_fitted_pipelines_ is only available when verbosity =3. evaluated_individuals_ : Python dictionary \nDictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). \nThis attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. Example from tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTClassifier(generations=5, population_size=50, verbosity=2)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py') Functions fit (features, classes[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the classes for a feature set. predict_proba (features) Use the optimized pipeline to estimate the class probabilities for a feature set. score (testing_features, testing_classes) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, classes, sample_weight=None, groups=None) \nRun the TPOT optimization process on the given training data. \nUses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validaton to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. 
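Before the parameter list, a hedged sketch of a fit() call that combines the cv and groups options described below; the data and group labels here are invented purely for illustration:

import numpy as np
from sklearn.model_selection import GroupKFold
from tpot import TPOTClassifier

# Invented example data: 100 samples from 20 subjects, 5 samples per subject.
X = np.random.rand(100, 5)
y = np.random.randint(0, 2, size=100)
subject_ids = np.repeat(np.arange(20), 5)

# A group-aware splitter keeps all samples from one subject in the same CV fold.
tpot = TPOTClassifier(generations=5, population_size=20,
                      cv=GroupKFold(n_splits=5), verbosity=2)
tpot.fit(X, y, groups=subject_ids)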
Parameters: features : array-like {n_samples, n_features} \nFeature matrix \nTPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values.\nAs such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed)\nusing median value imputation . \nIf you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. classes : array-like {n_samples} \nList of class labels for prediction sample_weight : array-like {n_samples}, optional \nPer-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional \nGroup labels for the samples used when performing cross-validation. \nThis parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . Returns: self : object \nReturns a copy of the fitted TPOT object predict(features) \nUse the optimized pipeline to predict the classes for a feature set. Parameters: features : array-like {n_samples, n_features} \nFeature matrix Returns: predictions : array-like {n_samples} \nPredicted classes for the samples in the feature matrix predict_proba(features) \nUse the optimized pipeline to estimate the class probabilities for a feature set. \nNote: This function will only work for pipelines whose final classifier supports the predict_proba function. TPOT will raise an error otherwise. Parameters: features : array-like {n_samples, n_features} \nFeature matrix Returns: predictions : array-like {n_samples, n_classes} \nThe class probabilities of the input samples score(testing_features, testing_classes) \nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function. \nThe default scoring function for TPOTClassifier is 'accuracy'. Parameters: testing_features : array-like {n_samples, n_features} \nFeature matrix of the testing set testing_classes : array-like {n_samples} \nList of class labels for prediction in the testing set Returns: accuracy_score : float \nThe estimated test set accuracy according to the user-specified scoring function. export(output_file_name) \nExport the optimized pipeline as Python code. \nSee the usage documentation for example usage of the export function. Parameters: output_file_name : string \nString containing the path and file name of the desired output file Returns: \nDoes not return anything", + "text": "class tpot. TPOTClassifier ( generations =100, population_size =100,\n offspring_size =None, mutation_rate =0.9,\n crossover_rate =0.1,\n scoring ='accuracy', cv =5,\n subsample =1.0, n_jobs =1,\n max_time_mins =None, max_eval_time_mins =5,\n random_state =None, config_dict =None,\n template =\"RandomTree\",\n warm_start =False,\n memory =None,\n use_dask =False,\n periodic_checkpoint_folder =None,\n early_stop =None,\n verbosity =0,\n disable_update_check =False ) source Automated machine learning for supervised classification tasks. 
The TPOTClassifier performs an intelligent search over machine learning pipelines that can contain supervised classification models,\npreprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API .\nThe TPOTClassifier will also search over the hyperparameters of all objects in the pipeline. By default, TPOTClassifier will search over a broad range of supervised classification algorithms, transformers, and their parameters.\nHowever, the algorithms, transformers, and hyperparameters that the TPOTClassifier searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int, optional (default=100) \nNumber of iterations to the run pipeline optimization process. Must be a positive number. \nGenerally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. \nTPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. population_size : int, optional (default=100) \nNumber of individuals to retain in the genetic programming population every generation. Must be a positive number. \nGenerally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) \nNumber of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the number of population size. mutation_rate : float, optional (default=0.9) \nMutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. \nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) \nCrossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. \nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='accuracy') \nFunction used to evaluate the quality of a given pipeline for the classification problem. The following built-in scoring functions can be used: \n'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss','precision',\n'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc' \nIf you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . \nIf you would like to use a metric function, you can pass the callable function to this parameter with the signature score_func(y_true, y_pred) . TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. \nSee the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) \nCross-validation strategy used when evaluating pipelines. 
\nPossible inputs: integer, to specify the number of folds in a StratifiedKFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) \nFraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. \nSetting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) \nNumber of processes to use in parallel for evaluating pipelines during the TPOT optimization process. \nSetting n_jobs =-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets. max_time_mins : integer or None, optional (default=None) \nHow many minutes TPOT has to optimize the pipeline. \nIf not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse. max_eval_time_mins : float, optional (default=5) \nHow many minutes TPOT has to evaluate a single pipeline. \nSetting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) \nThe seed of the pseudo random number generator used in TPOT. \nUse this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) \nA configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. \nPossible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTClassifier configuration. \nSee the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. template : string (default=\"RandomTree\") \nTemplate of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. \nSo far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. 
If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) in scikit-learn) to that step. Steps in the template are splited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is \"RandomTree\", TPOT generates tree-based pipeline randomly.\n\nSee the template option in tpot section for more details. warm_start : boolean, optional (default=False) \nFlag indicating whether the TPOT instance will reuse the population from previous calls to fit() . \nSetting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a sklearn.external.joblib.Memory object or string, optional (default=None) \nIf supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in scikit-learn documentation \nPossible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of sklearn.external.joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) \nWhether to use Dask-ML's pipeline optimiziations. This avoid re-fitting\nthe same estimator on the same split of data multiple times. It\nwill also provide more detailed diagnostics when using Dask's\ndistributed scheduler. \nSee avoid repeated work for more details. periodic_checkpoint_folder : path string, optional (default: None) \nIf supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing. \nCurrently once per generation but not more often than once per 30 seconds. \nUseful in multiple cases: Sudden death before TPOT could save optimized pipeline Track its progress Grab pipelines while it's still optimizing early_stop : integer, optional (default: None) \nHow many generations TPOT checks whether there is no improvement in optimization process. \nEnds the optimization process if there is no improvement in the given number of generations. verbosity : integer, optional (default=0) \nHow much information TPOT communicates while it's running. \nPossible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) \nFlag indicating whether the TPOT version checker should be disabled. \nThe update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object \nThe best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. 
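As a quick sketch of how this attribute can be used after a run (assuming a fitted TPOT instance named tpot and a held-out feature matrix X_test):

# fitted_pipeline_ is an ordinary scikit-learn Pipeline fitted on the full training set.
best_pipeline = tpot.fitted_pipeline_
print(best_pipeline.steps)                    # inspect the (name, estimator) steps it contains
predictions = best_pipeline.predict(X_test)   # use it directly, outside of TPOT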
pareto_front_fitted_pipelines_ : Python dictionary \nDictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. \nThe TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. \nNote: pareto_front_fitted_pipelines_ is only available when verbosity =3. evaluated_individuals_ : Python dictionary \nDictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). \nThis attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. Example from tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTClassifier(generations=5, population_size=50, verbosity=2)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py') Functions fit (features, classes[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the classes for a feature set. predict_proba (features) Use the optimized pipeline to estimate the class probabilities for a feature set. score (testing_features, testing_classes) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, classes, sample_weight=None, groups=None) \nRun the TPOT optimization process on the given training data. \nUses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validaton to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. Parameters: features : array-like {n_samples, n_features} \nFeature matrix \nTPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values.\nAs such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed)\nusing median value imputation . \nIf you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. classes : array-like {n_samples} \nList of class labels for prediction sample_weight : array-like {n_samples}, optional \nPer-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional \nGroup labels for the samples used when performing cross-validation. 
\nThis parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . Returns: self : object \nReturns a copy of the fitted TPOT object predict(features) \nUse the optimized pipeline to predict the classes for a feature set. Parameters: features : array-like {n_samples, n_features} \nFeature matrix Returns: predictions : array-like {n_samples} \nPredicted classes for the samples in the feature matrix predict_proba(features) \nUse the optimized pipeline to estimate the class probabilities for a feature set. \nNote: This function will only work for pipelines whose final classifier supports the predict_proba function. TPOT will raise an error otherwise. Parameters: features : array-like {n_samples, n_features} \nFeature matrix Returns: predictions : array-like {n_samples, n_classes} \nThe class probabilities of the input samples score(testing_features, testing_classes) \nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function. \nThe default scoring function for TPOTClassifier is 'accuracy'. Parameters: testing_features : array-like {n_samples, n_features} \nFeature matrix of the testing set testing_classes : array-like {n_samples} \nList of class labels for prediction in the testing set Returns: accuracy_score : float \nThe estimated test set accuracy according to the user-specified scoring function. export(output_file_name) \nExport the optimized pipeline as Python code. \nSee the usage documentation for example usage of the export function. Parameters: output_file_name : string \nString containing the path and file name of the desired output file Returns: \nDoes not return anything", "title": "Classification" }, { "location": "/api/#regression", - "text": "class tpot. TPOTRegressor ( generations =100, population_size =100,\n offspring_size =None, mutation_rate =0.9,\n crossover_rate =0.1,\n scoring ='neg_mean_squared_error', cv =5,\n subsample =1.0, n_jobs =1,\n max_time_mins =None, max_eval_time_mins =5,\n random_state =None, config_dict =None,\n warm_start =False,\n memory =None,\n use_dask =False,\n periodic_checkpoint_folder =None,\n early_stop =None,\n verbosity =0,\n disable_update_check =False ) source Automated machine learning for supervised regression tasks. The TPOTRegressor performs an intelligent search over machine learning pipelines that can contain supervised regression models,\npreprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API .\nThe TPOTRegressor will also search over the hyperparameters of all objects in the pipeline. By default, TPOTRegressor will search over a broad range of supervised regression models, transformers, and their hyperparameters.\nHowever, the models, transformers, and parameters that the TPOTRegressor searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int, optional (default=100) \nNumber of iterations to the run pipeline optimization process. Must be a positive number. \nGenerally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. \nTPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. population_size : int, optional (default=100) \nNumber of individuals to retain in the genetic programming population every generation. Must be a positive number. 
\nGenerally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) \nNumber of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the number of population size. mutation_rate : float, optional (default=0.9) \nMutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. \nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) \nCrossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. \nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='neg_mean_squared_error') \nFunction used to evaluate the quality of a given pipeline for the regression problem. The following built-in scoring functions can be used: \n'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2' \nNote that we recommend using the neg version of mean squared error and related metrics so TPOT will minimize (instead of maximize) the metric. \nIf you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . \nIf you would like to use a metric function, you can pass the callable function to this parameter with the signature score_func(y_true, y_pred) . TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. \nSee the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) \nCross-validation strategy used when evaluating pipelines. \nPossible inputs: integer, to specify the number of folds in a KFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) \nFraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. \nSetting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) \nNumber of processes to use in parallel for evaluating pipelines during the TPOT optimization process. \nSetting n_jobs =-1 will use as many cores as available on the computer. Beware that using multiple processes on the same machine may cause memory issues for large datasets max_time_mins : integer or None, optional (default=None) \nHow many minutes TPOT has to optimize the pipeline. \nIf not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse. max_eval_time_mins : float, optional (default=5) \nHow many minutes TPOT has to evaluate a single pipeline. 
\nSetting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) \nThe seed of the pseudo random number generator used in TPOT. \nUse this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) \nA configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. \nPossible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTRegressor configuration. \nSee the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. warm_start : boolean, optional (default=False) \nFlag indicating whether the TPOT instance will reuse the population from previous calls to fit() . \nSetting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a sklearn.external.joblib.Memory object or string, optional (default=None) \nIf supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in scikit-learn documentation \nPossible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of sklearn.external.joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) \nWhether to use Dask-ML's pipeline optimiziations. This avoid re-fitting\nthe same estimator on the same split of data multiple times. It\nwill also provide more detailed diagnostics when using Dask's\ndistributed scheduler. \nSee avoid repeated work for more details. periodic_checkpoint_folder : path string, optional (default: None) \nIf supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing. \nCurrently once per generation but not more often than once per 30 seconds. \nUseful in multiple cases: Sudden death before TPOT could save optimized pipeline Track its progress Grab pipelines while it's still optimizing early_stop : integer, optional (default: None) \nHow many generations TPOT checks whether there is no improvement in optimization process. \nEnds the optimization process if there is no improvement in the given number of generations. 
verbosity : integer, optional (default=0) \nHow much information TPOT communicates while it's running. \nPossible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) \nFlag indicating whether the TPOT version checker should be disabled. \nThe update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object \nThe best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. pareto_front_fitted_pipelines_ : Python dictionary \nDictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. \nThe TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. \nNote: _pareto_front_fitted_pipelines is only available when verbosity =3. evaluated_individuals_ : Python dictionary \nDictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). \nThis attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. Example from tpot import TPOTRegressor\nfrom sklearn.datasets import load_boston\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_boston()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_boston_pipeline.py') Functions fit (features, target[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the target values for a feature set. score (testing_features, testing_target) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, target, sample_weight=None, groups=None) \nRun the TPOT optimization process on the given training data. \nUses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validaton to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. Parameters: features : array-like {n_samples, n_features} \nFeature matrix \nTPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values.\nAs such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed)\nusing median value imputation . \nIf you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. 
target : array-like {n_samples} \nList of target labels for prediction sample_weight : array-like {n_samples}, optional \nPer-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional \nGroup labels for the samples used when performing cross-validation. \nThis parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . Returns: self : object \nReturns a copy of the fitted TPOT object predict(features) \nUse the optimized pipeline to predict the target values for a feature set. Parameters: features : array-like {n_samples, n_features} \nFeature matrix Returns: predictions : array-like {n_samples} \nPredicted target values for the samples in the feature matrix score(testing_features, testing_target) \nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function. \nThe default scoring function for TPOTClassifier is 'mean_squared_error'. Parameters: testing_features : array-like {n_samples, n_features} \nFeature matrix of the testing set testing_target : array-like {n_samples} \nList of target labels for prediction in the testing set Returns: accuracy_score : float \nThe estimated test set accuracy according to the user-specified scoring function. export(output_file_name) \nExport the optimized pipeline as Python code. \nSee the usage documentation for example usage of the export function. Parameters: output_file_name : string \nString containing the path and file name of the desired output file Returns: \nDoes not return anything", + "text": "class tpot. TPOTRegressor ( generations =100, population_size =100,\n offspring_size =None, mutation_rate =0.9,\n crossover_rate =0.1,\n scoring ='neg_mean_squared_error', cv =5,\n subsample =1.0, n_jobs =1,\n max_time_mins =None, max_eval_time_mins =5,\n random_state =None, config_dict =None,\n template =\"RandomTree\",\n warm_start =False,\n memory =None,\n use_dask =False,\n periodic_checkpoint_folder =None,\n early_stop =None,\n verbosity =0,\n disable_update_check =False ) source Automated machine learning for supervised regression tasks. The TPOTRegressor performs an intelligent search over machine learning pipelines that can contain supervised regression models,\npreprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API .\nThe TPOTRegressor will also search over the hyperparameters of all objects in the pipeline. By default, TPOTRegressor will search over a broad range of supervised regression models, transformers, and their hyperparameters.\nHowever, the models, transformers, and parameters that the TPOTRegressor searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int, optional (default=100) \nNumber of iterations to the run pipeline optimization process. Must be a positive number. \nGenerally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. \nTPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. 
population_size : int, optional (default=100) \nNumber of individuals to retain in the genetic programming population every generation. Must be a positive number. \nGenerally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) \nNumber of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the number of population size. mutation_rate : float, optional (default=0.9) \nMutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. \nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) \nCrossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. \nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='neg_mean_squared_error') \nFunction used to evaluate the quality of a given pipeline for the regression problem. The following built-in scoring functions can be used: \n'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2' \nNote that we recommend using the neg version of mean squared error and related metrics so TPOT will minimize (instead of maximize) the metric. \nIf you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . \nIf you would like to use a metric function, you can pass the callable function to this parameter with the signature score_func(y_true, y_pred) . TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. \nSee the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) \nCross-validation strategy used when evaluating pipelines. \nPossible inputs: integer, to specify the number of folds in a KFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) \nFraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. \nSetting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) \nNumber of processes to use in parallel for evaluating pipelines during the TPOT optimization process. \nSetting n_jobs =-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets max_time_mins : integer or None, optional (default=None) \nHow many minutes TPOT has to optimize the pipeline. 
\nIf not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse. max_eval_time_mins : float, optional (default=5) \nHow many minutes TPOT has to evaluate a single pipeline. \nSetting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) \nThe seed of the pseudo random number generator used in TPOT. \nUse this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) \nA configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. \nPossible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTRegressor configuration. \nSee the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. template : string (default=\"RandomTree\") \nTemplate of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. \nSo far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are splited by \"-\", e.g. \"SelectPercentile-Transformer-Regressor\". By default value of template is \"RandomTree\", TPOT generates tree-based pipeline randomly.\n\nSee the template option in tpot section for more details. warm_start : boolean, optional (default=False) \nFlag indicating whether the TPOT instance will reuse the population from previous calls to fit() . \nSetting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a sklearn.external.joblib.Memory object or string, optional (default=None) \nIf supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. 
More details about memory caching in scikit-learn documentation \nPossible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of sklearn.external.joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) \nWhether to use Dask-ML's pipeline optimiziations. This avoid re-fitting\nthe same estimator on the same split of data multiple times. It\nwill also provide more detailed diagnostics when using Dask's\ndistributed scheduler. \nSee avoid repeated work for more details. periodic_checkpoint_folder : path string, optional (default: None) \nIf supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing. \nCurrently once per generation but not more often than once per 30 seconds. \nUseful in multiple cases: Sudden death before TPOT could save optimized pipeline Track its progress Grab pipelines while it's still optimizing early_stop : integer, optional (default: None) \nHow many generations TPOT checks whether there is no improvement in optimization process. \nEnds the optimization process if there is no improvement in the given number of generations. verbosity : integer, optional (default=0) \nHow much information TPOT communicates while it's running. \nPossible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) \nFlag indicating whether the TPOT version checker should be disabled. \nThe update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object \nThe best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. pareto_front_fitted_pipelines_ : Python dictionary \nDictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. \nThe TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. \nNote: _pareto_front_fitted_pipelines is only available when verbosity =3. evaluated_individuals_ : Python dictionary \nDictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). \nThis attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. 
Example from tpot import TPOTRegressor\nfrom sklearn.datasets import load_boston\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_boston()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_boston_pipeline.py') Functions fit (features, target[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the target values for a feature set. score (testing_features, testing_target) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, target, sample_weight=None, groups=None) \nRun the TPOT optimization process on the given training data. \nUses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validaton to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. Parameters: features : array-like {n_samples, n_features} \nFeature matrix \nTPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values.\nAs such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed)\nusing median value imputation . \nIf you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. target : array-like {n_samples} \nList of target labels for prediction sample_weight : array-like {n_samples}, optional \nPer-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional \nGroup labels for the samples used when performing cross-validation. \nThis parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . Returns: self : object \nReturns a copy of the fitted TPOT object predict(features) \nUse the optimized pipeline to predict the target values for a feature set. Parameters: features : array-like {n_samples, n_features} \nFeature matrix Returns: predictions : array-like {n_samples} \nPredicted target values for the samples in the feature matrix score(testing_features, testing_target) \nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function. \nThe default scoring function for TPOTClassifier is 'mean_squared_error'. Parameters: testing_features : array-like {n_samples, n_features} \nFeature matrix of the testing set testing_target : array-like {n_samples} \nList of target labels for prediction in the testing set Returns: accuracy_score : float \nThe estimated test set accuracy according to the user-specified scoring function. export(output_file_name) \nExport the optimized pipeline as Python code. 
\nSee the usage documentation for example usage of the export function. Parameters: output_file_name : string \nString containing the path and file name of the desired output file Returns: \nDoes not return anything", "title": "Regression" }, { diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 09f8842a..5b7788e7 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -4,7 +4,7 @@ http://epistasislab.github.io/tpot/ - 2019-03-01 + 2019-04-11 daily @@ -12,7 +12,7 @@ http://epistasislab.github.io/tpot/installing/ - 2019-03-01 + 2019-04-11 daily @@ -20,7 +20,7 @@ http://epistasislab.github.io/tpot/using/ - 2019-03-01 + 2019-04-11 daily @@ -28,7 +28,7 @@ http://epistasislab.github.io/tpot/api/ - 2019-03-01 + 2019-04-11 daily @@ -36,7 +36,7 @@ http://epistasislab.github.io/tpot/examples/ - 2019-03-01 + 2019-04-11 daily @@ -44,7 +44,7 @@ http://epistasislab.github.io/tpot/contributing/ - 2019-03-01 + 2019-04-11 daily @@ -52,7 +52,7 @@ http://epistasislab.github.io/tpot/releases/ - 2019-03-01 + 2019-04-11 daily @@ -60,7 +60,7 @@ http://epistasislab.github.io/tpot/citing/ - 2019-03-01 + 2019-04-11 daily @@ -68,7 +68,7 @@ http://epistasislab.github.io/tpot/support/ - 2019-03-01 + 2019-04-11 daily @@ -76,7 +76,7 @@ http://epistasislab.github.io/tpot/related/ - 2019-03-01 + 2019-04-11 daily diff --git a/docs/using/index.html b/docs/using/index.html index fd6ed07d..3e30f6e2 100644 --- a/docs/using/index.html +++ b/docs/using/index.html @@ -80,6 +80,12 @@
  • Customizing TPOT's operators and parameters
  • +
  • Template option in TPOT
  • + + +
  • FeatureSetSelector in TPOT
  • + +
  • Pipeline caching in TPOT
  • @@ -367,7 +373,7 @@

    TPOT on the command line

    Any positive integer or -1 Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process.

    -Assigning this to -1 will use as many cores as available on the computer. +Assigning this to -1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. -maxtime @@ -409,6 +415,15 @@

    TPOT on the command line

+-template +TEMPLATE +String +Template of predefined pipeline structure. This option specifies a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in the TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are split by "-", e.g. "SelectPercentile-Transformer-Classifier". By default, template is "RandomTree" and TPOT generates tree-based pipelines randomly. + +See the template option in tpot section for more details. + + + -memory MEMORY String or file path

    Customizing TPOT's operators

    When using the command-line interface, the configuration file specified in the -config parameter must name its custom TPOT configuration tpot_config. Otherwise, TPOT will not be able to locate the configuration dictionary.
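For orientation, a minimal sketch of such a configuration file is shown below; the file name tpot_classifier_config.py and the two naive Bayes operators are only illustrative assumptions, and any operators and parameter ranges supported by TPOT could be listed instead.

```Python
# tpot_classifier_config.py -- the file name is an assumption for this sketch.
# The dictionary must be named tpot_config so the TPOT CLI can locate it.
tpot_config = {
    'sklearn.naive_bayes.GaussianNB': {
    },
    'sklearn.naive_bayes.BernoulliNB': {
        'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],
        'fit_prior': [True, False]
    }
}
```

Such a file would then be passed to the command-line call through the -config parameter.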

    For more detailed examples of how to customize TPOT's operator configuration, see the default configurations for classification and regression in TPOT's source code.

    Note that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers.

    +

    Template option in TPOT

    +

    The template option provides a way to specify a desired structure for the machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. The current implementation only supports linear pipelines.

    +

    Below is a simple example of using the template option. The pipelines generated/evaluated in TPOT will follow this structure: the 1st step is a feature selector (a subclass of SelectorMixin), the 2nd step is a feature transformer (a subclass of TransformerMixin) and the 3rd step is a classifier for classification (a subclass of ClassifierMixin). The last step must be Classifier for TPOTClassifier's template but Regressor for TPOTRegressor. Note: although SelectorMixin is a subclass of TransformerMixin in scikit-learn, Transformer in this option excludes those subclasses of SelectorMixin.

    +
    tpot_obj = TPOTClassifier(
    +                template='Selector-Transformer-Classifier'
    +                )
    +
    + +
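For the regression case, the corresponding template ends with Regressor instead; a minimal sketch, assuming an otherwise default TPOTRegressor, is:

```Python
from tpot import TPOTRegressor

# Same linear template idea, but the final step must be a regressor
# because this is a TPOTRegressor run.
tpot_obj = TPOTRegressor(
    template='Selector-Transformer-Regressor'
)
```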

    If a specific operator, e.g. SelectPercentile, is preferred for the 1st step of the pipeline, the template can be defined as 'SelectPercentile-Transformer-Classifier', as sketched below.

    +
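A minimal sketch of that variant, assuming the same classification setting as the example above:

```Python
from tpot import TPOTClassifier

# Pin the 1st step to SelectPercentile; TPOT still chooses and tunes the
# transformer and classifier for the remaining two steps.
tpot_obj = TPOTClassifier(
    template='SelectPercentile-Transformer-Classifier'
)
```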

    FeatureSetSelector in TPOT

    +

    FeatureSetSelector is a special new operator in TPOT. This operator enables feature selection based on a priori expert knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets from the Molecular Signatures Database (MSigDB) in the 1st step of the pipeline via the template option above, in order to reduce dimensions and TPOT computation time. Below is an example of how to use this operator in TPOT.

    +
    from tpot import TPOTClassifier
    +import numpy as np
    +import pandas as pd
    +from tpot.config import classifier_config_dict
    +test_data = pd.read_csv("https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/tests.csv")
    +test_X = test_data.drop("class", axis=1)
    +test_y = test_data['class']
    +
    +# add FeatureSetSelector into tpot configuration
    +classifier_config_dict['tpot.builtins.FeatureSetSelector'] = {
    +    'subset_list': ['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'],
    +    'sel_subset': [0,1] # select only one feature set, a list of index of subset in the list above
    +    #'sel_subset': list(combinations(range(3), 2)) # select two feature sets
    +}
    +
    +
    +tpot = TPOTClassifier(generations=5,
    +                           population_size=50, verbosity=2,
    +                           template='FeatureSetSelector-Transformer-Classifier',
    +                           config_dict=classifier_config_dict)
    +tpot.fit(test_X, test_y)
    +
    +
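Once the fit above finishes, the run can be inspected and exported like any other TPOT run; the output file name below is only a placeholder.

```Python
# fitted_pipeline_ is the best scikit-learn Pipeline found by TPOT.
print(tpot.fitted_pipeline_)

# Export the corresponding Python code for the optimized pipeline.
tpot.export('tpot_fss_pipeline.py')
```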

    Pipeline caching in TPOT

    With the memory parameter, pipelines can cache the results of each transformer after fitting them. This feature is used to avoid repeated computation by transformers within a pipeline if the parameters and input data are identical to those of another fitted pipeline during the optimization process. TPOT allows users to specify a custom directory path or a sklearn.external.joblib.Memory object in case they want to re-use the memory cache in future TPOT runs (or a warm_start run).
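For illustration, a minimal sketch of enabling caching with a user-provided directory is shown below; the directory path is only a placeholder assumption, and the string 'auto' described in the API section works similarly with a temporary directory instead.

```Python
from tpot import TPOTClassifier

# Cache fitted transformers in ./tpot_cache so identical pipeline steps are
# not re-fitted during optimization; TPOT does NOT clean this directory up,
# so it can be re-used by a later run (or a warm_start run).
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,
                      memory='./tpot_cache')
```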

    There are three methods for enabling memory caching in TPOT:

    @@ -684,8 +733,8 @@

    Parallel Training with Dask

    For large problems or when working in a Jupyter notebook, we highly recommend distributing the work on a Dask cluster. The dask-examples binder has a runnable example with a small Dask cluster.
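As a hedged setup sketch (the cluster itself is assumed; Client() below simply starts a local cluster), a Dask client is typically created before constructing the TPOT estimator:

```Python
from dask.distributed import Client

# Start a local Dask cluster and connect to it; to use an existing cluster,
# pass its scheduler address to Client() instead.
client = Client()
```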

    -

    To use your Dask cluster to fit a TPOT model, specify the use_dask keyword when you create the TPOT estimator. Note: if use_dask=True, TPOT will use as many cores as available on the your Dask cluster regardless of whether n_jobs is specified.

    -
    estimator = TPOTEstimator(use_dask=True)
    +

    To use your Dask cluster to fit a TPOT model, specify the use_dask keyword when you create the TPOT estimator. Note: if use_dask=True, TPOT will use as many cores as available on your Dask cluster. If n_jobs is specified, then it will control the chunk size (10*n_jobs if it is less than the offspring size) of parallel training.

    +
    estimator = TPOTEstimator(use_dask=True, n_jobs=-1)
     

    This will use use all the workers on your cluster to do the training, and use Dask-ML's pipeline rewriting to avoid re-fitting estimators multiple times on the same set of data. diff --git a/docs_sources/using.md b/docs_sources/using.md index dbe9143e..24d01679 100644 --- a/docs_sources/using.md +++ b/docs_sources/using.md @@ -538,7 +538,7 @@ Note that you must have all of the corresponding packages for the operators inst # Template option in TPOT -Template option is added into TPOT v0.10.0. It provides a way to specify a desired structure for machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. Current implementation only supports linear pipelines. +Template option provides a way to specify a desired structure for machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. Current implementation only supports linear pipelines. Below is a simple example to use `template` option. The pipelines generated/evaluated in TPOT will follow this structure: 1st step is a feature selector (a subclass of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17)), 2nd step is a feature transformer (a subclass of [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html)) and 3rd step is a classifier for classification (a subclass of [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html)). The last step must be `Classifier` for `TPOTClassifier`'s template but `Regressor` for `TPOTRegressor`. **Note: although `SelectorMixin` is subclass of `TransformerMixin` in scikit-leawrn, but `Transformer` in this option excludes those subclasses of `SelectorMixin`.** @@ -553,20 +553,20 @@ If a specific operator, e.g. `SelectPercentile`, is prefered to used in the 1st # FeatureSetSelector in TPOT -`FeatureSetSelector` is a special new operator in TPOT. This operator enables feature selection based on *priori* export knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set based on GO (Gene Ontology) terms or annotated gene sets Molecular Signatures Database ([MSigDB](http://software.broadinstitute.org/gsea/msigdb/index.jsp)) to reduce dimensions and saving computing time. Below is a example for using this operator in TPOT. +`FeatureSetSelector` is a special new operator in TPOT. This operator enables feature selection based on *priori* export knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets Molecular Signatures Database ([MSigDB](http://software.broadinstitute.org/gsea/msigdb/index.jsp)) in the 1st step of pipeline via `template` option above, in order to reduce dimensions and TPOT computation time. Below is a example how to use this operator in TPOT. 
```Python from tpot import TPOTClassifier import numpy as np import pandas as pd from tpot.config import classifier_config_dict -test_data = pd.read_csv("https://raw.githubusercontent.com/weixuanfu/tpot/master/tests/tests.csv") +test_data = pd.read_csv("https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/tests.csv") test_X = test_data.drop("class", axis=1) test_y = test_data['class'] # add FeatureSetSelector into tpot configuration classifier_config_dict['tpot.builtins.FeatureSetSelector'] = { - 'subset_list': ['https://raw.githubusercontent.com/weixuanfu/tpot/master/tests/subset_test.csv'], + 'subset_list': ['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'], 'sel_subset': [0,1] # select only one feature set, a list of index of subset in the list above #'sel_subset': list(combinations(range(3), 2)) # select two feature sets } From b1b61e41897c8d52909162b9307ec5a55e69b227 Mon Sep 17 00:00:00 2001 From: weixuanfu Date: Thu, 11 Apr 2019 14:36:53 -0400 Subject: [PATCH 59/60] refine html pages --- docs/index.html | 2 +- docs/search/search_index.json | 4 ++-- docs/using/index.html | 1 + docs_sources/using.md | 2 ++ 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/index.html b/docs/index.html index 437e5eae..6000e6a3 100644 --- a/docs/index.html +++ b/docs/index.html @@ -213,5 +213,5 @@ diff --git a/docs/search/search_index.json b/docs/search/search_index.json index 183bf6ca..768f17b3 100644 --- a/docs/search/search_index.json +++ b/docs/search/search_index.json @@ -12,7 +12,7 @@ }, { "location": "/using/", - "text": "What to expect from AutoML software\n\n\nAutomated machine learning (AutoML) takes a higher-level approach to machine learning than most practitioners are used to,\nso we've gathered a handful of guidelines on what to expect when running AutoML software such as TPOT.\n\n\nAutoML algorithms aren't intended to run for only a few minutes\n\n\n\nOf course, you \ncan\n run TPOT for only a few minutes and it will find a reasonably good pipeline for your dataset.\nHowever, if you don't run TPOT for long enough, it may not find the best possible pipeline for your dataset. It may even not\nfind any suitable pipeline at all, in which case a \nRuntimeError('A pipeline has not yet been optimized. Please call fit() first.')\n\nwill be raised.\nOften it is worthwhile to run multiple instances of TPOT in parallel for a long time (hours to days) to allow TPOT to thoroughly search\nthe pipeline space for your dataset.\n\n\nAutoML algorithms can take a long time to finish their search\n\n\n\nAutoML algorithms aren't as simple as fitting one model on the dataset; they are considering multiple machine learning algorithms\n(random forests, linear models, SVMs, etc.) in a pipeline with multiple preprocessing steps (missing value imputation, scaling,\nPCA, feature selection, etc.), the hyperparameters for all of the models and preprocessing steps, as well as multiple ways\nto ensemble or stack the algorithms within the pipeline.\n\n\nAs such, TPOT will take a while to run on larger datasets, but it's important to realize why. With the default TPOT settings\n(100 generations with 100 population size), TPOT will evaluate 10,000 pipeline configurations before finishing.\nTo put this number into context, think about a grid search of 10,000 hyperparameter combinations for a machine learning algorithm\nand how long that grid search will take. 
That is 10,000 model configurations to evaluate with 10-fold cross-validation,\nwhich means that roughly 100,000 models are fit and evaluated on the training data in one grid search.\nThat's a time-consuming procedure, even for simpler models like decision trees.\n\n\nTypical TPOT runs will take hours to days to finish (unless it's a small dataset), but you can always interrupt\nthe run partway through and see the best results so far. TPOT also \nprovides\n a \nwarm_start\n parameter that\nlets you restart a TPOT run from where it left off.\n\n\nAutoML algorithms can recommend different solutions for the same dataset\n\n\n\nIf you're working with a reasonably complex dataset or run TPOT for a short amount of time, different TPOT runs\nmay result in different pipeline recommendations. TPOT's optimization algorithm is stochastic in nature, which means\nthat it uses randomness (in part) to search the possible pipeline space. When two TPOT runs recommend different\npipelines, this means that the TPOT runs didn't converge due to lack of time \nor\n that multiple pipelines\nperform more-or-less the same on your dataset.\n\n\nThis is actually an advantage over fixed grid search techniques: TPOT is meant to be an assistant that gives\nyou ideas on how to solve a particular machine learning problem by exploring pipeline configurations that you\nmight have never considered, then leaves the fine-tuning to more constrained parameter tuning techniques such\nas grid search.\n\n\nTPOT with code\n\n\nWe've taken care to design the TPOT interface to be as similar as possible to scikit-learn.\n\n\nTPOT can be imported just like any regular Python module. To import TPOT, type:\n\n\nfrom tpot import TPOTClassifier\n\n\n\n\nthen create an instance of TPOT as follows:\n\n\npipeline_optimizer = TPOTClassifier()\n\n\n\n\nIt's also possible to use TPOT for regression problems with the \nTPOTRegressor\n class. Other than the class name,\na \nTPOTRegressor\n is used the same way as a \nTPOTClassifier\n. You can read more about the \nTPOTClassifier\n and \nTPOTRegressor\n classes in the \nAPI documentation\n.\n\n\nSome example code with custom TPOT parameters might look like:\n\n\npipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,\n random_state=42, verbosity=2)\n\n\n\n\nNow TPOT is ready to optimize a pipeline for you. 
You can tell TPOT to optimize a pipeline based on a data set with the \nfit\n function:\n\n\npipeline_optimizer.fit(X_train, y_train)\n\n\n\n\nThe \nfit\n function initializes the genetic programming algorithm to find the highest-scoring pipeline based on average k-fold cross-validation\nThen, the pipeline is trained on the entire set of provided samples, and the TPOT instance can be used as a fitted model.\n\n\nYou can then proceed to evaluate the final pipeline on the testing set with the \nscore\n function:\n\n\nprint(pipeline_optimizer.score(X_test, y_test))\n\n\n\n\nFinally, you can tell TPOT to export the corresponding Python code for the optimized pipeline to a text file with the \nexport\n function:\n\n\npipeline_optimizer.export('tpot_exported_pipeline.py')\n\n\n\n\nOnce this code finishes running, \ntpot_exported_pipeline.py\n will contain the Python code for the optimized pipeline.\n\n\nBelow is a full example script using TPOT to optimize a pipeline, score it, and export the best pipeline to a file.\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\npipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,\n random_state=42, verbosity=2)\npipeline_optimizer.fit(X_train, y_train)\nprint(pipeline_optimizer.score(X_test, y_test))\npipeline_optimizer.export('tpot_exported_pipeline.py')\n\n\n\n\nCheck our \nexamples\n to see TPOT applied to some specific data sets.\n\n\nTPOT on the command line\n\n\nTo use TPOT via the command line, enter the following command with a path to the data file:\n\n\ntpot /path_to/data_file.csv\n\n\n\n\nAn example command-line call to TPOT may look like:\n\n\ntpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2\n\n\n\n\nTPOT offers several arguments that can be provided at the command line. To see brief descriptions of these arguments,\nenter the following command:\n\n\ntpot --help\n\n\n\n\nDetailed descriptions of the command-line arguments are below.\n\n\n\n\n\n\nArgument\n\n\nParameter\n\n\nValid values\n\n\nEffect\n\n\n\n\n\n\n-is\n\n\nINPUT_SEPARATOR\n\n\nAny string\n\n\nCharacter used to separate columns in the input file.\n\n\n\n\n\n\n-target\n\n\nTARGET_NAME\n\n\nAny string\n\n\nName of the target column in the input file.\n\n\n\n\n\n\n-mode\n\n\nTPOT_MODE\n\n\n['classification', 'regression']\n\n\nWhether TPOT is being used for a supervised classification or regression problem.\n\n\n\n\n\n\n-o\n\n\nOUTPUT_FILE\n\n\nString path to a file\n\n\nFile to export the code for the final optimized pipeline.\n\n\n\n\n\n\n-g\n\n\nGENERATIONS\n\n\nAny positive integer\n\n\nNumber of iterations to run the pipeline optimization process. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline.\n\n\nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.\n\n\n\n\n\n\n-p\n\n\nPOPULATION_SIZE\n\n\nAny positive integer\n\n\nNumber of individuals to retain in the GP population every generation. 
Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline.\n\n\nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.\n\n\n\n\n\n\n-os\n\n\nOFFSPRING_SIZE\n\n\nAny positive integer\n\n\nNumber of offspring to produce in each GP generation.\n\n\nBy default, OFFSPRING_SIZE = POPULATION_SIZE.\n\n\n\n\n\n\n-mr\n\n\nMUTATION_RATE\n\n\n[0.0, 1.0]\n\n\nGP mutation rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to apply random changes to every generation.\n\n\nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms.\n\n\n\n\n\n\n-xr\n\n\nCROSSOVER_RATE\n\n\n[0.0, 1.0]\n\n\nGP crossover rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to \"breed\" every generation.\n\n\nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms.\n\n\n\n\n\n\n-scoring\n\n\nSCORING_FN\n\n\n'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy',\n'f1',\n'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error',\n'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro',\n'precision_samples', 'precision_weighted',\n'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples',\n'recall_weighted', 'roc_auc', 'my_module.scorer_name*'\n\n\nFunction used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression.\n\n\nTPOT assumes that any function with \"error\" or \"loss\" in the name is meant to be minimized, whereas any other functions will be maximized.\n\n\nmy_module.scorer_name: You can also specify your own function or a full python path to an existing one.\n\n\nSee the section on \nscoring functions\n for more details.\n\n\n\n\n\n\n-cv\n\n\nCV\n\n\nAny integer > 1\n\n\nNumber of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process.\n\n\n\n\n-sub\n\n\nSUBSAMPLE\n\n\n(0.0, 1.0]\n\n\nSubsample ratio of the training instance. Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process.\n\n\n\n\n\n\n-njobs\n\n\nNUM_JOBS\n\n\nAny positive integer or -1\n\n\nNumber of CPUs for evaluating pipelines in parallel during the TPOT optimization process.\n\n\nAssigning this to -1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. 
Thus for n_jobs = -2, all CPUs but one are used.\n\n\n\n\n\n\n-maxtime\n\n\nMAX_TIME_MINS\n\n\nAny positive integer\n\n\nHow many minutes TPOT has to optimize the pipeline.\n\n\nIf provided, this setting will override the \"generations\" parameter and allow TPOT to run until it runs out of time.\n\n\n\n\n\n\n-maxeval\n\n\nMAX_EVAL_MINS\n\n\nAny positive float\n\n\nHow many minutes TPOT has to evaluate a single pipeline.\n\n\nSetting this parameter to higher values will allow TPOT to consider more complex pipelines but will also allow TPOT to run longer.\n\n\n\n\n\n\n-s\n\n\nRANDOM_STATE\n\n\nAny positive integer\n\n\nRandom number generator seed for reproducibility.\n\n\nSet this seed if you want your TPOT run to be reproducible with the same seed and data set in the future.\n\n\n\n\n\n\n-config\n\n\nCONFIG_FILE\n\n\nString or file path\n\n\nOperators and parameter configurations in TPOT:\n\n\n\n\n\nPath for configuration file: TPOT will use the path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process\n\n\nstring 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors\n\n\nstring 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies\n\n\nstring 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices.\n\n\n\nSee the \nbuilt-in configurations\n section for the list of configurations included with TPOT, and the \ncustom configuration\n section for more information and examples of how to create your own TPOT configurations.\n\n\n\n\n\n\n\n-template\n\n\nTEMPLATE\n\n\nString\n\n\nTemplate of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are splited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is \"RandomTree\", TPOT generates tree-based pipeline randomly.\n\nSee the \n template option in tpot\n section for more details.\n\n\n\n\n\n\n\n-memory\n\n\nMEMORY\n\n\nString or file path\n\n\nIf supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. 
Memory caching mode in TPOT:\n\n\n\n\n\nPath for a caching directory: TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown.\n\n\nstring 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown.\n\n\n\n\n\n\n\n\n\n\n-cf\n\n\nCHECKPOINT_FOLDER\n\n\nFolder path\n\n\n\nIf supplied, a folder you created, in which tpot will periodically save pipelines in pareto front so far while optimizing.\n\n\nThis is useful in multiple cases:\n\n\n\nsudden death before tpot could save an optimized pipeline\n\n\nprogress tracking\n\n\ngrabbing a pipeline while tpot is working\n\n\n\n\n\nExample:\n\n\nmkdir my_checkpoints\n\n\n-cf ./my_checkpoints\n\n\n\n\n\n-es\n\n\nEARLY_STOP\n\n\nAny positive integer\n\n\n\nHow many generations TPOT checks whether there is no improvement in optimization process.\n\n\nEnd optimization process if there is no improvement in the set number of generations.\n\n\n\n\n\n-v\n\n\nVERBOSITY\n\n\n{0, 1, 2, 3}\n\n\nHow much information TPOT communicates while it is running.\n\n\n0 = none, 1 = minimal, 2 = high, 3 = all.\n\n\nA setting of 2 or higher will add a progress bar during the optimization procedure.\n\n\n\n\n\n\n--no-update-check\n\n\nFlag indicating whether the TPOT version checker should be disabled.\n\n\n\n\n\n\n--version\n\n\nShow TPOT's version number and exit.\n\n\n\n\n\n\n--help\n\n\nShow TPOT's help documentation and exit.\n\n\n\n\n\n\n\nScoring functions\n\n\nTPOT makes use of \nsklearn.model_selection.cross_val_score\n for evaluating pipelines, and as such offers the same support for scoring functions. There are two ways to make use of scoring functions with TPOT:\n\n\n\n\n\n\nYou can pass in a string to the \nscoring\n parameter from the list above. Any other strings will cause TPOT to throw an exception.\n\n\n\n\n\n\nYou can pass the callable object/function with signature \nscorer(estimator, X, y)\n, where \nestimator\n is trained estimator to use for scoring, \nX\n are features that will be passed to \nestimator.predict\n and \ny\n are target values for \nX\n. To do this, you should implement your own function. See the example below for further explanation.\n\n\n\n\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics.scorer import make_scorer\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n# Make a custom metric function\ndef my_custom_accuracy(y_true, y_pred):\n return float(sum(y_pred == y_true)) / len(y_true)\n\n# Make a custom a scorer from the custom metric function\n# Note: greater_is_better=False in make_scorer below would mean that the scoring function should be minimized.\nmy_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True)\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n scoring=my_custom_scorer)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\n\n\n\n\nYou can pass a metric function with the signature \nscore_func(y_true, y_pred)\n (e.g. \nmy_custom_accuracy\n in the example above), where \ny_true\n are the true target values and \ny_pred\n are the predicted target values from an estimator. To do this, you should implement your own function. See the example above for further explanation. 
TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized (\ngreater_is_better=False\n in \nmake_scorer\n), whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11.\n\n\n\n\n\n\nmy_module.scorer_name\n: You can also use a custom \nscore_func(y_true, y_pred)\n or \nscorer(estimator, X, y)\n function through the command line by adding the argument \n-scoring my_module.scorer\n to your command-line call. TPOT will import your module and use the custom scoring function from there. TPOT will include your current working directory when importing the module, so you can place it in the same directory where you are going to run TPOT.\nExample: \n-scoring sklearn.metrics.auc\n will use the function auc from sklearn.metrics module.\n\n\n\n\n\n\nBuilt-in TPOT configurations\n\n\nTPOT comes with a handful of default operators and parameter configurations that we believe work well for optimizing machine learning pipelines. Below is a list of the current built-in configurations that come with TPOT.\n\n\n\n\n\n\nConfiguration Name\n\n\nDescription\n\n\nOperators\n\n\n\n\n\n\n\nDefault TPOT\n\n\nTPOT will search over a broad range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Some of these operators are complex and may take a long time to run, especially on larger datasets.\n\n\n\nNote: This is the default configuration for TPOT.\n To use this configuration, use the default value (None) for the config_dict parameter.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\nTPOT light\n\n\nTPOT will search over a restricted range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Only simpler and fast-running operators will be used in these pipelines, so TPOT light is useful for finding quick and simple pipelines for a classification or regression problem.\n\n\nThis configuration works for both the TPOTClassifier and TPOTRegressor.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\nTPOT MDR\n\n\nTPOT will search over a series of feature selectors and \nMultifactor Dimensionality Reduction\n models to find a series of operators that maximize prediction accuracy. The TPOT MDR configuration is specialized for \ngenome-wide association studies (GWAS)\n, and is described in detail online \nhere\n.\n\n\nNote that TPOT MDR may be slow to run because the feature selection routines are computationally expensive, especially on large datasets.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\nTPOT sparse\n\n\nTPOT uses a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices.\n\n\nThis configuration works for both the TPOTClassifier and TPOTRegressor.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\n\nTo use any of these configurations, simply pass the string name of the configuration to the \nconfig_dict\n parameter (or \n-config\n on the command line). 
For example, to use the \"TPOT light\" configuration:\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n config_dict='TPOT light')\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\n\nCustomizing TPOT's operators and parameters\n\n\nBeyond the default configurations that come with TPOT, in some cases it is useful to limit the algorithms and parameters that TPOT considers. For that reason, we allow users to provide TPOT with a custom configuration for its operators and parameters.\n\n\nThe custom TPOT configuration must be in nested dictionary format, where the first level key is the path and name of the operator (e.g., \nsklearn.naive_bayes.MultinomialNB\n) and the second level key is the corresponding parameter name for that operator (e.g., \nfit_prior\n). The second level key should point to a list of parameter values for that parameter, e.g., \n'fit_prior': [True, False]\n.\n\n\nFor a simple example, the configuration could be:\n\n\ntpot_config = {\n 'sklearn.naive_bayes.GaussianNB': {\n },\n\n 'sklearn.naive_bayes.BernoulliNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n },\n\n 'sklearn.naive_bayes.MultinomialNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n }\n}\n\n\n\n\nin which case TPOT would only consider pipelines containing \nGaussianNB\n, \nBernoulliNB\n, \nMultinomialNB\n, and tune those algorithm's parameters in the ranges provided. This dictionary can be passed directly within the code to the \nTPOTClassifier\n/\nTPOTRegressor\n \nconfig_dict\n parameter, described above. For example:\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot_config = {\n 'sklearn.naive_bayes.GaussianNB': {\n },\n\n 'sklearn.naive_bayes.BernoulliNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n },\n\n 'sklearn.naive_bayes.MultinomialNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n }\n}\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n config_dict=tpot_config)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\nCommand-line users must create a separate \n.py\n file with the custom configuration and provide the path to the file to the \ntpot\n call. For example, if the simple example configuration above is saved in \ntpot_classifier_config.py\n, that configuration could be used on the command line with the command:\n\n\ntpot data/mnist.csv -is , -target class -config tpot_classifier_config.py -g 5 -p 20 -v 2 -o tpot_exported_pipeline.py\n\n\n\n\nWhen using the command-line interface, the configuration file specified in the \n-config\n parameter \nmust\n name its custom TPOT configuration \ntpot_config\n. 
Otherwise, TPOT will not be able to locate the configuration dictionary.\n\n\nFor more detailed examples of how to customize TPOT's operator configuration, see the default configurations for \nclassification\n and \nregression\n in TPOT's source code.\n\n\nNote that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers.\n\n\nTemplate option in TPOT\n\n\nTemplate option provides a way to specify a desired structure for machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. Current implementation only supports linear pipelines.\n\n\nBelow is a simple example to use \ntemplate\n option. The pipelines generated/evaluated in TPOT will follow this structure: 1st step is a feature selector (a subclass of \nSelectorMixin\n), 2nd step is a feature transformer (a subclass of \nTransformerMixin\n) and 3rd step is a classifier for classification (a subclass of \nClassifierMixin\n). The last step must be \nClassifier\n for \nTPOTClassifier\n's template but \nRegressor\n for \nTPOTRegressor\n. \nNote: although \nSelectorMixin\n is subclass of \nTransformerMixin\n in scikit-leawrn, but \nTransformer\n in this option excludes those subclasses of \nSelectorMixin\n.\n\n\ntpot_obj = TPOTClassifier(\n template='Selector-Transformer-Classifier'\n )\n\n\n\n\nIf a specific operator, e.g. \nSelectPercentile\n, is prefered to used in the 1st step of pipeline, the template can be defined like 'SelectPercentile-Transformer-Classifier'.\n\n\nFeatureSetSelector in TPOT\n\n\nFeatureSetSelector\n is a special new operator in TPOT. This operator enables feature selection based on \npriori\n export knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets Molecular Signatures Database (\nMSigDB\n) in the 1st step of pipeline via \ntemplate\n option above, in order to reduce dimensions and TPOT computation time. Below is a example how to use this operator in TPOT.\n\n\nfrom tpot import TPOTClassifier\nimport numpy as np\nimport pandas as pd\nfrom tpot.config import classifier_config_dict\ntest_data = pd.read_csv(\"https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/tests.csv\")\ntest_X = test_data.drop(\"class\", axis=1)\ntest_y = test_data['class']\n\n# add FeatureSetSelector into tpot configuration\nclassifier_config_dict['tpot.builtins.FeatureSetSelector'] = {\n 'subset_list': ['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'],\n 'sel_subset': [0,1] # select only one feature set, a list of index of subset in the list above\n #'sel_subset': list(combinations(range(3), 2)) # select two feature sets\n}\n\n\ntpot = TPOTClassifier(generations=5,\n population_size=50, verbosity=2,\n template='FeatureSetSelector-Transformer-Classifier',\n config_dict=classifier_config_dict)\ntpot.fit(test_X, test_y)\n\n\n\n\nPipeline caching in TPOT\n\n\nWith the \nmemory\n parameter, pipelines can cache the results of each transformer after fitting them. This feature is used to avoid repeated computation by transformers within a pipeline if the parameters and input data are identical to another fitted pipeline during optimization process. 
TPOT allows users to specify a custom directory path or \nsklearn.external.joblib.Memory\n in case they want to re-use the memory cache in future TPOT runs (or a \nwarm_start\n run).\n\n\nThere are three methods for enabling memory caching in TPOT:\n\n\nfrom tpot import TPOTClassifier\nfrom tempfile import mkdtemp\nfrom sklearn.externals.joblib import Memory\nfrom shutil import rmtree\n\n# Method 1, auto mode: TPOT uses memory caching with a temporary directory and cleans it up upon shutdown\ntpot = TPOTClassifier(memory='auto')\n\n# Method 2, with a custom directory for memory caching\ntpot = TPOTClassifier(memory='/to/your/path')\n\n# Method 3, with a Memory object\ncachedir = mkdtemp() # Create a temporary folder\nmemory = Memory(cachedir=cachedir, verbose=0)\ntpot = TPOTClassifier(memory=memory)\n\n# Clear the cache directory when you don't need it anymore\nrmtree(cachedir)\n\n\n\n\nNote: TPOT does NOT clean up memory caches if users set a custom directory path or Memory object. We recommend that you clean up the memory caches when you don't need it anymore.\n\n\nCrash/freeze issue with n_jobs > 1 under OSX or Linux\n\n\nInternally, TPOT uses \njoblib\n to fit estimators in parallel.\nThis is the same parallelization framework used by scikit-learn. But it may crash/freeze with n_jobs > 1 under OSX or Linux \nas scikit-learn does\n, especially with large datasets.\n\n\nOne solution is to configure Python's \nmultiprocessing\n module to use the \nforkserver\n start method (instead of the default \nfork\n) to manage the process pools. You can enable the \nforkserver\n mode globally for your program by putting the following codes into your main script:\n\n\nimport multiprocessing\n\n# other imports, custom code, load data, define model...\n\nif __name__ == '__main__':\n multiprocessing.set_start_method('forkserver')\n\n # call scikit-learn utils or tpot utils with n_jobs > 1 here\n\n\n\n\nMore information about these start methods can be found in the \nmultiprocessing documentation\n.\n\n\nParallel Training with Dask\n\n\nFor large problems or working on Jupyter notebook, we highly recommend that you can distribute the work on a \nDask\n cluster.\nThe \ndask-examples binder\n has a runnable example\nwith a small dask cluster.\n\n\nTo use your Dask cluster to fit a TPOT model, specify the \nuse_dask\n keyword when you create the TPOT estimator. \nNote: if \nuse_dask=True\n, TPOT will use as many cores as available on the your Dask cluster. If \nn_jobs\n is specified, then it will control the chunk size (10*\nn_jobs\n if it is less then offspring size) of parallel training. 
\n\n\nestimator = TPOTEstimator(use_dask=True, n_jobs=-1)\n\n\n\n\nThis will use use all the workers on your cluster to do the training, and use \nDask-ML's pipeline rewriting\n to avoid re-fitting estimators multiple times on the same set of data.\nIt will also provide fine-grained diagnostics in the \ndistributed scheduler UI\n.\n\n\nAlternatively, Dask implements a joblib backend.\nYou can instruct TPOT to use the distribued backend during training by specifying a \njoblib.parallel_backend\n:\n\n\nfrom sklearn.externals import joblib\nimport distributed.joblib\nfrom dask.distributed import Client\n\n# connect to the cluster\nclient = Client('schedueler-address')\n\n# create the estimator normally\nestimator = TPOTClassifier(n_jobs=-1)\n\n# perform the fit in this context manager\nwith joblib.parallel_backend(\"dask\"):\n estimator.fit(X, y)\n\n\n\n\nSee \ndask's distributed joblib integration\n for more.", + "text": "What to expect from AutoML software\n\n\nAutomated machine learning (AutoML) takes a higher-level approach to machine learning than most practitioners are used to,\nso we've gathered a handful of guidelines on what to expect when running AutoML software such as TPOT.\n\n\nAutoML algorithms aren't intended to run for only a few minutes\n\n\n\nOf course, you \ncan\n run TPOT for only a few minutes and it will find a reasonably good pipeline for your dataset.\nHowever, if you don't run TPOT for long enough, it may not find the best possible pipeline for your dataset. It may even not\nfind any suitable pipeline at all, in which case a \nRuntimeError('A pipeline has not yet been optimized. Please call fit() first.')\n\nwill be raised.\nOften it is worthwhile to run multiple instances of TPOT in parallel for a long time (hours to days) to allow TPOT to thoroughly search\nthe pipeline space for your dataset.\n\n\nAutoML algorithms can take a long time to finish their search\n\n\n\nAutoML algorithms aren't as simple as fitting one model on the dataset; they are considering multiple machine learning algorithms\n(random forests, linear models, SVMs, etc.) in a pipeline with multiple preprocessing steps (missing value imputation, scaling,\nPCA, feature selection, etc.), the hyperparameters for all of the models and preprocessing steps, as well as multiple ways\nto ensemble or stack the algorithms within the pipeline.\n\n\nAs such, TPOT will take a while to run on larger datasets, but it's important to realize why. With the default TPOT settings\n(100 generations with 100 population size), TPOT will evaluate 10,000 pipeline configurations before finishing.\nTo put this number into context, think about a grid search of 10,000 hyperparameter combinations for a machine learning algorithm\nand how long that grid search will take. That is 10,000 model configurations to evaluate with 10-fold cross-validation,\nwhich means that roughly 100,000 models are fit and evaluated on the training data in one grid search.\nThat's a time-consuming procedure, even for simpler models like decision trees.\n\n\nTypical TPOT runs will take hours to days to finish (unless it's a small dataset), but you can always interrupt\nthe run partway through and see the best results so far. 
TPOT also \nprovides\n a \nwarm_start\n parameter that\nlets you restart a TPOT run from where it left off.\n\n\nAutoML algorithms can recommend different solutions for the same dataset\n\n\n\nIf you're working with a reasonably complex dataset or run TPOT for a short amount of time, different TPOT runs\nmay result in different pipeline recommendations. TPOT's optimization algorithm is stochastic in nature, which means\nthat it uses randomness (in part) to search the possible pipeline space. When two TPOT runs recommend different\npipelines, this means that the TPOT runs didn't converge due to lack of time \nor\n that multiple pipelines\nperform more-or-less the same on your dataset.\n\n\nThis is actually an advantage over fixed grid search techniques: TPOT is meant to be an assistant that gives\nyou ideas on how to solve a particular machine learning problem by exploring pipeline configurations that you\nmight have never considered, then leaves the fine-tuning to more constrained parameter tuning techniques such\nas grid search.\n\n\nTPOT with code\n\n\nWe've taken care to design the TPOT interface to be as similar as possible to scikit-learn.\n\n\nTPOT can be imported just like any regular Python module. To import TPOT, type:\n\n\nfrom tpot import TPOTClassifier\n\n\n\n\nthen create an instance of TPOT as follows:\n\n\npipeline_optimizer = TPOTClassifier()\n\n\n\n\nIt's also possible to use TPOT for regression problems with the \nTPOTRegressor\n class. Other than the class name,\na \nTPOTRegressor\n is used the same way as a \nTPOTClassifier\n. You can read more about the \nTPOTClassifier\n and \nTPOTRegressor\n classes in the \nAPI documentation\n.\n\n\nSome example code with custom TPOT parameters might look like:\n\n\npipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,\n random_state=42, verbosity=2)\n\n\n\n\nNow TPOT is ready to optimize a pipeline for you. 
You can tell TPOT to optimize a pipeline based on a data set with the \nfit\n function:\n\n\npipeline_optimizer.fit(X_train, y_train)\n\n\n\n\nThe \nfit\n function initializes the genetic programming algorithm to find the highest-scoring pipeline based on average k-fold cross-validation\nThen, the pipeline is trained on the entire set of provided samples, and the TPOT instance can be used as a fitted model.\n\n\nYou can then proceed to evaluate the final pipeline on the testing set with the \nscore\n function:\n\n\nprint(pipeline_optimizer.score(X_test, y_test))\n\n\n\n\nFinally, you can tell TPOT to export the corresponding Python code for the optimized pipeline to a text file with the \nexport\n function:\n\n\npipeline_optimizer.export('tpot_exported_pipeline.py')\n\n\n\n\nOnce this code finishes running, \ntpot_exported_pipeline.py\n will contain the Python code for the optimized pipeline.\n\n\nBelow is a full example script using TPOT to optimize a pipeline, score it, and export the best pipeline to a file.\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\npipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,\n random_state=42, verbosity=2)\npipeline_optimizer.fit(X_train, y_train)\nprint(pipeline_optimizer.score(X_test, y_test))\npipeline_optimizer.export('tpot_exported_pipeline.py')\n\n\n\n\nCheck our \nexamples\n to see TPOT applied to some specific data sets.\n\n\nTPOT on the command line\n\n\nTo use TPOT via the command line, enter the following command with a path to the data file:\n\n\ntpot /path_to/data_file.csv\n\n\n\n\nAn example command-line call to TPOT may look like:\n\n\ntpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2\n\n\n\n\nTPOT offers several arguments that can be provided at the command line. To see brief descriptions of these arguments,\nenter the following command:\n\n\ntpot --help\n\n\n\n\nDetailed descriptions of the command-line arguments are below.\n\n\n\n\n\n\nArgument\n\n\nParameter\n\n\nValid values\n\n\nEffect\n\n\n\n\n\n\n-is\n\n\nINPUT_SEPARATOR\n\n\nAny string\n\n\nCharacter used to separate columns in the input file.\n\n\n\n\n\n\n-target\n\n\nTARGET_NAME\n\n\nAny string\n\n\nName of the target column in the input file.\n\n\n\n\n\n\n-mode\n\n\nTPOT_MODE\n\n\n['classification', 'regression']\n\n\nWhether TPOT is being used for a supervised classification or regression problem.\n\n\n\n\n\n\n-o\n\n\nOUTPUT_FILE\n\n\nString path to a file\n\n\nFile to export the code for the final optimized pipeline.\n\n\n\n\n\n\n-g\n\n\nGENERATIONS\n\n\nAny positive integer\n\n\nNumber of iterations to run the pipeline optimization process. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline.\n\n\nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.\n\n\n\n\n\n\n-p\n\n\nPOPULATION_SIZE\n\n\nAny positive integer\n\n\nNumber of individuals to retain in the GP population every generation. 
Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline.\n\n\nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.\n\n\n\n\n\n\n-os\n\n\nOFFSPRING_SIZE\n\n\nAny positive integer\n\n\nNumber of offspring to produce in each GP generation.\n\n\nBy default, OFFSPRING_SIZE = POPULATION_SIZE.\n\n\n\n\n\n\n-mr\n\n\nMUTATION_RATE\n\n\n[0.0, 1.0]\n\n\nGP mutation rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to apply random changes to every generation.\n\n\nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms.\n\n\n\n\n\n\n-xr\n\n\nCROSSOVER_RATE\n\n\n[0.0, 1.0]\n\n\nGP crossover rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to \"breed\" every generation.\n\n\nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms.\n\n\n\n\n\n\n-scoring\n\n\nSCORING_FN\n\n\n'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy',\n'f1',\n'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error',\n'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro',\n'precision_samples', 'precision_weighted',\n'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples',\n'recall_weighted', 'roc_auc', 'my_module.scorer_name*'\n\n\nFunction used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression.\n\n\nTPOT assumes that any function with \"error\" or \"loss\" in the name is meant to be minimized, whereas any other functions will be maximized.\n\n\nmy_module.scorer_name: You can also specify your own function or a full python path to an existing one.\n\n\nSee the section on \nscoring functions\n for more details.\n\n\n\n\n\n\n-cv\n\n\nCV\n\n\nAny integer > 1\n\n\nNumber of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process.\n\n\n\n\n-sub\n\n\nSUBSAMPLE\n\n\n(0.0, 1.0]\n\n\nSubsample ratio of the training instance. Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process.\n\n\n\n\n\n\n-njobs\n\n\nNUM_JOBS\n\n\nAny positive integer or -1\n\n\nNumber of CPUs for evaluating pipelines in parallel during the TPOT optimization process.\n\n\nAssigning this to -1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. 
Thus for n_jobs = -2, all CPUs but one are used.\n\n\n\n\n\n\n-maxtime\n\n\nMAX_TIME_MINS\n\n\nAny positive integer\n\n\nHow many minutes TPOT has to optimize the pipeline.\n\n\nIf provided, this setting will override the \"generations\" parameter and allow TPOT to run until it runs out of time.\n\n\n\n\n\n\n-maxeval\n\n\nMAX_EVAL_MINS\n\n\nAny positive float\n\n\nHow many minutes TPOT has to evaluate a single pipeline.\n\n\nSetting this parameter to higher values will allow TPOT to consider more complex pipelines but will also allow TPOT to run longer.\n\n\n\n\n\n\n-s\n\n\nRANDOM_STATE\n\n\nAny positive integer\n\n\nRandom number generator seed for reproducibility.\n\n\nSet this seed if you want your TPOT run to be reproducible with the same seed and data set in the future.\n\n\n\n\n\n\n-config\n\n\nCONFIG_FILE\n\n\nString or file path\n\n\nOperators and parameter configurations in TPOT:\n\n\n\n\n\nPath for configuration file: TPOT will use the path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process\n\n\nstring 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors\n\n\nstring 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies\n\n\nstring 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices.\n\n\n\nSee the \nbuilt-in configurations\n section for the list of configurations included with TPOT, and the \ncustom configuration\n section for more information and examples of how to create your own TPOT configurations.\n\n\n\n\n\n\n\n-template\n\n\nTEMPLATE\n\n\nString\n\n\nTemplate of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are splited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is \"RandomTree\", TPOT generates tree-based pipeline randomly.\n\nSee the \n template option in tpot\n section for more details.\n\n\n\n\n\n\n\n-memory\n\n\nMEMORY\n\n\nString or file path\n\n\nIf supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. 
Memory caching mode in TPOT:\n\n\n\n\n\nPath for a caching directory: TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown.\n\n\nstring 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown.\n\n\n\n\n\n\n\n\n\n\n-cf\n\n\nCHECKPOINT_FOLDER\n\n\nFolder path\n\n\n\nIf supplied, a folder you created, in which tpot will periodically save pipelines in pareto front so far while optimizing.\n\n\nThis is useful in multiple cases:\n\n\n\nsudden death before tpot could save an optimized pipeline\n\n\nprogress tracking\n\n\ngrabbing a pipeline while tpot is working\n\n\n\n\n\nExample:\n\n\nmkdir my_checkpoints\n\n\n-cf ./my_checkpoints\n\n\n\n\n\n-es\n\n\nEARLY_STOP\n\n\nAny positive integer\n\n\n\nHow many generations TPOT checks whether there is no improvement in optimization process.\n\n\nEnd optimization process if there is no improvement in the set number of generations.\n\n\n\n\n\n-v\n\n\nVERBOSITY\n\n\n{0, 1, 2, 3}\n\n\nHow much information TPOT communicates while it is running.\n\n\n0 = none, 1 = minimal, 2 = high, 3 = all.\n\n\nA setting of 2 or higher will add a progress bar during the optimization procedure.\n\n\n\n\n\n\n--no-update-check\n\n\nFlag indicating whether the TPOT version checker should be disabled.\n\n\n\n\n\n\n--version\n\n\nShow TPOT's version number and exit.\n\n\n\n\n\n\n--help\n\n\nShow TPOT's help documentation and exit.\n\n\n\n\n\n\n\nScoring functions\n\n\nTPOT makes use of \nsklearn.model_selection.cross_val_score\n for evaluating pipelines, and as such offers the same support for scoring functions. There are two ways to make use of scoring functions with TPOT:\n\n\n\n\n\n\nYou can pass in a string to the \nscoring\n parameter from the list above. Any other strings will cause TPOT to throw an exception.\n\n\n\n\n\n\nYou can pass the callable object/function with signature \nscorer(estimator, X, y)\n, where \nestimator\n is trained estimator to use for scoring, \nX\n are features that will be passed to \nestimator.predict\n and \ny\n are target values for \nX\n. To do this, you should implement your own function. See the example below for further explanation.\n\n\n\n\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics.scorer import make_scorer\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n# Make a custom metric function\ndef my_custom_accuracy(y_true, y_pred):\n return float(sum(y_pred == y_true)) / len(y_true)\n\n# Make a custom a scorer from the custom metric function\n# Note: greater_is_better=False in make_scorer below would mean that the scoring function should be minimized.\nmy_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True)\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n scoring=my_custom_scorer)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\n\n\n\n\nYou can pass a metric function with the signature \nscore_func(y_true, y_pred)\n (e.g. \nmy_custom_accuracy\n in the example above), where \ny_true\n are the true target values and \ny_pred\n are the predicted target values from an estimator. To do this, you should implement your own function. See the example above for further explanation. 
TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized (\ngreater_is_better=False\n in \nmake_scorer\n), whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11.\n\n\n\n\n\n\nmy_module.scorer_name\n: You can also use a custom \nscore_func(y_true, y_pred)\n or \nscorer(estimator, X, y)\n function through the command line by adding the argument \n-scoring my_module.scorer\n to your command-line call. TPOT will import your module and use the custom scoring function from there. TPOT will include your current working directory when importing the module, so you can place it in the same directory where you are going to run TPOT.\nExample: \n-scoring sklearn.metrics.auc\n will use the function auc from sklearn.metrics module.\n\n\n\n\n\n\nBuilt-in TPOT configurations\n\n\nTPOT comes with a handful of default operators and parameter configurations that we believe work well for optimizing machine learning pipelines. Below is a list of the current built-in configurations that come with TPOT.\n\n\n\n\n\n\nConfiguration Name\n\n\nDescription\n\n\nOperators\n\n\n\n\n\n\n\nDefault TPOT\n\n\nTPOT will search over a broad range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Some of these operators are complex and may take a long time to run, especially on larger datasets.\n\n\n\nNote: This is the default configuration for TPOT.\n To use this configuration, use the default value (None) for the config_dict parameter.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\nTPOT light\n\n\nTPOT will search over a restricted range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Only simpler and fast-running operators will be used in these pipelines, so TPOT light is useful for finding quick and simple pipelines for a classification or regression problem.\n\n\nThis configuration works for both the TPOTClassifier and TPOTRegressor.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\nTPOT MDR\n\n\nTPOT will search over a series of feature selectors and \nMultifactor Dimensionality Reduction\n models to find a series of operators that maximize prediction accuracy. The TPOT MDR configuration is specialized for \ngenome-wide association studies (GWAS)\n, and is described in detail online \nhere\n.\n\n\nNote that TPOT MDR may be slow to run because the feature selection routines are computationally expensive, especially on large datasets.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\nTPOT sparse\n\n\nTPOT uses a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices.\n\n\nThis configuration works for both the TPOTClassifier and TPOTRegressor.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\n\nTo use any of these configurations, simply pass the string name of the configuration to the \nconfig_dict\n parameter (or \n-config\n on the command line). 
For example, to use the \"TPOT light\" configuration:\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n config_dict='TPOT light')\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\n\nCustomizing TPOT's operators and parameters\n\n\nBeyond the default configurations that come with TPOT, in some cases it is useful to limit the algorithms and parameters that TPOT considers. For that reason, we allow users to provide TPOT with a custom configuration for its operators and parameters.\n\n\nThe custom TPOT configuration must be in nested dictionary format, where the first level key is the path and name of the operator (e.g., \nsklearn.naive_bayes.MultinomialNB\n) and the second level key is the corresponding parameter name for that operator (e.g., \nfit_prior\n). The second level key should point to a list of parameter values for that parameter, e.g., \n'fit_prior': [True, False]\n.\n\n\nFor a simple example, the configuration could be:\n\n\ntpot_config = {\n 'sklearn.naive_bayes.GaussianNB': {\n },\n\n 'sklearn.naive_bayes.BernoulliNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n },\n\n 'sklearn.naive_bayes.MultinomialNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n }\n}\n\n\n\n\nin which case TPOT would only consider pipelines containing \nGaussianNB\n, \nBernoulliNB\n, \nMultinomialNB\n, and tune those algorithm's parameters in the ranges provided. This dictionary can be passed directly within the code to the \nTPOTClassifier\n/\nTPOTRegressor\n \nconfig_dict\n parameter, described above. For example:\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot_config = {\n 'sklearn.naive_bayes.GaussianNB': {\n },\n\n 'sklearn.naive_bayes.BernoulliNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n },\n\n 'sklearn.naive_bayes.MultinomialNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n }\n}\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n config_dict=tpot_config)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\nCommand-line users must create a separate \n.py\n file with the custom configuration and provide the path to the file to the \ntpot\n call. For example, if the simple example configuration above is saved in \ntpot_classifier_config.py\n, that configuration could be used on the command line with the command:\n\n\ntpot data/mnist.csv -is , -target class -config tpot_classifier_config.py -g 5 -p 20 -v 2 -o tpot_exported_pipeline.py\n\n\n\n\nWhen using the command-line interface, the configuration file specified in the \n-config\n parameter \nmust\n name its custom TPOT configuration \ntpot_config\n. 
Otherwise, TPOT will not be able to locate the configuration dictionary.\n\n\nFor more detailed examples of how to customize TPOT's operator configuration, see the default configurations for \nclassification\n and \nregression\n in TPOT's source code.\n\n\nNote that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers.\n\n\nTemplate option in TPOT\n\n\nTemplate option provides a way to specify a desired structure for machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. Current implementation only supports linear pipelines.\n\n\nBelow is a simple example to use \ntemplate\n option. The pipelines generated/evaluated in TPOT will follow this structure: 1st step is a feature selector (a subclass of \nSelectorMixin\n), 2nd step is a feature transformer (a subclass of \nTransformerMixin\n) and 3rd step is a classifier for classification (a subclass of \nClassifierMixin\n). The last step must be \nClassifier\n for \nTPOTClassifier\n's template but \nRegressor\n for \nTPOTRegressor\n. \nNote: although \nSelectorMixin\n is subclass of \nTransformerMixin\n in scikit-leawrn, but \nTransformer\n in this option excludes those subclasses of \nSelectorMixin\n.\n\n\ntpot_obj = TPOTClassifier(\n template='Selector-Transformer-Classifier'\n )\n\n\n\n\nIf a specific operator, e.g. \nSelectPercentile\n, is prefered to used in the 1st step of pipeline, the template can be defined like 'SelectPercentile-Transformer-Classifier'.\n\n\nFeatureSetSelector in TPOT\n\n\nFeatureSetSelector\n is a special new operator in TPOT. This operator enables feature selection based on \npriori\n export knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets Molecular Signatures Database (\nMSigDB\n) in the 1st step of pipeline via \ntemplate\n option above, in order to reduce dimensions and TPOT computation time. Below is a example how to use this operator in TPOT.\n\n\nPlease check our \npreprint paper\n for more details.\n\n\nfrom tpot import TPOTClassifier\nimport numpy as np\nimport pandas as pd\nfrom tpot.config import classifier_config_dict\ntest_data = pd.read_csv(\"https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/tests.csv\")\ntest_X = test_data.drop(\"class\", axis=1)\ntest_y = test_data['class']\n\n# add FeatureSetSelector into tpot configuration\nclassifier_config_dict['tpot.builtins.FeatureSetSelector'] = {\n 'subset_list': ['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'],\n 'sel_subset': [0,1] # select only one feature set, a list of index of subset in the list above\n #'sel_subset': list(combinations(range(3), 2)) # select two feature sets\n}\n\n\ntpot = TPOTClassifier(generations=5,\n population_size=50, verbosity=2,\n template='FeatureSetSelector-Transformer-Classifier',\n config_dict=classifier_config_dict)\ntpot.fit(test_X, test_y)\n\n\n\n\nPipeline caching in TPOT\n\n\nWith the \nmemory\n parameter, pipelines can cache the results of each transformer after fitting them. 
This feature is used to avoid repeated computation by transformers within a pipeline if the parameters and input data are identical to another fitted pipeline during optimization process. TPOT allows users to specify a custom directory path or \nsklearn.external.joblib.Memory\n in case they want to re-use the memory cache in future TPOT runs (or a \nwarm_start\n run).\n\n\nThere are three methods for enabling memory caching in TPOT:\n\n\nfrom tpot import TPOTClassifier\nfrom tempfile import mkdtemp\nfrom sklearn.externals.joblib import Memory\nfrom shutil import rmtree\n\n# Method 1, auto mode: TPOT uses memory caching with a temporary directory and cleans it up upon shutdown\ntpot = TPOTClassifier(memory='auto')\n\n# Method 2, with a custom directory for memory caching\ntpot = TPOTClassifier(memory='/to/your/path')\n\n# Method 3, with a Memory object\ncachedir = mkdtemp() # Create a temporary folder\nmemory = Memory(cachedir=cachedir, verbose=0)\ntpot = TPOTClassifier(memory=memory)\n\n# Clear the cache directory when you don't need it anymore\nrmtree(cachedir)\n\n\n\n\nNote: TPOT does NOT clean up memory caches if users set a custom directory path or Memory object. We recommend that you clean up the memory caches when you don't need it anymore.\n\n\nCrash/freeze issue with n_jobs > 1 under OSX or Linux\n\n\nInternally, TPOT uses \njoblib\n to fit estimators in parallel.\nThis is the same parallelization framework used by scikit-learn. But it may crash/freeze with n_jobs > 1 under OSX or Linux \nas scikit-learn does\n, especially with large datasets.\n\n\nOne solution is to configure Python's \nmultiprocessing\n module to use the \nforkserver\n start method (instead of the default \nfork\n) to manage the process pools. You can enable the \nforkserver\n mode globally for your program by putting the following codes into your main script:\n\n\nimport multiprocessing\n\n# other imports, custom code, load data, define model...\n\nif __name__ == '__main__':\n multiprocessing.set_start_method('forkserver')\n\n # call scikit-learn utils or tpot utils with n_jobs > 1 here\n\n\n\n\nMore information about these start methods can be found in the \nmultiprocessing documentation\n.\n\n\nParallel Training with Dask\n\n\nFor large problems or working on Jupyter notebook, we highly recommend that you can distribute the work on a \nDask\n cluster.\nThe \ndask-examples binder\n has a runnable example\nwith a small dask cluster.\n\n\nTo use your Dask cluster to fit a TPOT model, specify the \nuse_dask\n keyword when you create the TPOT estimator. \nNote: if \nuse_dask=True\n, TPOT will use as many cores as available on the your Dask cluster. If \nn_jobs\n is specified, then it will control the chunk size (10*\nn_jobs\n if it is less then offspring size) of parallel training. 
\n\n\nestimator = TPOTEstimator(use_dask=True, n_jobs=-1)\n\n\n\n\nThis will use use all the workers on your cluster to do the training, and use \nDask-ML's pipeline rewriting\n to avoid re-fitting estimators multiple times on the same set of data.\nIt will also provide fine-grained diagnostics in the \ndistributed scheduler UI\n.\n\n\nAlternatively, Dask implements a joblib backend.\nYou can instruct TPOT to use the distribued backend during training by specifying a \njoblib.parallel_backend\n:\n\n\nfrom sklearn.externals import joblib\nimport distributed.joblib\nfrom dask.distributed import Client\n\n# connect to the cluster\nclient = Client('schedueler-address')\n\n# create the estimator normally\nestimator = TPOTClassifier(n_jobs=-1)\n\n# perform the fit in this context manager\nwith joblib.parallel_backend(\"dask\"):\n estimator.fit(X, y)\n\n\n\n\nSee \ndask's distributed joblib integration\n for more.", "title": "Using TPOT" }, { @@ -52,7 +52,7 @@ }, { "location": "/using/#featuresetselector-in-tpot", - "text": "FeatureSetSelector is a special new operator in TPOT. This operator enables feature selection based on priori export knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets Molecular Signatures Database ( MSigDB ) in the 1st step of pipeline via template option above, in order to reduce dimensions and TPOT computation time. Below is a example how to use this operator in TPOT. from tpot import TPOTClassifier\nimport numpy as np\nimport pandas as pd\nfrom tpot.config import classifier_config_dict\ntest_data = pd.read_csv(\"https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/tests.csv\")\ntest_X = test_data.drop(\"class\", axis=1)\ntest_y = test_data['class']\n\n# add FeatureSetSelector into tpot configuration\nclassifier_config_dict['tpot.builtins.FeatureSetSelector'] = {\n 'subset_list': ['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'],\n 'sel_subset': [0,1] # select only one feature set, a list of index of subset in the list above\n #'sel_subset': list(combinations(range(3), 2)) # select two feature sets\n}\n\n\ntpot = TPOTClassifier(generations=5,\n population_size=50, verbosity=2,\n template='FeatureSetSelector-Transformer-Classifier',\n config_dict=classifier_config_dict)\ntpot.fit(test_X, test_y)", + "text": "FeatureSetSelector is a special new operator in TPOT. This operator enables feature selection based on priori export knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets Molecular Signatures Database ( MSigDB ) in the 1st step of pipeline via template option above, in order to reduce dimensions and TPOT computation time. Below is a example how to use this operator in TPOT. Please check our preprint paper for more details. 
from tpot import TPOTClassifier\nimport numpy as np\nimport pandas as pd\nfrom tpot.config import classifier_config_dict\ntest_data = pd.read_csv(\"https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/tests.csv\")\ntest_X = test_data.drop(\"class\", axis=1)\ntest_y = test_data['class']\n\n# add FeatureSetSelector into tpot configuration\nclassifier_config_dict['tpot.builtins.FeatureSetSelector'] = {\n 'subset_list': ['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'],\n 'sel_subset': [0,1] # select only one feature set, a list of index of subset in the list above\n #'sel_subset': list(combinations(range(3), 2)) # select two feature sets\n}\n\n\ntpot = TPOTClassifier(generations=5,\n population_size=50, verbosity=2,\n template='FeatureSetSelector-Transformer-Classifier',\n config_dict=classifier_config_dict)\ntpot.fit(test_X, test_y)", "title": "FeatureSetSelector in TPOT" }, { diff --git a/docs/using/index.html b/docs/using/index.html index 3e30f6e2..7ce6e903 100644 --- a/docs/using/index.html +++ b/docs/using/index.html @@ -667,6 +667,7 @@

    Template option in TPOT

    If a specific operator, e.g. SelectPercentile, is preferred to be used in the 1st step of the pipeline, the template can be defined as 'SelectPercentile-Transformer-Classifier' (a minimal sketch follows this hunk).

    FeatureSetSelector in TPOT

    FeatureSetSelector is a special new operator in TPOT. This operator enables feature selection based on a priori expert knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets from the Molecular Signatures Database (MSigDB) in the 1st step of the pipeline via the template option above, in order to reduce dimensionality and TPOT computation time. Below is an example of how to use this operator in TPOT.

    +Please check our preprint paper for more details.

    from tpot import TPOTClassifier
     import numpy as np
     import pandas as pd
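
The rendered-HTML context above describes the template option only in prose. As a quick reference, here is a minimal sketch of what that sentence describes, using the `template` keyword of `TPOTClassifier` introduced earlier in this patch series (the sketch is illustrative, not part of the diff itself):

```Python
from tpot import TPOTClassifier

# Pin SelectPercentile as the 1st pipeline step, then let TPOT choose any
# Transformer for the 2nd step and any Classifier for the 3rd step.
tpot = TPOTClassifier(template='SelectPercentile-Transformer-Classifier')
```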
    diff --git a/docs_sources/using.md b/docs_sources/using.md
    index 24d01679..fcd465d4 100644
    --- a/docs_sources/using.md
    +++ b/docs_sources/using.md
    @@ -555,6 +555,8 @@ If a specific operator, e.g. `SelectPercentile`, is preferred to be used in the 1st
     
     `FeatureSetSelector` is a special new operator in TPOT. This operator enables feature selection based on *a priori* expert knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets from the Molecular Signatures Database ([MSigDB](http://software.broadinstitute.org/gsea/msigdb/index.jsp)) in the 1st step of the pipeline via the `template` option above, in order to reduce dimensionality and TPOT computation time. Below is an example of how to use this operator in TPOT.
     
    +Please check our [preprint paper](https://www.biorxiv.org/content/10.1101/502484v1.article-info) for more details.
    +
     ```Python
     from tpot import TPOTClassifier
     import numpy as np
    
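
The hunk above is truncated inside the example it introduces. For readability, here is the complete FeatureSetSelector example as it appears elsewhere in this patch (in the regenerated search index); the subset_list URL and sel_subset values are the documentation's own, not new suggestions:

```Python
from tpot import TPOTClassifier
import numpy as np
import pandas as pd
from tpot.config import classifier_config_dict

test_data = pd.read_csv("https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/tests.csv")
test_X = test_data.drop("class", axis=1)
test_y = test_data['class']

# Add FeatureSetSelector into the TPOT configuration
classifier_config_dict['tpot.builtins.FeatureSetSelector'] = {
    'subset_list': ['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'],
    'sel_subset': [0, 1]  # indexes (into subset_list) of the feature sets TPOT may choose from
    # 'sel_subset': list(combinations(range(3), 2))  # pairs of feature sets; needs `from itertools import combinations`
}

tpot = TPOTClassifier(generations=5,
                      population_size=50, verbosity=2,
                      template='FeatureSetSelector-Transformer-Classifier',
                      config_dict=classifier_config_dict)
tpot.fit(test_X, test_y)
```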
    From be0783da833efe11274aeb05d0c6d67b678d3620 Mon Sep 17 00:00:00 2001
    From: weixuanfu 
    Date: Thu, 11 Apr 2019 14:43:52 -0400
    Subject: [PATCH 60/60] remove tpot nn config
    
    ---
     tpot/config/classifier_nn.py | 220 -----------------------------------
     1 file changed, 220 deletions(-)
     delete mode 100644 tpot/config/classifier_nn.py
    
    diff --git a/tpot/config/classifier_nn.py b/tpot/config/classifier_nn.py
    deleted file mode 100644
    index 6d3e5944..00000000
    --- a/tpot/config/classifier_nn.py
    +++ /dev/null
    @@ -1,220 +0,0 @@
    -# -*- coding: utf-8 -*-
    -
    -"""This file is part of the TPOT library.
    -
    -TPOT was primarily developed at the University of Pennsylvania by:
    -    - Randal S. Olson (rso@randalolson.com)
    -    - Weixuan Fu (weixuanf@upenn.edu)
    -    - Daniel Angell (dpa34@drexel.edu)
    -    - and many more generous open source contributors
    -
    -TPOT is free software: you can redistribute it and/or modify
    -it under the terms of the GNU Lesser General Public License as
    -published by the Free Software Foundation, either version 3 of
    -the License, or (at your option) any later version.
    -
    -TPOT is distributed in the hope that it will be useful,
    -but WITHOUT ANY WARRANTY; without even the implied warranty of
    -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    -GNU Lesser General Public License for more details.
    -
    -You should have received a copy of the GNU Lesser General Public
    -License along with TPOT. If not, see <http://www.gnu.org/licenses/>.
    -
    -"""
    -
    -import numpy as np
    -
    -# Check the TPOT documentation for information on the structure of config dicts
    -
    -classifier_config_nn = {
    -    # MLPClassifier for neural networks
    -    # TODO: revisit/tweak: alpha, momentum, learning_rate_init
    -    'sklearn.neural_network.MLPClassifier': {
    -        'activation': ['identity', 'logistic', 'tanh', 'relu'],
    -        'solver': ['lbfgs', 'sgd', 'adam'],
    -        'learning_rate': ['constant', 'invscaling', 'adaptive'],
    -        'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],
    -        'learning_rate_init': [1e-3, 1e-2, 1e-1, 0.5, 0.75, 0.9],
    -        'momentum': [0.1, 0.5, 0.75, 0.9]
    -    },
    -
    -    # Classifiers
    -    'sklearn.naive_bayes.GaussianNB': {
    -    },
    -
    -    'sklearn.naive_bayes.BernoulliNB': {
    -        'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],
    -        'fit_prior': [True, False]
    -    },
    -
    -    'sklearn.naive_bayes.MultinomialNB': {
    -        'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],
    -        'fit_prior': [True, False]
    -    },
    -
    -    'sklearn.tree.DecisionTreeClassifier': {
    -        'criterion': ["gini", "entropy"],
    -        'max_depth': range(1, 11),
    -        'min_samples_split': range(2, 21),
    -        'min_samples_leaf': range(1, 21)
    -    },
    -
    -    'sklearn.ensemble.ExtraTreesClassifier': {
    -        'n_estimators': [100],
    -        'criterion': ["gini", "entropy"],
    -        'max_features': np.arange(0.05, 1.01, 0.05),
    -        'min_samples_split': range(2, 21),
    -        'min_samples_leaf': range(1, 21),
    -        'bootstrap': [True, False]
    -    },
    -
    -    'sklearn.ensemble.RandomForestClassifier': {
    -        'n_estimators': [100],
    -        'criterion': ["gini", "entropy"],
    -        'max_features': np.arange(0.05, 1.01, 0.05),
    -        'min_samples_split': range(2, 21),
    -        'min_samples_leaf':  range(1, 21),
    -        'bootstrap': [True, False]
    -    },
    -
    -    'sklearn.ensemble.GradientBoostingClassifier': {
    -        'n_estimators': [100],
    -        'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
    -        'max_depth': range(1, 11),
    -        'min_samples_split': range(2, 21),
    -        'min_samples_leaf': range(1, 21),
    -        'subsample': np.arange(0.05, 1.01, 0.05),
    -        'max_features': np.arange(0.05, 1.01, 0.05)
    -    },
    -
    -    'sklearn.neighbors.KNeighborsClassifier': {
    -        'n_neighbors': range(1, 101),
    -        'weights': ["uniform", "distance"],
    -        'p': [1, 2]
    -    },
    -
    -    'sklearn.svm.LinearSVC': {
    -        'penalty': ["l1", "l2"],
    -        'loss': ["hinge", "squared_hinge"],
    -        'dual': [True, False],
    -        'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    -        'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.]
    -    },
    -
    -    'sklearn.linear_model.LogisticRegression': {
    -        'penalty': ["l1", "l2"],
    -        'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.],
    -        'dual': [True, False]
    -    },
    -
    -    'xgboost.XGBClassifier': {
    -        'n_estimators': [100],
    -        'max_depth': range(1, 11),
    -        'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
    -        'subsample': np.arange(0.05, 1.01, 0.05),
    -        'min_child_weight': range(1, 21),
    -        'nthread': [1]
    -    },
    -
    -    # Preprocessors
    -    'sklearn.preprocessing.Binarizer': {
    -        'threshold': np.arange(0.0, 1.01, 0.05)
    -    },
    -
    -    'sklearn.decomposition.FastICA': {
    -        'tol': np.arange(0.0, 1.01, 0.05)
    -    },
    -
    -    'sklearn.cluster.FeatureAgglomeration': {
    -        'linkage': ['ward', 'complete', 'average'],
    -        'affinity': ['euclidean', 'l1', 'l2', 'manhattan', 'cosine']
    -    },
    -
    -    'sklearn.preprocessing.MaxAbsScaler': {
    -    },
    -
    -    'sklearn.preprocessing.MinMaxScaler': {
    -    },
    -
    -    'sklearn.preprocessing.Normalizer': {
    -        'norm': ['l1', 'l2', 'max']
    -    },
    -
    -    'sklearn.kernel_approximation.Nystroem': {
    -        'kernel': ['rbf', 'cosine', 'chi2', 'laplacian', 'polynomial', 'poly', 'linear', 'additive_chi2', 'sigmoid'],
    -        'gamma': np.arange(0.0, 1.01, 0.05),
    -        'n_components': range(1, 11)
    -    },
    -
    -    'sklearn.decomposition.PCA': {
    -        'svd_solver': ['randomized'],
    -        'iterated_power': range(1, 11)
    -    },
    -
    -    'sklearn.preprocessing.PolynomialFeatures': {
    -        'degree': [2],
    -        'include_bias': [False],
    -        'interaction_only': [False]
    -    },
    -
    -    'sklearn.kernel_approximation.RBFSampler': {
    -        'gamma': np.arange(0.0, 1.01, 0.05)
    -    },
    -
    -    'sklearn.preprocessing.RobustScaler': {
    -    },
    -
    -    'sklearn.preprocessing.StandardScaler': {
    -    },
    -
    -    'tpot.builtins.ZeroCount': {
    -    },
    -
    -    'tpot.builtins.OneHotEncoder': {
    -        'minimum_fraction': [0.05, 0.1, 0.15, 0.2, 0.25],
    -        'sparse': [False]
    -    },
    -
    -    # Selectors
    -    'sklearn.feature_selection.SelectFwe': {
    -        'alpha': np.arange(0, 0.05, 0.001),
    -        'score_func': {
    -            'sklearn.feature_selection.f_classif': None
    -        }
    -    },
    -
    -    'sklearn.feature_selection.SelectPercentile': {
    -        'percentile': range(1, 100),
    -        'score_func': {
    -            'sklearn.feature_selection.f_classif': None
    -        }
    -    },
    -
    -    'sklearn.feature_selection.VarianceThreshold': {
    -        'threshold': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2]
    -    },
    -
    -    'sklearn.feature_selection.RFE': {
    -        'step': np.arange(0.05, 1.01, 0.05),
    -        'estimator': {
    -            'sklearn.ensemble.ExtraTreesClassifier': {
    -                'n_estimators': [100],
    -                'criterion': ['gini', 'entropy'],
    -                'max_features': np.arange(0.05, 1.01, 0.05)
    -            }
    -        }
    -    },
    -
    -    'sklearn.feature_selection.SelectFromModel': {
    -        'threshold': np.arange(0, 1.01, 0.05),
    -        'estimator': {
    -            'sklearn.ensemble.ExtraTreesClassifier': {
    -                'n_estimators': [100],
    -                'criterion': ['gini', 'entropy'],
    -                'max_features': np.arange(0.05, 1.01, 0.05)
    -            }
    -        }
    -    }
    -
    -}
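
With this built-in NN configuration removed, a user who still wants MLPClassifier in the search space can pass an equivalent dictionary through the custom config_dict mechanism documented above. A minimal sketch, with the parameter ranges copied from the deleted file (this is only an illustration, not an official replacement for the removed config):

```Python
from tpot import TPOTClassifier

# User-supplied configuration reproducing the neural-network entry from the
# deleted classifier_nn.py; a real configuration would normally list more
# operators (preprocessors, selectors, other classifiers) alongside it.
custom_config = {
    'sklearn.neural_network.MLPClassifier': {
        'activation': ['identity', 'logistic', 'tanh', 'relu'],
        'solver': ['lbfgs', 'sgd', 'adam'],
        'learning_rate': ['constant', 'invscaling', 'adaptive'],
        'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],
        'learning_rate_init': [1e-3, 1e-2, 1e-1, 0.5, 0.75, 0.9],
        'momentum': [0.1, 0.5, 0.75, 0.9]
    }
}

tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,
                      config_dict=custom_config)
```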